Compare commits


15 Commits

Author SHA1 Message Date
Konstantin Knizhnik
70d1086e0f Prepare for first stage of deployment: do not bump format version and do not write data in new format, but recognize the new format 2024-03-15 10:02:51 +02:00
Konstantin Knizhnik
5a8e8baf9f Make ruff happy 2024-03-14 18:05:30 +02:00
Konstantin Knizhnik
57a4119a7b Add test for compression 2024-03-14 16:45:45 +02:00
Konstantin Knizhnik
aaef3789b0 Ignore format version when comparing summary for delta_layer 2024-03-14 14:21:35 +02:00
Konstantin Knizhnik
0b57e0b8f2 Fix image layer format version matching 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
485ecbaf8f Fix test_attach_tenant_config.py test 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
0bcbce197a Fix test_attach_tenant_config.py test 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
19d59e58d2 Use CompressionAlgorithm enum 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
ce65d13dbd Add compress_image_layer to openapi spec 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
18fefff026 Fix compressed blob writer 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
2a69861896 Fix parse_tenant_config test 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
98375b3896 Support vectored compressed blob reads 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
8c60359ae5 Enable image layer compression by default 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
8c7136b057 Add compress_image_layer property to TenantConfig 2024-03-14 08:33:37 +02:00
Konstantin Knizhnik
0df6c41eaa Compress image layer 2024-03-14 08:33:37 +02:00
343 changed files with 9317 additions and 24129 deletions
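For context on what the compression commits add: they introduce a `CompressionAlgorithm` enum and a `compress_image_layer` tenant setting, and the Cargo.toml diff further down references `lz4_flex`. Below is a rough, hypothetical sketch of how such blob-compression plumbing could look; the names and the size-prepended framing are assumptions, not the actual pageserver code.

```rust
// Hypothetical sketch, not the actual pageserver implementation.
use lz4_flex::block::{compress_prepend_size, decompress_size_prepended, DecompressError};

#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum CompressionAlgorithm {
    None,
    Lz4,
}

/// Compress an image-layer blob according to the tenant's configured algorithm.
pub fn compress_blob(algorithm: CompressionAlgorithm, raw: &[u8]) -> Vec<u8> {
    match algorithm {
        CompressionAlgorithm::None => raw.to_vec(),
        // Prepends the uncompressed size so the reader can allocate exactly.
        CompressionAlgorithm::Lz4 => compress_prepend_size(raw),
    }
}

/// Decompress a blob written by `compress_blob`.
pub fn decompress_blob(
    algorithm: CompressionAlgorithm,
    stored: &[u8],
) -> Result<Vec<u8>, DecompressError> {
    match algorithm {
        CompressionAlgorithm::None => Ok(stored.to_vec()),
        CompressionAlgorithm::Lz4 => decompress_size_prepended(stored),
    }
}
```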

View File

@@ -22,7 +22,6 @@
!s3_scrubber/
!safekeeper/
!storage_broker/
!storage_controller/
!trace/
!vendor/postgres-*/
!workspace_hack/

View File

@@ -150,7 +150,7 @@ runs:
# Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
# and to keep files on the host to upload them to the database
time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"
time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
# Generate redirect
cat <<EOF > ${WORKDIR}/index.html

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
description: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
outputs:
dsn:
description: 'Created Branch DSN (for main database)'

View File

@@ -13,7 +13,7 @@ inputs:
required: true
api_host:
description: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
runs:
using: "composite"

View File

@@ -13,7 +13,7 @@ inputs:
default: 15
api_host:
description: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
provisioner:
description: 'k8s-pod or k8s-neonvm'
default: 'k8s-pod'

View File

@@ -10,7 +10,7 @@ inputs:
required: true
api_host:
description: 'Neon API host'
default: console-stage.neon.build
default: console.stage.neon.tech
runs:
using: "composite"

View File

@@ -18,7 +18,6 @@ on:
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
cancel-in-progress: false
env:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}

View File

@@ -147,16 +147,15 @@ jobs:
"neonvm-captest-new"
],
"db_size": [ "10gb" ],
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
"include": [{ "platform": "neon-captest-freetier", "db_size": "3gb" },
{ "platform": "neon-captest-new", "db_size": "50gb" },
{ "platform": "neonvm-captest-freetier", "db_size": "3gb" },
{ "platform": "neonvm-captest-new", "db_size": "50gb" }]
}'
if [ "$(date +%A)" = "Saturday" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
{ "platform": "rds-aurora", "db_size": "50gb"}]')
{ "platform": "rds-aurora", "db_size": "50gb"}]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -172,7 +171,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
{ "platform": "rds-aurora" }]')
{ "platform": "rds-aurora" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -191,7 +190,7 @@ jobs:
if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
{ "platform": "rds-aurora", "scale": "10" }]')
{ "platform": "rds-aurora", "scale": "10" }]')
fi
echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -254,9 +253,6 @@ jobs:
neon-captest-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
;;
neonvm-captest-sharding-reuse)
CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
;;
neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
;;
@@ -274,15 +270,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: Benchmark init
uses: ./.github/actions/run-python-test-set
@@ -409,15 +401,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: ClickBench benchmark
uses: ./.github/actions/run-python-test-set
@@ -519,15 +507,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: Run TPC-H benchmark
uses: ./.github/actions/run-python-test-set
@@ -613,15 +597,11 @@ jobs:
echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
QUERIES=("SELECT version()")
QUERY="SELECT version();"
if [[ "${PLATFORM}" = "neon"* ]]; then
QUERIES+=("SHOW neon.tenant_id")
QUERIES+=("SHOW neon.timeline_id")
QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
fi
for q in "${QUERIES[@]}"; do
psql ${CONNSTR} -c "${q}"
done
psql ${CONNSTR} -c "${QUERY}"
- name: Run user examples
uses: ./.github/actions/run-python-test-set

View File

@@ -21,7 +21,6 @@ defaults:
concurrency:
group: build-build-tools-image-${{ inputs.image-tag }}
cancel-in-progress: false
# No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
permissions: {}

View File

@@ -461,7 +461,6 @@ jobs:
- name: Pytest regression tests
uses: ./.github/actions/run-python-test-set
timeout-minutes: 60
with:
build_type: ${{ matrix.build_type }}
test_selection: regress
@@ -735,7 +734,7 @@ jobs:
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/setup-buildx-action@v3
- uses: docker/login-action@v3
with:
@@ -792,7 +791,7 @@ jobs:
run: |
mkdir -p .docker-custom
echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
- uses: docker/setup-buildx-action@v2
- uses: docker/setup-buildx-action@v3
with:
# Disable parallelism for docker buildkit.
# As we already build everything with `make -j$(nproc)`, running it at an additional level of parallelism blows up the Runner.
@@ -865,7 +864,7 @@ jobs:
run:
shell: sh -eu {0}
env:
VM_BUILDER_VERSION: v0.28.1
VM_BUILDER_VERSION: v0.23.2
steps:
- name: Checkout
@@ -1121,34 +1120,18 @@ jobs:
run: |
if [[ "$GITHUB_REF_NAME" == "main" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
# TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployPgSniRouter=false \
-f deployProxy=false \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
-f deployStorage=true \
-f deployStorageBroker=true \
-f deployStorageController=true \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}}
elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \
-f deployStorage=false \
-f deployStorageBroker=false \
-f deployStorageController=false \
-f branch=main \
-f dockerTag=${{needs.tag.outputs.build-tag}} \
-f deployPreprodRegion=true
gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
-f deployPgSniRouter=true \
-f deployProxy=true \

View File

@@ -28,9 +28,7 @@ jobs:
- name: Get build-tools image tag for the current commit
id: get-build-tools-tag
env:
# Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
# we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
COMMIT_SHA: ${{ github.sha }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
LAST_BUILD_TOOLS_SHA=$(

View File

@@ -20,7 +20,6 @@ defaults:
concurrency:
group: pin-build-tools-image-${{ inputs.from-tag }}
cancel-in-progress: false
permissions: {}

View File

@@ -62,14 +62,14 @@ jobs:
trigger-e2e-tests:
needs: [ tag ]
runs-on: ubuntu-latest
runs-on: [ self-hosted, gen3, small ]
env:
TAG: ${{ needs.tag.outputs.build-tag }}
container:
image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
options: --init
steps:
- name: Check if ECR images are present
env:
AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
run: |
for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,55 +79,41 @@ jobs:
fi
done
- name: Set e2e-platforms
id: e2e-platforms
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
run: |
# Default set of platforms to run e2e tests on
platforms='["docker", "k8s"]'
# If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
# If the workflow run is not a pull request, add k8s-neonvm to the list.
if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
case "$f" in
vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
;;
*)
# no-op
;;
esac
done
else
platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
fi
echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
- name: Set PR's status to pending and request a remote CI test
env:
E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
run: |
REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
# For pull requests, GH Actions sets the "github.sha" variable to point at a fake merge commit,
# but we need to use the real sha of the latest commit in the PR's branch for the e2e job,
# to place a job run status update later.
COMMIT_SHA=${{ github.event.pull_request.head.sha }}
# For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
--method POST \
--raw-field "state=pending" \
--raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
--raw-field "context=neon-cloud-e2e"
REMOTE_REPO="${{ github.repository_owner }}/cloud"
gh workflow --repo ${REMOTE_REPO} \
run testing.yml \
--ref "main" \
--raw-field "ci_job_name=neon-cloud-e2e" \
--raw-field "commit_hash=$COMMIT_SHA" \
--raw-field "remote_repo=${GITHUB_REPOSITORY}" \
--raw-field "storage_image_tag=${TAG}" \
--raw-field "compute_image_tag=${TAG}" \
--raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
--raw-field "e2e-platforms=${E2E_PLATFORMS}"
curl -f -X POST \
https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"state\": \"pending\",
\"context\": \"neon-cloud-e2e\",
\"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
}"
curl -f -X POST \
https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
-H "Accept: application/vnd.github.v3+json" \
--user "${{ secrets.CI_ACCESS_TOKEN }}" \
--data \
"{
\"ref\": \"main\",
\"inputs\": {
\"ci_job_name\": \"neon-cloud-e2e\",
\"commit_hash\": \"$COMMIT_SHA\",
\"remote_repo\": \"${{ github.repository }}\",
\"storage_image_tag\": \"${TAG}\",
\"compute_image_tag\": \"${TAG}\",
\"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
}
}"

View File

@@ -1,13 +1,12 @@
/compute_tools/ @neondatabase/control-plane @neondatabase/compute
/storage_controller @neondatabase/storage
/control_plane/attachment_service @neondatabase/storage
/libs/pageserver_api/ @neondatabase/storage
/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
/libs/postgres_ffi/ @neondatabase/compute
/libs/remote_storage/ @neondatabase/storage
/libs/safekeeper_api/ @neondatabase/safekeepers
/libs/vm_monitor/ @neondatabase/autoscaling
/pageserver/ @neondatabase/storage
/pgxn/ @neondatabase/compute
/pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
/proxy/ @neondatabase/proxy
/safekeeper/ @neondatabase/safekeepers
/vendor/ @neondatabase/compute

Cargo.lock (generated): 741 changed lines

File diff suppressed because it is too large

View File

@@ -3,7 +3,7 @@ resolver = "2"
members = [
"compute_tools",
"control_plane",
"control_plane/storcon_cli",
"control_plane/attachment_service",
"pageserver",
"pageserver/compaction",
"pageserver/ctl",
@@ -12,7 +12,6 @@ members = [
"proxy",
"safekeeper",
"storage_broker",
"storage_controller",
"s3_scrubber",
"workspace_hack",
"trace",
@@ -44,7 +43,6 @@ license = "Apache-2.0"
anyhow = { version = "1.0", features = ["backtrace"] }
arc-swap = "1.6"
async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
atomic-take = "1.1.0"
azure_core = "0.18"
azure_identity = "0.18"
azure_storage = "0.18"
@@ -54,12 +52,10 @@ async-stream = "0.3"
async-trait = "0.1"
aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
aws-sdk-s3 = "1.14"
aws-sdk-iam = "1.15.0"
aws-sdk-secretsmanager = { version = "1.14.0" }
aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
aws-smithy-types = "1.1.4"
aws-credential-types = "1.1.4"
aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
aws-types = "1.1.7"
axum = { version = "0.6.20", features = ["ws"] }
base64 = "0.13.0"
bincode = "1.3"
@@ -80,7 +76,6 @@ either = "1.8"
enum-map = "2.4.2"
enumset = "1.0.12"
fail = "0.5.0"
fallible-iterator = "0.2"
fs2 = "0.4.3"
futures = "0.3"
futures-core = "0.3"
@@ -93,12 +88,11 @@ hex = "0.4"
hex-literal = "0.4"
hmac = "0.12.1"
hostname = "0.3.1"
http = {version = "1.1.0", features = ["std"]}
http-types = { version = "2", default-features = false }
humantime = "2.1"
humantime-serde = "1.1.1"
hyper = "0.14"
hyper-tungstenite = "0.13.0"
hyper-tungstenite = "0.11"
inotify = "0.10.2"
ipnet = "2.9.0"
itertools = "0.10"
@@ -106,9 +100,8 @@ jsonwebtoken = "9"
lasso = "0.7"
leaky-bucket = "1.0.1"
libc = "0.2"
lz4_flex = "0.11.1"
md5 = "0.7.0"
measured = { version = "0.0.21", features=["lasso"] }
measured-process = { version = "0.0.21" }
memoffset = "0.8"
native-tls = "0.2"
nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -128,7 +121,7 @@ procfs = "0.14"
prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
prost = "0.11"
rand = "0.8"
redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
regex = "1.10.2"
reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
@@ -156,12 +149,11 @@ smol_str = { version = "0.2.0", features = ["serde"] }
socket2 = "0.5"
strum = "0.24"
strum_macros = "0.24"
"subtle" = "2.5.0"
svg_fmt = "0.4.1"
sync_wrapper = "0.1.2"
tar = "0.4"
task-local-extensions = "0.1.4"
test-context = "0.3"
test-context = "0.1"
thiserror = "1.0"
tikv-jemallocator = "0.5"
tikv-jemalloc-ctl = "0.5"
@@ -252,7 +244,7 @@ debug = true
# disable debug symbols for all packages except this one to decrease binaries size
[profile.release.package."*"]
debug = true
debug = false
[profile.release-line-debug]
inherits = "release"

View File

@@ -44,7 +44,6 @@ COPY --from=pg-build /home/nonroot/pg_install/v15/include/postgresql/server pg_i
COPY --from=pg-build /home/nonroot/pg_install/v16/include/postgresql/server pg_install/v16/include/postgresql/server
COPY --chown=nonroot . .
ENV _RJEM_MALLOC_CONF="prof:true"
# Show build caching stats to check if it was used in the end.
# Has to be part of the same RUN, since the cachepot daemon is killed at the end of this RUN, losing the compilation stats.
RUN set -e \

View File

@@ -58,12 +58,6 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
&& mv protoc/include/google /usr/local/include/google \
&& rm -rf protoc.zip protoc
# s5cmd
ENV S5CMD_VERSION=2.2.2
RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
&& chmod +x s5cmd \
&& mv s5cmd /usr/local/bin/s5cmd
# LLVM
ENV LLVM_VERSION=17
RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
@@ -141,7 +135,7 @@ WORKDIR /home/nonroot
# Rust
# Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
ENV RUSTC_VERSION=1.77.0
ENV RUSTC_VERSION=1.76.0
ENV RUSTUP_HOME="/home/nonroot/.rustup"
ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -155,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
cargo install --git https://github.com/paritytech/cachepot && \
cargo install rustfilt && \
cargo install cargo-hakari && \
cargo install cargo-deny --locked && \
cargo install cargo-deny && \
cargo install cargo-hack && \
cargo install cargo-nextest && \
rm -rf /home/nonroot/.cargo/registry && \

View File

@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
# Create remote extension download directory
RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
# Install:
# libreadline8 for psql
# libicu67, locales for collations (including ICU and plpgsql_check)

View File

@@ -238,14 +238,6 @@ If you encounter errors during setting up the initial tenant, it's best to stop
## Running tests
### Rust unit tests
We are using [`cargo-nextest`](https://nexte.st/) to run the tests in GitHub Workflows.
Some crates no longer support plain `cargo test`; prefer `cargo nextest run` instead.
You can install `cargo-nextest` with `cargo install cargo-nextest`.
### Integration tests
Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
```sh

View File

@@ -2,8 +2,6 @@ disallowed-methods = [
"tokio::task::block_in_place",
# Allow this for now, to deny it later once we stop using Handle::block_on completely
# "tokio::runtime::Handle::block_on",
# use tokio_epoll_uring_ext instead
"tokio_epoll_uring::thread_local_system",
]
disallowed-macros = [

View File

@@ -32,29 +32,6 @@ compute_ctl -D /var/db/postgres/compute \
-b /usr/local/bin/postgres
```
## State Diagram
Computes can be in various states. Below is a diagram that details how a
compute moves between states.
```mermaid
%% https://mermaid.js.org/syntax/stateDiagram.html
stateDiagram-v2
[*] --> Empty : Compute spawned
Empty --> ConfigurationPending : Waiting for compute spec
ConfigurationPending --> Configuration : Received compute spec
Configuration --> Failed : Failed to configure the compute
Configuration --> Running : Compute has been configured
Empty --> Init : Compute spec is immediately available
Empty --> TerminationPending : Requested termination
Init --> Failed : Failed to start Postgres
Init --> Running : Started Postgres
Running --> TerminationPending : Requested termination
TerminationPending --> Terminated : Terminated compute
Failed --> [*] : Compute exited
Terminated --> [*] : Compute exited
```
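As a rough illustration, the states in the diagram map naturally onto an enum. A hypothetical Rust sketch follows; the real `compute_ctl` types may be named and shaped differently.

```rust
// Hypothetical sketch of the state machine in the diagram above;
// the actual compute_api types may differ.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
enum ComputeStatus {
    Empty,
    ConfigurationPending,
    Configuration,
    Init,
    Running,
    TerminationPending,
    Terminated,
    Failed,
}

impl ComputeStatus {
    /// Transitions allowed by the diagram; anything else is a bug.
    fn can_transition_to(self, next: ComputeStatus) -> bool {
        use ComputeStatus::*;
        matches!(
            (self, next),
            (Empty, ConfigurationPending)
                | (Empty, Init)
                | (Empty, TerminationPending)
                | (ConfigurationPending, Configuration)
                | (Configuration, Failed)
                | (Configuration, Running)
                | (Init, Failed)
                | (Init, Running)
                | (Running, TerminationPending)
                | (TerminationPending, Terminated)
        )
    }
}
```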
## Tests
Cargo formatter:

View File

@@ -818,15 +818,9 @@ impl ComputeNode {
Client::connect(zenith_admin_connstr.as_str(), NoTls)
.context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
// Disable forwarding so that users don't get a cloud_admin role
let mut func = || {
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
Ok::<_, anyhow::Error>(())
};
func().context("apply_config setup cloud_admin")?;
client.simple_query("SET neon.forward_ddl = false")?;
client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
client.simple_query("GRANT zenith_admin TO cloud_admin")?;
drop(client);
// reconnect with connstring with expected name
@@ -838,29 +832,24 @@ impl ComputeNode {
};
// Disable DDL forwarding because control plane already knows about these roles/databases.
client
.simple_query("SET neon.forward_ddl = false")
.context("apply_config SET neon.forward_ddl = false")?;
client.simple_query("SET neon.forward_ddl = false")?;
// Proceed with post-startup configuration. Note, that order of operations is important.
let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
handle_roles(spec, &mut client).context("apply_config handle_roles")?;
handle_databases(spec, &mut client).context("apply_config handle_databases")?;
handle_role_deletions(spec, connstr.as_str(), &mut client)
.context("apply_config handle_role_deletions")?;
create_neon_superuser(spec, &mut client)?;
cleanup_instance(&mut client)?;
handle_roles(spec, &mut client)?;
handle_databases(spec, &mut client)?;
handle_role_deletions(spec, connstr.as_str(), &mut client)?;
handle_grants(
spec,
&mut client,
connstr.as_str(),
self.has_feature(ComputeFeature::AnonExtension),
)
.context("apply_config handle_grants")?;
handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
create_availability_check_data(&mut client)
.context("apply_config create_availability_check_data")?;
)?;
handle_extensions(spec, &mut client)?;
handle_extension_neon(&mut client)?;
create_availability_check_data(&mut client)?;
// 'Close' connection
drop(client);
@@ -868,7 +857,7 @@ impl ComputeNode {
// Run migrations separately to not hold up cold starts
thread::spawn(move || {
let mut client = Client::connect(connstr.as_str(), NoTls)?;
handle_migrations(&mut client).context("apply_config handle_migrations")
handle_migrations(&mut client)
});
Ok(())
}
@@ -1273,12 +1262,10 @@ LIMIT 100",
.await
.map_err(DownloadError::Other);
if download_size.is_ok() {
self.ext_download_progress
.write()
.expect("bad lock")
.insert(ext_archive_name.to_string(), (download_start, true));
}
self.ext_download_progress
.write()
.expect("bad lock")
.insert(ext_archive_name.to_string(), (download_start, true));
download_size
}
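Throughout this file's diff, each fallible step gains an `anyhow::Context` label (e.g. `apply_config handle_roles`) so a failure names the exact step. A minimal sketch of that pattern, with hypothetical step functions standing in for the real queries:

```rust
// Minimal sketch of the error-labelling pattern above, using
// hypothetical step functions; anyhow::Context attaches a stable
// label to whichever step fails.
use anyhow::{Context, Result};

fn step_one() -> Result<()> { Ok(()) }
fn step_two() -> Result<()> { anyhow::bail!("boom") }

fn apply_config_like() -> Result<()> {
    step_one().context("apply_config step_one")?;
    // On failure the error reads "apply_config step_two: boom",
    // so logs identify the failing step without a backtrace.
    step_two().context("apply_config step_two")?;
    Ok(())
}
```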

View File

@@ -6,8 +6,8 @@ use std::path::Path;
use anyhow::Result;
use crate::pg_helpers::escape_conf_value;
use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
use crate::pg_helpers::PgOptionsSerialize;
use compute_api::spec::{ComputeMode, ComputeSpec};
/// Check that `line` is inside a text file and put it there if it is not.
/// Create file if it doesn't exist.
@@ -17,7 +17,6 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
.write(true)
.create(true)
.append(false)
.truncate(false)
.open(path)?;
let buf = io::BufReader::new(&file);
let mut count: usize = 0;
@@ -92,27 +91,6 @@ pub fn write_postgres_conf(
}
}
if cfg!(target_os = "linux") {
// Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
// disabled), then the control plane has enabled swap and we should set
// dynamic_shared_memory_type = 'mmap'.
//
// This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
// ignore any errors - they may be expected to occur under certain situations (e.g. when
// not running in Linux).
.unwrap_or_else(|_| String::new());
if overcommit_memory_contents.trim() == "2" {
let opt = GenericOption {
name: "dynamic_shared_memory_type".to_owned(),
value: Some("mmap".to_owned()),
vartype: "enum".to_owned(),
};
write!(file, "{}", opt.to_pg_setting())?;
}
}
// If there are any extra options in the 'settings' field, append those
if spec.cluster.settings.is_some() {
writeln!(file, "# Managed by compute_ctl: begin")?;

View File

@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
format!("'{}'", res)
}
pub trait GenericOptionExt {
trait GenericOptionExt {
fn to_pg_option(&self) -> String;
fn to_pg_setting(&self) -> String;
}

View File

@@ -2,7 +2,7 @@ use std::fs::File;
use std::path::Path;
use std::str::FromStr;
use anyhow::{anyhow, bail, Context, Result};
use anyhow::{anyhow, bail, Result};
use postgres::config::Config;
use postgres::{Client, NoTls};
use reqwest::StatusCode;
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
RoleAction::Create => {
// This branch only runs when roles are created through the console, so it is
// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
// from neon_superuser.
// from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
let mut query: String = format!(
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
"CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
name.pg_quote()
);
info!("running role create query: '{}'", &query);
@@ -698,8 +698,7 @@ pub fn handle_grants(
// it is important to run this after all grants
if enable_anon_extension {
handle_extension_anon(spec, &db.owner, &mut db_client, false)
.context("handle_grants handle_extension_anon")?;
handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
}
}
@@ -746,12 +745,7 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
// - extension was already installed and is up to date
let query = "ALTER EXTENSION neon UPDATE";
info!("update neon extension version with query: {}", query);
if let Err(e) = client.simple_query(query) {
error!(
"failed to upgrade neon extension during `handle_extension_neon`: {}",
e
);
}
client.simple_query(query)?;
Ok(())
}
@@ -810,40 +804,43 @@ $$;"#,
"",
"",
"",
"",
// Add new migrations below.
r#"
DO $$
DECLARE
role_name TEXT;
BEGIN
FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
LOOP
RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
END LOOP;
END
$$;"#,
];
let mut func = || {
let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
client.simple_query(query)?;
let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
client.simple_query(query)?;
let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
client.simple_query(query)?;
let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
client.simple_query(query)?;
let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
Ok::<_, anyhow::Error>(())
};
func().context("handle_migrations prepare")?;
query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
client.simple_query(query)?;
let query = "SELECT id FROM neon_migration.migration_id";
let row = client
.query_one(query, &[])
.context("handle_migrations get migration_id")?;
query = "SELECT id FROM neon_migration.migration_id";
let row = client.query_one(query, &[])?;
let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
let starting_migration_id = current_migration;
let query = "BEGIN";
client
.simple_query(query)
.context("handle_migrations begin")?;
query = "BEGIN";
client.simple_query(query)?;
while current_migration < migrations.len() {
let migration = &migrations[current_migration];
@@ -851,9 +848,7 @@ $$;"#,
info!("Skip migration id={}", current_migration);
} else {
info!("Running migration:\n{}\n", migration);
client.simple_query(migration).with_context(|| {
format!("handle_migrations current_migration={}", current_migration)
})?;
client.simple_query(migration)?;
}
current_migration += 1;
}
@@ -861,14 +856,10 @@ $$;"#,
"UPDATE neon_migration.migration_id SET id={}",
migrations.len()
);
client
.simple_query(&setval)
.context("handle_migrations update id")?;
client.simple_query(&setval)?;
let query = "COMMIT";
client
.simple_query(query)
.context("handle_migrations commit")?;
query = "COMMIT";
client.simple_query(query)?;
info!(
"Ran {} migrations",

View File

@@ -12,7 +12,6 @@ clap.workspace = true
comfy-table.workspace = true
futures.workspace = true
git-version.workspace = true
humantime.workspace = true
nix.workspace = true
once_cell.workspace = true
postgres.workspace = true

View File

@@ -1,5 +1,5 @@
[package]
name = "storage_controller"
name = "attachment_service"
version = "0.1.0"
edition.workspace = true
license.workspace = true
@@ -16,37 +16,31 @@ testing = []
[dependencies]
anyhow.workspace = true
aws-config.workspace = true
bytes.workspace = true
aws-sdk-secretsmanager.workspace = true
camino.workspace = true
clap.workspace = true
fail.workspace = true
futures.workspace = true
git-version.workspace = true
hex.workspace = true
hyper.workspace = true
humantime.workspace = true
itertools.workspace = true
lasso.workspace = true
once_cell.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
postgres_connection.workspace = true
reqwest.workspace = true
routerify.workspace = true
serde.workspace = true
serde_json.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-util.workspace = true
tracing.workspace = true
measured.workspace = true
diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
diesel_migrations = { version = "2.1.0" }
r2d2 = { version = "0.8.10" }
utils = { path = "../libs/utils/" }
metrics = { path = "../libs/metrics/" }
control_plane = { path = "../control_plane" }
workspace_hack = { version = "0.1", path = "../workspace_hack" }
utils = { path = "../../libs/utils/" }
metrics = { path = "../../libs/metrics/" }
control_plane = { path = ".." }
workspace_hack = { version = "0.1", path = "../../workspace_hack" }

View File

@@ -1,4 +1,3 @@
use std::sync::Arc;
use std::{collections::HashMap, time::Duration};
use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -15,32 +14,19 @@ use utils::{
use crate::service::Config;
const BUSY_DELAY: Duration = Duration::from_secs(1);
const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
pub(crate) const API_CONCURRENCY: usize = 32;
struct UnshardedComputeHookTenant {
// Which node is this tenant attached to
node_id: NodeId,
// Must hold this lock to send a notification.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
struct ShardedComputeHookTenant {
stripe_size: ShardStripeSize,
shard_count: ShardCount,
shards: Vec<(ShardNumber, NodeId)>,
// Must hold this lock to send a notification. The contents represent
// the last successfully sent notification, and are used to coalesce multiple
// updates by only sending when there is a change since our last successful send.
send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
}
enum ComputeHookTenant {
Unsharded(UnshardedComputeHookTenant),
Unsharded(NodeId),
Sharded(ShardedComputeHookTenant),
}
@@ -52,20 +38,9 @@ impl ComputeHookTenant {
shards: vec![(tenant_shard_id.shard_number, node_id)],
stripe_size,
shard_count: tenant_shard_id.shard_count,
send_lock: Arc::default(),
})
} else {
Self::Unsharded(UnshardedComputeHookTenant {
node_id,
send_lock: Arc::default(),
})
}
}
fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
match self {
Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
Self::Unsharded(node_id)
}
}
@@ -78,8 +53,8 @@ impl ComputeHookTenant {
node_id: NodeId,
) {
match self {
Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
unsharded_tenant.node_id = node_id
Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
*existing_node_id = node_id
}
Self::Sharded(sharded_tenant)
if sharded_tenant.stripe_size == stripe_size
@@ -106,14 +81,14 @@ impl ComputeHookTenant {
}
}
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
/// Request body that we send to the control plane to notify it of where a tenant is attached
#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
#[derive(Serialize, Deserialize, Debug)]
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
@@ -146,44 +121,14 @@ pub(crate) enum NotifyError {
Fatal(StatusCode),
}
enum MaybeSendResult {
// Please send this request while holding the lock, and if you succeed then write
// the request into the lock.
Transmit(
(
ComputeHookNotifyRequest,
tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
),
),
// Something requires sending, but you must wait for the current sender, then call again
AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
// Nothing requires sending
Noop,
}
impl ComputeHookTenant {
fn maybe_send(
&self,
tenant_id: TenantId,
lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
) -> MaybeSendResult {
let locked = match lock {
Some(already_locked) => already_locked,
None => {
// Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
};
locked
}
};
let request = match self {
Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
match self {
Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
tenant_id,
shards: vec![ComputeHookNotifyRequestShard {
shard_number: ShardNumber(0),
node_id: unsharded_tenant.node_id,
node_id: *node_id,
}],
stripe_size: None,
}),
@@ -207,25 +152,12 @@ impl ComputeHookTenant {
// Sharded tenant doesn't yet have information for all its shards
tracing::info!(
"ComputeHookTenant::maybe_send: not enough shards ({}/{})",
"ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
sharded_tenant.shards.len(),
sharded_tenant.shard_count.count()
);
None
}
};
match request {
None => {
// Not yet ready to emit a notification
tracing::info!("Tenant isn't yet ready to emit a notification");
MaybeSendResult::Noop
}
Some(request) if Some(&request) == locked.as_ref() => {
// No change from the last value successfully sent
MaybeSendResult::Noop
}
Some(request) => MaybeSendResult::Transmit((request, locked)),
}
}
}
@@ -235,19 +167,8 @@ impl ComputeHookTenant {
/// the compute connection string.
pub(super) struct ComputeHook {
config: Config,
state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
authorization_header: Option<String>,
// Concurrency limiter, so that we do not overload the cloud control plane when updating
// large numbers of tenants (e.g. when failing over after a node failure)
api_concurrency: tokio::sync::Semaphore,
// This lock is only used in testing environments, to serialize calls into neon_local
neon_local_lock: tokio::sync::Mutex<()>,
// We share a client across all notifications to enable connection re-use etc when
// sending large numbers of notifications
client: reqwest::Client,
}
impl ComputeHook {
@@ -257,30 +178,18 @@ impl ComputeHook {
.clone()
.map(|jwt| format!("Bearer {}", jwt));
let client = reqwest::ClientBuilder::new()
.timeout(NOTIFY_REQUEST_TIMEOUT)
.build()
.expect("Failed to construct HTTP client");
Self {
state: Default::default(),
config,
authorization_header,
neon_local_lock: Default::default(),
api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
client,
}
}
/// For test environments: use neon_local's LocalEnv to update compute
async fn do_notify_local(
&self,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: ComputeHookNotifyRequest,
) -> anyhow::Result<()> {
// neon_local updates are not safe to call concurrently, use a lock to serialize
// all calls to this function
let _locked = self.neon_local_lock.lock().await;
let env = match LocalEnv::load_config() {
Ok(e) => e,
Err(e) => {
@@ -297,7 +206,7 @@ impl ComputeHook {
} = reconfigure_request;
let compute_pageservers = shards
.iter()
.into_iter()
.map(|shard| {
let ps_conf = env
.get_pageserver_conf(shard.node_id)
@@ -309,10 +218,10 @@ impl ComputeHook {
.collect::<Vec<_>>();
for (endpoint_name, endpoint) in &cplane.endpoints {
if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
endpoint
.reconfigure(compute_pageservers.clone(), *stripe_size)
.reconfigure(compute_pageservers.clone(), stripe_size)
.await?;
}
}
@@ -322,11 +231,12 @@ impl ComputeHook {
async fn do_notify_iteration(
&self,
client: &reqwest::Client,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let req = self.client.request(Method::PUT, url);
let req = client.request(Method::PUT, url);
let req = if let Some(value) = &self.authorization_header {
req.header(reqwest::header::AUTHORIZATION, value)
} else {
@@ -370,10 +280,11 @@ impl ComputeHook {
Err(NotifyError::SlowDown)
}
StatusCode::LOCKED => {
// We consider this fatal, because it's possible that the operation blocking the control plane is
// also the one that is waiting for this reconcile. We should let the reconciler calling
// this hook fail, to give the control plane a chance to un-lock.
tracing::info!("Control plane reports tenant is locked, dropping out of notify");
// Delay our retry if busy: the usual fast exponential backoff in backoff::retry
// is not appropriate
tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
.await
.ok();
Err(NotifyError::Busy)
}
StatusCode::SERVICE_UNAVAILABLE
@@ -389,27 +300,13 @@ impl ComputeHook {
async fn do_notify(
&self,
url: &String,
reconfigure_request: &ComputeHookNotifyRequest,
reconfigure_request: ComputeHookNotifyRequest,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
// We hold these semaphore units across all retries, rather than only across each
// HTTP request: this is to preserve fairness and avoid a situation where a retry might
// time out waiting for a semaphore.
let _units = self
.api_concurrency
.acquire()
.await
// Interpret closed semaphore as shutdown
.map_err(|_| NotifyError::ShuttingDown)?;
let client = reqwest::Client::new();
backoff::retry(
|| self.do_notify_iteration(url, reconfigure_request, cancel),
|e| {
matches!(
e,
NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
)
},
|| self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
|e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
3,
10,
"Send compute notification",
@@ -443,70 +340,42 @@ impl ComputeHook {
stripe_size: ShardStripeSize,
cancel: &CancellationToken,
) -> Result<(), NotifyError> {
let maybe_send_result = {
let mut state_locked = self.state.lock().unwrap();
let mut locked = self.state.lock().await;
use std::collections::hash_map::Entry;
let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
tenant.maybe_send(tenant_shard_id.tenant_id, None)
use std::collections::hash_map::Entry;
let tenant = match locked.entry(tenant_shard_id.tenant_id) {
Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
tenant_shard_id,
stripe_size,
node_id,
)),
Entry::Occupied(e) => {
let tenant = e.into_mut();
tenant.update(tenant_shard_id, stripe_size, node_id);
tenant
}
};
// Process result: we may get an update to send, or we may have to wait for a lock
// before trying again.
let (request, mut send_lock_guard) = match maybe_send_result {
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::AwaitLock(send_lock) => {
let send_locked = send_lock.lock_owned().await;
// Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
// we have acquired the send lock and take `[Self::state]` lock. This is safe because maybe_send only uses
// try_lock.
let state_locked = self.state.lock().unwrap();
let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
return Ok(());
};
match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
MaybeSendResult::AwaitLock(_) => {
unreachable!("We supplied lock guard")
}
MaybeSendResult::Noop => {
return Ok(());
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
}
}
MaybeSendResult::Transmit((request, lock)) => (request, lock),
let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
let Some(reconfigure_request) = reconfigure_request else {
// The tenant doesn't yet have pageservers for all its shards: we won't notify anything
// until it does.
tracing::info!("Tenant isn't yet ready to emit a notification");
return Ok(());
};
let result = if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, &request, cancel).await
if let Some(notify_url) = &self.config.compute_hook_url {
self.do_notify(notify_url, reconfigure_request, cancel)
.await
} else {
self.do_notify_local(&request).await.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
};
if result.is_ok() {
// Before dropping the send lock, stash the request we just sent so that
// subsequent callers can avoid redundantly re-sending the same thing.
*send_lock_guard = Some(request);
self.do_notify_local(reconfigure_request)
.await
.map_err(|e| {
// This path is for testing only, so munge the error into our prod-style error type.
tracing::error!("Local notification hook failed: {e}");
NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
})
}
result
}
}
@@ -530,22 +399,21 @@ pub(crate) mod tests {
NodeId(1),
);
// An unsharded tenant is always ready to emit a notification, but won't
// send the same one twice
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 1);
assert!(request.stripe_size.is_none());
// Simulate successful send
*guard = Some(request);
drop(guard);
// Try asking again: this should be a no-op
let send_result = tenant_state.maybe_send(tenant_id, None);
assert!(matches!(send_result, MaybeSendResult::Noop));
// An unsharded tenant is always ready to emit a notification
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
1
);
assert!(tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size
.is_none());
// Writing the first shard of a multi-sharded situation (i.e. in a split)
// resets the tenant state and puts it in a non-notifying state (need to
@@ -559,10 +427,7 @@ pub(crate) mod tests {
ShardStripeSize(32768),
NodeId(1),
);
assert!(matches!(
tenant_state.maybe_send(tenant_id, None),
MaybeSendResult::Noop
));
assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
// Writing the second shard makes it ready to notify
tenant_state.update(
@@ -575,16 +440,22 @@ pub(crate) mod tests {
NodeId(1),
);
let send_result = tenant_state.maybe_send(tenant_id, None);
let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
anyhow::bail!("Wrong send result");
};
assert_eq!(request.shards.len(), 2);
assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
// Simulate successful send
*guard = Some(request);
drop(guard);
assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.shards
.len(),
2
);
assert_eq!(
tenant_state
.maybe_reconfigure(tenant_id)
.unwrap()
.stripe_size,
Some(ShardStripeSize(32768))
);
Ok(())
}
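The `maybe_send` side of this diff coalesces duplicate notifications with a per-tenant async send lock that is only `try_lock`ed while the synchronous state mutex is held. A stripped-down, hypothetical sketch of that shape (names are illustrative, not the actual storage controller code):

```rust
// Stripped-down, hypothetical sketch of the maybe_send coalescing
// pattern: try_lock under the sync state lock, hand the lock back to
// the caller if it is busy, and skip sends whose payload matches the
// last successful one.
use std::sync::Arc;
use tokio::sync::{Mutex, OwnedMutexGuard};

type Payload = String;

enum MaybeSend {
    // Send this payload, then store it in the guard on success.
    Transmit(Payload, OwnedMutexGuard<Option<Payload>>),
    // Someone else is sending; await this lock and try again.
    AwaitLock(Arc<Mutex<Option<Payload>>>),
    // Nothing new to send.
    Noop,
}

fn maybe_send(
    send_lock: &Arc<Mutex<Option<Payload>>>,
    candidate: Payload,
    already_locked: Option<OwnedMutexGuard<Option<Payload>>>,
) -> MaybeSend {
    let guard = match already_locked {
        Some(g) => g,
        // Only try_lock here: the caller holds a sync mutex, so we
        // must not block waiting for the send lock.
        None => match send_lock.clone().try_lock_owned() {
            Ok(g) => g,
            Err(_) => return MaybeSend::AwaitLock(send_lock.clone()),
        },
    };
    if guard.as_ref() == Some(&candidate) {
        MaybeSend::Noop // identical to the last successful send
    } else {
        MaybeSend::Transmit(candidate, guard)
    }
}
```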

View File

@@ -1,14 +1,7 @@
use crate::metrics::{
HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
METRICS_REGISTRY,
};
use crate::reconciler::ReconcileError;
use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
use futures::Future;
use hyper::header::CONTENT_TYPE;
use hyper::{Body, Request, Response};
use hyper::{StatusCode, Uri};
use metrics::{BuildInfo, NeonMetrics};
use pageserver_api::models::{
TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
TenantTimeTravelRequest, TimelineCreateRequest,
@@ -17,11 +10,9 @@ use pageserver_api::shard::TenantShardId;
use pageserver_client::mgmt_api;
use std::sync::Arc;
use std::time::{Duration, Instant};
use tokio_util::sync::CancellationToken;
use utils::auth::{Scope, SwappableJwtAuth};
use utils::failpoint_support::failpoints_handler;
use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
use utils::http::request::{must_get_query_param, parse_request_param};
use utils::id::{TenantId, TimelineId};
use utils::{
@@ -35,29 +26,22 @@ use utils::{
};
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
TenantShardMigrateRequest,
NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
};
use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
use routerify::Middleware;
/// State available to HTTP request handlers
#[derive(Clone)]
pub struct HttpState {
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
neon_metrics: NeonMetrics,
allowlist_routes: Vec<Uri>,
}
impl HttpState {
pub fn new(
service: Arc<crate::service::Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> Self {
pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
let allowlist_routes = ["/status", "/ready", "/metrics"]
.iter()
.map(|v| v.parse().unwrap())
@@ -65,7 +49,6 @@ impl HttpState {
Self {
service,
auth,
neon_metrics: NeonMetrics::new(build_info),
allowlist_routes,
}
}
@@ -263,10 +246,8 @@ async fn handle_tenant_secondary_download(
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
json_response(status, progress)
service.tenant_secondary_download(tenant_id).await?;
json_response(StatusCode::OK, ())
}
async fn handle_tenant_delete(
@@ -328,7 +309,7 @@ async fn handle_tenant_timeline_passthrough(
tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
// Find the node that holds shard zero
let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
// Callers will always pass an unsharded tenant ID. Before proxying, we must
// rewrite this to a shard-aware shard zero ID.
@@ -337,39 +318,12 @@ async fn handle_tenant_timeline_passthrough(
let tenant_shard_str = format!("{}", tenant_shard_id);
let path = path.replace(&tenant_str, &tenant_shard_str);
let latency = &METRICS_REGISTRY
.metrics_group
.storage_controller_passthrough_request_latency;
// This is a bit awkward. We remove the param from the request
// and join the words by '_' to get a label for the request.
let just_path = path.replace(&tenant_shard_str, "");
let path_label = just_path
.split('/')
.filter(|token| !token.is_empty())
.collect::<Vec<_>>()
.join("_");
let labels = PageserverRequestLabelGroup {
pageserver_id: &node.get_id().to_string(),
path: &path_label,
method: crate::metrics::Method::Get,
};
let _timer = latency.start_timer(labels.clone());
let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
let resp = client.get_raw(path).await.map_err(|_e|
// FIXME: give ApiError a proper Unavailable variant. We return 503 here because
// if we can't successfully send a request to the pageserver, we aren't available.
ApiError::ShuttingDown)?;
if !resp.status().is_success() {
let error_counter = &METRICS_REGISTRY
.metrics_group
.storage_controller_passthrough_request_error;
error_counter.inc(labels);
}
// We have a reqwest::Response, would like an http::Response
let mut builder = hyper::Response::builder()
.status(resp.status())
@@ -395,25 +349,6 @@ async fn handle_tenant_locate(
json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
}
async fn handle_tenant_describe(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
}
async fn handle_tenant_list(
service: Arc<Service>,
req: Request<Body>,
) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
json_response(StatusCode::OK, service.tenant_list())
}
async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
@@ -427,10 +362,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
let nodes = state.service.node_list().await?;
let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
json_response(StatusCode::OK, api_nodes)
json_response(StatusCode::OK, state.service.node_list().await?)
}
async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -455,14 +387,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
json_response(
StatusCode::OK,
state
.service
.node_configure(
config_req.node_id,
config_req.availability.map(NodeAvailability::from),
config_req.scheduling,
)
.await?,
state.service.node_configure(config_req).await?,
)
}
@@ -497,22 +422,6 @@ async fn handle_tenant_shard_migrate(
)
}
async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
let state = get_state(&req);
json_response(
StatusCode::OK,
state
.service
.tenant_update_policy(tenant_id, update_req)
.await?,
)
}
async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
check_permissions(&req, Scope::PageServerApi)?;
@@ -544,14 +453,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
json_response(StatusCode::OK, state.service.consistency_check().await?)
}
async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
check_permissions(&req, Scope::Admin)?;
let state = get_state(&req);
json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
}
/// Status endpoint is just used for checking that our HTTP listener is up
async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
json_response(StatusCode::OK, ())
@@ -576,11 +477,7 @@ impl From<ReconcileError> for ApiError {
/// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
/// be allowed to run if Service has finished its initial reconciliation.
async fn tenant_service_handler<R, H>(
request: Request<Body>,
handler: H,
request_name: RequestName,
) -> R::Output
async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
where
R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
@@ -600,122 +497,24 @@ where
));
}
named_request_span(
request_span(
request,
|request| async move { handler(service, request).await },
request_name,
)
.await
}
/// Check if the required scope is held in the request's token; if the request has
/// a token with 'admin' scope, always permit it.
fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
check_permission_with(request, |claims| {
match crate::auth::check_permission(claims, required_scope) {
Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
Ok(()) => Ok(()),
Err(_) => Err(e),
},
Ok(()) => Ok(()),
}
crate::auth::check_permission(claims, required_scope)
})
}
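A minimal usage sketch of the fallback behaviour above (handler name and request setup are assumed): a handler that nominally requires pageserver scope still admits an admin-scoped token, because the Err branch retries the check against Scope::Admin.

// Hypothetical handler prologue: an admin token passes this check too.
async fn handle_example(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::PageServerApi)?; // admin-scoped tokens also pass
    json_response(StatusCode::OK, ())
}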
#[derive(Clone, Debug)]
struct RequestMeta {
method: hyper::http::Method,
at: Instant,
}
fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::pre(move |req| async move {
let meta = RequestMeta {
method: req.method().clone(),
at: Instant::now(),
};
req.set_context(meta);
Ok(req)
})
}
fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
) -> Middleware<B, ApiError> {
Middleware::post_with_info(move |resp, req_info| async move {
let request_name = match req_info.context::<RequestName>() {
Some(name) => name,
None => {
return Ok(resp);
}
};
if let Some(meta) = req_info.context::<RequestMeta>() {
let status = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_http_request_status;
let latency = &crate::metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_http_request_latency;
status.inc(HttpRequestStatusLabelGroup {
path: request_name.0,
method: meta.method.clone().into(),
status: crate::metrics::StatusCode(resp.status()),
});
latency.observe(
HttpRequestLatencyLabelGroup {
path: request_name.0,
method: meta.method.into(),
},
meta.at.elapsed().as_secs_f64(),
);
}
Ok(resp)
})
}
pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
let state = get_state(&req);
let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
let response = Response::builder()
.status(200)
.header(CONTENT_TYPE, TEXT_FORMAT)
.body(payload.into())
.unwrap();
Ok(response)
}
#[derive(Clone)]
struct RequestName(&'static str);
async fn named_request_span<R, H>(
request: Request<Body>,
handler: H,
name: RequestName,
) -> R::Output
where
R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
{
request.set_context(name);
request_span(request, handler).await
}
pub fn make_router(
service: Arc<Service>,
auth: Option<Arc<SwappableJwtAuth>>,
build_info: BuildInfo,
) -> RouterBuilder<hyper::Body, ApiError> {
let mut router = endpoint::make_router()
.middleware(prologue_metrics_middleware())
.middleware(epilogue_metrics_middleware());
let mut router = endpoint::make_router();
if auth.is_some() {
router = router.middleware(auth_middleware(|request| {
let state = get_state(request);
@@ -724,179 +523,93 @@ pub fn make_router(
} else {
state.auth.as_deref()
}
}));
}))
}
router
.data(Arc::new(HttpState::new(service, auth, build_info)))
.get("/metrics", |r| {
named_request_span(r, measured_metrics_handler, RequestName("metrics"))
})
.data(Arc::new(HttpState::new(service, auth)))
// Non-prefixed generic endpoints (status, metrics)
.get("/status", |r| {
named_request_span(r, handle_status, RequestName("status"))
})
.get("/ready", |r| {
named_request_span(r, handle_ready, RequestName("ready"))
})
.get("/status", |r| request_span(r, handle_status))
.get("/ready", |r| request_span(r, handle_ready))
// Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
.post("/upcall/v1/re-attach", |r| {
named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
})
.post("/upcall/v1/validate", |r| {
named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
request_span(r, handle_re_attach)
})
.post("/upcall/v1/validate", |r| request_span(r, handle_validate))
// Test/dev/debug endpoints
.post("/debug/v1/attach-hook", |r| {
named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
})
.post("/debug/v1/inspect", |r| {
named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
request_span(r, handle_attach_hook)
})
.post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
.post("/debug/v1/tenant/:tenant_id/drop", |r| {
named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
request_span(r, handle_tenant_drop)
})
.post("/debug/v1/node/:node_id/drop", |r| {
named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
})
.get("/debug/v1/tenant", |r| {
named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
})
.get("/debug/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(
r,
handle_tenant_locate,
RequestName("debug_v1_tenant_locate"),
)
request_span(r, handle_node_drop)
})
.get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
.get("/debug/v1/scheduler", |r| {
named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
request_span(r, handle_scheduler_dump)
})
.post("/debug/v1/consistency_check", |r| {
named_request_span(
r,
handle_consistency_check,
RequestName("debug_v1_consistency_check"),
)
request_span(r, handle_consistency_check)
})
.post("/debug/v1/reconcile_all", |r| {
request_span(r, handle_reconcile_all)
})
.put("/debug/v1/failpoints", |r| {
request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
.get("/control/v1/tenant/:tenant_id/locate", |r| {
tenant_service_handler(r, handle_tenant_locate)
})
// Node operations
.post("/control/v1/node", |r| {
named_request_span(r, handle_node_register, RequestName("control_v1_node"))
})
.get("/control/v1/node", |r| {
named_request_span(r, handle_node_list, RequestName("control_v1_node"))
request_span(r, handle_node_register)
})
.get("/control/v1/node", |r| request_span(r, handle_node_list))
.put("/control/v1/node/:node_id/config", |r| {
named_request_span(
r,
handle_node_configure,
RequestName("control_v1_node_config"),
)
request_span(r, handle_node_configure)
})
// Tenant Shard operations
.put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
tenant_service_handler(
r,
handle_tenant_shard_migrate,
RequestName("control_v1_tenant_migrate"),
)
tenant_service_handler(r, handle_tenant_shard_migrate)
})
.put("/control/v1/tenant/:tenant_id/shard_split", |r| {
tenant_service_handler(
r,
handle_tenant_shard_split,
RequestName("control_v1_tenant_shard_split"),
)
})
.get("/control/v1/tenant/:tenant_id", |r| {
tenant_service_handler(
r,
handle_tenant_describe,
RequestName("control_v1_tenant_describe"),
)
})
.get("/control/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
})
.put("/control/v1/tenant/:tenant_id/policy", |r| {
named_request_span(
r,
handle_tenant_update_policy,
RequestName("control_v1_tenant_policy"),
)
tenant_service_handler(r, handle_tenant_shard_split)
})
// Tenant operations
// The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
// this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
.post("/v1/tenant", |r| {
tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
tenant_service_handler(r, handle_tenant_create)
})
.delete("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
tenant_service_handler(r, handle_tenant_delete)
})
.put("/v1/tenant/config", |r| {
tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
tenant_service_handler(r, handle_tenant_config_set)
})
.get("/v1/tenant/:tenant_id/config", |r| {
tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
tenant_service_handler(r, handle_tenant_config_get)
})
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
tenant_service_handler(
r,
handle_tenant_location_config,
RequestName("v1_tenant_location_config"),
)
tenant_service_handler(r, handle_tenant_location_config)
})
.put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
tenant_service_handler(
r,
handle_tenant_time_travel_remote_storage,
RequestName("v1_tenant_time_travel_remote_storage"),
)
tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
})
.post("/v1/tenant/:tenant_id/secondary/download", |r| {
tenant_service_handler(
r,
handle_tenant_secondary_download,
RequestName("v1_tenant_secondary_download"),
)
tenant_service_handler(r, handle_tenant_secondary_download)
})
// Timeline operations
.delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_delete,
RequestName("v1_tenant_timeline"),
)
tenant_service_handler(r, handle_tenant_timeline_delete)
})
.post("/v1/tenant/:tenant_id/timeline", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_create,
RequestName("v1_tenant_timeline"),
)
tenant_service_handler(r, handle_tenant_timeline_create)
})
// Tenant detail GET passthrough to shard zero
.get("/v1/tenant/:tenant_id", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_passthrough,
RequestName("v1_tenant_passthrough"),
)
tenant_service_handler(r, handle_tenant_timeline_passthrough)
})
// Timeline GET passthrough to shard zero. Note that the `*` in the URL is a wildcard: any future
// timeline GET APIs will be implicitly included.
.get("/v1/tenant/:tenant_id/timeline*", |r| {
tenant_service_handler(
r,
handle_tenant_timeline_passthrough,
RequestName("v1_tenant_timeline_passthrough"),
)
tenant_service_handler(r, handle_tenant_timeline_passthrough)
})
}

View File

@@ -3,18 +3,15 @@ use utils::seqwait::MonotonicCounter;
mod auth;
mod compute_hook;
mod heartbeater;
pub mod http;
mod id_lock_map;
pub mod metrics;
mod node;
mod pageserver_client;
pub mod persistence;
mod reconciler;
mod scheduler;
mod schema;
pub mod service;
mod tenant_shard;
mod tenant_state;
#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
struct Sequence(u64);

View File

@@ -1,20 +1,19 @@
use anyhow::{anyhow, Context};
use attachment_service::http::make_router;
use attachment_service::metrics::preinitialize_metrics;
use attachment_service::persistence::Persistence;
use attachment_service::service::{Config, Service};
use aws_config::{BehaviorVersion, Region};
use camino::Utf8PathBuf;
use clap::Parser;
use diesel::Connection;
use metrics::launch_timestamp::LaunchTimestamp;
use metrics::BuildInfo;
use std::sync::Arc;
use storage_controller::http::make_router;
use storage_controller::metrics::preinitialize_metrics;
use storage_controller::persistence::Persistence;
use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
use tokio::signal::unix::SignalKind;
use tokio_util::sync::CancellationToken;
use utils::auth::{JwtAuth, SwappableJwtAuth};
use utils::logging::{self, LogFormat};
use utils::sentry_init::init_sentry;
use utils::{project_build_tag, project_git_version, tcp_listener};
project_git_version!(GIT_VERSION);
@@ -52,33 +51,9 @@ struct Cli {
#[arg(short, long)]
path: Option<Utf8PathBuf>,
/// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
/// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
#[arg(long)]
database_url: Option<String>,
/// Flag to enable dev mode, which permits running without auth
#[arg(long, default_value = "false")]
dev: bool,
    /// Grace period before marking an unresponsive pageserver offline
#[arg(long)]
max_unavailable_interval: Option<humantime::Duration>,
}
enum StrictMode {
/// In strict mode, we will require that all secrets are loaded, i.e. security features
/// may not be implicitly turned off by omitting secrets in the environment.
Strict,
/// In dev mode, secrets are optional, and omitting a particular secret will implicitly
/// disable the auth related to it (e.g. no pageserver jwt key -> send unauthenticated
/// requests, no public key -> don't authenticate incoming requests).
Dev,
}
impl Default for StrictMode {
fn default() -> Self {
Self::Strict
}
}
/// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -91,6 +66,13 @@ struct Secrets {
}
impl Secrets {
const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
"neon-storage-controller-pageserver-jwt-token";
const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
"neon-storage-controller-control-plane-jwt-token";
const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
@@ -101,41 +83,111 @@ impl Secrets {
/// - Environment variables if DATABASE_URL is set.
/// - AWS Secrets Manager secrets
async fn load(args: &Cli) -> anyhow::Result<Self> {
let Some(database_url) =
Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
else {
anyhow::bail!(
"Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
)
};
let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
None => None,
};
let this = Self {
database_url,
public_key,
jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
control_plane_jwt_token: Self::load_secret(
&args.control_plane_jwt_token,
Self::CONTROL_PLANE_JWT_TOKEN_ENV,
)
.await,
};
Ok(this)
match &args.database_url {
Some(url) => Self::load_cli(url, args),
None => match std::env::var(Self::DATABASE_URL_ENV) {
Ok(database_url) => Self::load_env(database_url),
Err(_) => Self::load_aws_sm().await,
},
}
}
async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
if let Some(v) = cli {
Some(v.clone())
} else if let Ok(v) = std::env::var(env_name) {
Some(v)
} else {
None
fn load_env(database_url: String) -> anyhow::Result<Self> {
let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
Err(_) => None,
};
Ok(Self {
database_url,
public_key,
jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
})
}
async fn load_aws_sm() -> anyhow::Result<Self> {
let Ok(region) = std::env::var("AWS_REGION") else {
anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
};
let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
.region(Region::new(region.clone()))
.load()
.await;
let asm = aws_sdk_secretsmanager::Client::new(&config);
let Some(database_url) = asm
.get_secret_value()
.secret_id(Self::DATABASE_URL_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string)
else {
anyhow::bail!(
"Database URL secret not found at {region}/{}",
Self::DATABASE_URL_SECRET
)
};
let jwt_token = asm
.get_secret_value()
.secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string);
if jwt_token.is_none() {
tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
}
let control_plane_jwt_token = asm
.get_secret_value()
.secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string);
        if control_plane_jwt_token.is_none() {
            tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the control plane");
}
let public_key = asm
.get_secret_value()
.secret_id(Self::PUBLIC_KEY_SECRET)
.send()
.await?
.secret_string()
.map(str::to_string);
let public_key = match public_key {
Some(key) => Some(JwtAuth::from_key(key)?),
None => {
tracing::warn!(
"No public key set: inccoming HTTP requests will not be authenticated"
);
None
}
};
Ok(Self {
database_url,
public_key,
jwt_token,
control_plane_jwt_token,
})
}
fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
let public_key = match &args.public_key {
None => None,
Some(key) => Some(JwtAuth::from_key(key.clone()).context("Loading public key")?),
};
Ok(Self {
database_url: database_url.to_owned(),
public_key,
jwt_token: args.jwt_token.clone(),
control_plane_jwt_token: args.control_plane_jwt_token.clone(),
})
}
}
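A sketch of the resolution order implemented above (construction of `args` is assumed): an explicit CLI value short-circuits everything, the environment is consulted next, and AWS Secrets Manager is only reached when DATABASE_URL is absent from both.

// Illustrative only: with the env var set, load() resolves via load_env and never calls AWS SM.
std::env::set_var(Secrets::DATABASE_URL_ENV, "postgresql://localhost:1234/storage_controller");
let secrets = Secrets::load(&args).await?;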
@@ -154,14 +206,6 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
}
fn main() -> anyhow::Result<()> {
let default_panic = std::panic::take_hook();
std::panic::set_hook(Box::new(move |info| {
default_panic(info);
std::process::exit(1);
}));
let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
tokio::runtime::Builder::new_current_thread()
// We use spawn_blocking for database operations, so require approximately
// as many blocking threads as we will open database connections.
@@ -193,55 +237,12 @@ async fn async_main() -> anyhow::Result<()> {
args.listen
);
let build_info = BuildInfo {
revision: GIT_VERSION,
build_tag: BUILD_TAG,
};
let strict_mode = if args.dev {
StrictMode::Dev
} else {
StrictMode::Strict
};
let secrets = Secrets::load(&args).await?;
// Validate required secrets and arguments are provided in strict mode
match strict_mode {
StrictMode::Strict
if (secrets.public_key.is_none()
|| secrets.jwt_token.is_none()
|| secrets.control_plane_jwt_token.is_none()) =>
{
// Production systems should always have secrets configured: if public_key was not set
// then we would implicitly disable auth.
anyhow::bail!(
"Insecure config! One or more secrets is not set. This is only permitted in `--dev` mode"
);
}
StrictMode::Strict if args.compute_hook_url.is_none() => {
// Production systems should always have a compute hook set, to prevent falling
// back to trying to use neon_local.
anyhow::bail!(
"`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
);
}
StrictMode::Strict => {
tracing::info!("Starting in strict mode: configuration is OK.")
}
StrictMode::Dev => {
tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
}
}
let config = Config {
jwt_token: secrets.jwt_token,
control_plane_jwt_token: secrets.control_plane_jwt_token,
compute_hook_url: args.compute_hook_url,
max_unavailable_interval: args
.max_unavailable_interval
.map(humantime::Duration::into)
.unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
};
// After loading secrets & config, but before starting anything else, apply database migrations
@@ -259,7 +260,7 @@ async fn async_main() -> anyhow::Result<()> {
let auth = secrets
.public_key
.map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
let router = make_router(service.clone(), auth, build_info)
let router = make_router(service.clone(), auth)
.build()
.map_err(|err| anyhow!(err))?;
let router_service = utils::http::RouterService::new(router).unwrap();

View File

@@ -0,0 +1,32 @@
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;
pub(crate) struct ReconcilerMetrics {
pub(crate) spawned: IntCounter,
pub(crate) complete: IntCounterVec,
}
impl ReconcilerMetrics {
// Labels used on [`Self::complete`]
pub(crate) const SUCCESS: &'static str = "ok";
    pub(crate) const ERROR: &'static str = "error";
pub(crate) const CANCEL: &'static str = "cancel";
}
pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
spawned: register_int_counter!(
"storage_controller_reconcile_spawn",
"Count of how many times we spawn a reconcile task",
)
.expect("failed to define a metric"),
complete: register_int_counter_vec!(
"storage_controller_reconcile_complete",
"Reconciler tasks completed, broken down by success/failure/cancelled",
&["status"],
)
.expect("failed to define a metric"),
});
pub fn preinitialize_metrics() {
Lazy::force(&RECONCILER);
}
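A hypothetical recording sketch for these counters (call sites assumed): the spawn counter increments when a reconcile task starts, and the labelled counter records its outcome using one of the label constants above.

// At task spawn:
crate::metrics::RECONCILER.spawned.inc();
// At task completion, labelled by outcome:
crate::metrics::RECONCILER
    .complete
    .with_label_values(&[ReconcilerMetrics::SUCCESS])
    .inc();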

View File

@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
use hyper::StatusCode;
use pageserver_api::{
controller_api::{
NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
TenantLocateResponseShard,
NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
},
shard::TenantShardId,
};
@@ -13,9 +12,7 @@ use serde::Serialize;
use tokio_util::sync::CancellationToken;
use utils::{backoff, id::NodeId};
use crate::{
pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
};
use crate::persistence::NodePersistence;
/// Represents the in-memory description of a Node.
///
@@ -86,38 +83,29 @@ impl Node {
}
}
pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
match self.get_availability_transition(availability) {
AvailabilityTransition::ToActive => {
pub(crate) fn set_availability(
&mut self,
availability: NodeAvailability,
) -> AvailabilityTransition {
use NodeAvailability::*;
let transition = match (self.availability, availability) {
(Offline, Active) => {
// Give the node a new cancellation token, effectively resetting it to un-cancelled. Any
// users of previously-cloned copies of the node will still see the old cancellation
// state. For example, Reconcilers in flight will have to complete and be spawned
// again to realize that the node has become available.
self.cancel = CancellationToken::new();
AvailabilityTransition::ToActive
}
AvailabilityTransition::ToOffline => {
(Active, Offline) => {
// Fire the node's cancellation token to cancel any in-flight API requests to it
self.cancel.cancel();
AvailabilityTransition::ToOffline
}
AvailabilityTransition::Unchanged => {}
}
_ => AvailabilityTransition::Unchanged,
};
self.availability = availability;
}
/// Without modifying the availability of the node, convert the intended availability
/// into a description of the transition.
pub(crate) fn get_availability_transition(
&self,
availability: NodeAvailability,
) -> AvailabilityTransition {
use AvailabilityTransition::*;
use NodeAvailability::*;
match (self.availability, availability) {
(Offline, Active(_)) => ToActive,
(Active(_), Offline) => ToOffline,
_ => Unchanged,
}
transition
}
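An illustrative sketch of the cancellation-token semantics described in the comments above (field access simplified, and the Active variant's shape differs between the two sides of this diff): clones taken before a node goes offline keep the fired token, while the node itself gets a fresh one on reactivation.

let stale = node.clone();
node.set_availability(NodeAvailability::Offline); // fires the shared token
node.set_availability(NodeAvailability::Active);  // installs a fresh token
assert!(node.is_available());
assert!(!stale.is_available()); // the pre-transition clone still sees the fired token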
/// Whether we may send API requests to this node.
@@ -126,21 +114,21 @@ impl Node {
// a reference to the original Node's cancellation status. Checking both of these results
// in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
// when we cloned it, or if the original Node instance's cancellation token was fired.
matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
}
    /// Is this node eligible to have work scheduled onto it?
pub(crate) fn may_schedule(&self) -> MaySchedule {
let score = match self.availability {
NodeAvailability::Active(score) => score,
NodeAvailability::Offline => return MaySchedule::No,
};
pub(crate) fn may_schedule(&self) -> bool {
match self.availability {
NodeAvailability::Active => {}
NodeAvailability::Offline => return false,
}
match self.scheduling {
NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
NodeSchedulingPolicy::Draining => MaySchedule::No,
NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
NodeSchedulingPolicy::Pause => MaySchedule::No,
NodeSchedulingPolicy::Active => true,
NodeSchedulingPolicy::Draining => false,
NodeSchedulingPolicy::Filling => true,
NodeSchedulingPolicy::Pause => false,
}
}
@@ -158,7 +146,8 @@ impl Node {
listen_pg_addr,
listen_pg_port,
scheduling: NodeSchedulingPolicy::Filling,
availability: NodeAvailability::Offline,
// TODO: we shouldn't really call this Active until we've heartbeated it.
availability: NodeAvailability::Active,
cancel: CancellationToken::new(),
}
}
@@ -205,7 +194,7 @@ impl Node {
cancel: &CancellationToken,
) -> Option<mgmt_api::Result<T>>
where
O: FnMut(PageserverClient) -> F,
O: FnMut(mgmt_api::Client) -> F,
F: std::future::Future<Output = mgmt_api::Result<T>>,
{
fn is_fatal(e: &mgmt_api::Error) -> bool {
@@ -227,12 +216,8 @@ impl Node {
.build()
.expect("Failed to construct HTTP client");
let client = PageserverClient::from_client(
self.get_id(),
http_client,
self.base_url(),
jwt.as_deref(),
);
let client =
mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
let node_cancel_fut = self.cancel.cancelled();
@@ -257,19 +242,6 @@ impl Node {
)
.await
}
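A hypothetical call-site sketch (parameter meanings are inferred from the call sites later in this diff): issue a request through the retry wrapper, which yields None when either the node's or the caller's cancellation token fires first.

let timelines = node
    .with_client_retries(
        |client| async move { client.timeline_list(&tenant_shard_id).await },
        &jwt,                    // optional bearer token
        1,                       // assumed: warn threshold
        3,                       // assumed: retry limit
        Duration::from_secs(10), // assumed: per-request timeout
        &cancel,
    )
    .await;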
/// Generate the simplified API-friendly description of a node's state
pub(crate) fn describe(&self) -> NodeDescribeResponse {
NodeDescribeResponse {
id: self.id,
availability: self.availability.into(),
scheduling: self.scheduling,
listen_http_addr: self.listen_http_addr.clone(),
listen_http_port: self.listen_http_port,
listen_pg_addr: self.listen_pg_addr.clone(),
listen_pg_port: self.listen_pg_port,
}
}
}
impl std::fmt::Display for Node {

View File

@@ -9,20 +9,13 @@ use camino::Utf8PathBuf;
use diesel::pg::PgConnection;
use diesel::prelude::*;
use diesel::Connection;
use pageserver_api::controller_api::ShardSchedulingPolicy;
use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
use pageserver_api::models::TenantConfig;
use pageserver_api::shard::ShardConfigError;
use pageserver_api::shard::ShardIdentity;
use pageserver_api::shard::ShardStripeSize;
use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
use serde::{Deserialize, Serialize};
use utils::generation::Generation;
use utils::id::{NodeId, TenantId};
use crate::metrics::{
DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
};
use crate::node::Node;
/// ## What do we store?
@@ -79,41 +72,8 @@ pub(crate) enum DatabaseError {
Logical(String),
}
#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
pub(crate) enum DatabaseOperation {
InsertNode,
UpdateNode,
DeleteNode,
ListNodes,
BeginShardSplit,
CompleteShardSplit,
AbortShardSplit,
Detach,
ReAttach,
IncrementGeneration,
ListTenantShards,
InsertTenantShards,
UpdateTenantShard,
DeleteTenant,
UpdateTenantConfig,
}
#[must_use]
pub(crate) enum AbortShardSplitStatus {
/// We aborted the split in the database by reverting to the parent shards
Aborted,
/// The split had already been persisted.
Complete,
}
pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
/// Some methods can operate on either a whole tenant or a single shard
pub(crate) enum TenantFilter {
Tenant(TenantId),
Shard(TenantShardId),
}
impl Persistence {
// The default postgres connection limit is 100. We use up to 99, to leave one free for a human admin under
// normal circumstances. This assumes we have exclusive use of the database cluster to which we connect.
@@ -144,36 +104,10 @@ impl Persistence {
}
}
/// Wraps `with_conn` in order to collect latency and error metrics
async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let latency = &METRICS_REGISTRY
.metrics_group
.storage_controller_database_query_latency;
let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op });
let res = self.with_conn(func).await;
if let Err(err) = &res {
let error_counter = &METRICS_REGISTRY
.metrics_group
.storage_controller_database_query_error;
error_counter.inc(DatabaseQueryErrorLabelGroup {
error_type: err.error_label(),
operation: op,
})
}
res
}
/// Call the provided function in a tokio blocking thread, with a Diesel database connection.
async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
where
F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
R: Send + 'static,
{
let mut conn = self.connection_pool.get()?;
@@ -185,27 +119,21 @@ impl Persistence {
/// When a node is first registered, persist it before using it for anything
pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
let np = node.to_persistent();
self.with_measured_conn(
DatabaseOperation::InsertNode,
move |conn| -> DatabaseResult<()> {
diesel::insert_into(crate::schema::nodes::table)
.values(&np)
.execute(conn)?;
Ok(())
},
)
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::insert_into(crate::schema::nodes::table)
.values(&np)
.execute(conn)?;
Ok(())
})
.await
}
/// At startup, populate the list of nodes which our shards may be placed on
pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
let nodes: Vec<NodePersistence> = self
.with_measured_conn(
DatabaseOperation::ListNodes,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
},
)
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
})
.await?;
tracing::info!("list_nodes: loaded {} nodes", nodes.len());
@@ -220,7 +148,7 @@ impl Persistence {
) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
let updated = self
.with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
.with_conn(move |conn| {
let updated = diesel::update(nodes)
.filter(node_id.eq(input_node_id.0 as i64))
.set((scheduling_policy.eq(String::from(input_scheduling)),))
@@ -242,12 +170,9 @@ impl Persistence {
/// be enriched at runtime with state discovered on pageservers.
pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
let loaded = self
.with_measured_conn(
DatabaseOperation::ListTenantShards,
move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
},
)
.with_conn(move |conn| -> DatabaseResult<_> {
Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
})
.await?;
if loaded.is_empty() {
@@ -275,15 +200,15 @@ impl Persistence {
let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
.map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
for shard in decoded.tenants.values_mut() {
if shard.placement_policy == "\"Single\"" {
// Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
shard.placement_policy = "{\"Attached\":0}".to_string();
}
if shard.scheduling_policy.is_empty() {
shard.scheduling_policy =
serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
for (tenant_id, tenant) in &mut decoded.tenants {
            // Backward compat: for an old attachments.json from before PR #6251,
            // replace empty strings with proper defaults.
if tenant.tenant_id.is_empty() {
tenant.tenant_id = tenant_id.to_string();
tenant.config = serde_json::to_string(&TenantConfig::default())
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
.map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
}
}
@@ -329,20 +254,17 @@ impl Persistence {
shards: Vec<TenantShardPersistence>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::InsertTenantShards,
move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
})?;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
for tenant in &shards {
diesel::insert_into(tenant_shards)
.values(tenant)
.execute(conn)?;
}
Ok(())
},
)
})?;
Ok(())
})
.await
}
@@ -350,31 +272,25 @@ impl Persistence {
/// the tenant from memory on this server.
pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::DeleteTenant,
move |conn| -> DatabaseResult<()> {
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(tenant_shards)
.filter(tenant_id.eq(del_tenant_id.to_string()))
.execute(conn)?;
Ok(())
},
)
Ok(())
})
.await
}
pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
use crate::schema::nodes::dsl::*;
self.with_measured_conn(
DatabaseOperation::DeleteNode,
move |conn| -> DatabaseResult<()> {
diesel::delete(nodes)
.filter(node_id.eq(del_node_id.0 as i64))
.execute(conn)?;
self.with_conn(move |conn| -> DatabaseResult<()> {
diesel::delete(nodes)
.filter(node_id.eq(del_node_id.0 as i64))
.execute(conn)?;
Ok(())
},
)
Ok(())
})
.await
}
@@ -388,7 +304,7 @@ impl Persistence {
) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
.with_conn(move |conn| {
let rows_updated = diesel::update(tenant_shards)
.filter(generation_pageserver.eq(node_id.0 as i64))
.set(generation.eq(generation + 1))
@@ -438,7 +354,7 @@ impl Persistence {
) -> anyhow::Result<Generation> {
use crate::schema::tenant_shards::dsl::*;
let updated = self
.with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -475,45 +391,59 @@ impl Persistence {
/// that we only do the first time a tenant is set to an attached policy via /location_config.
pub(crate) async fn update_tenant_shard(
&self,
tenant: TenantFilter,
input_placement_policy: Option<PlacementPolicy>,
input_config: Option<TenantConfig>,
tenant_shard_id: TenantShardId,
input_placement_policy: PlacementPolicy,
input_config: TenantConfig,
input_generation: Option<Generation>,
input_scheduling_policy: Option<ShardSchedulingPolicy>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
let query = match tenant {
TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
.into_boxed(),
TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.into_boxed(),
};
self.with_conn(move |conn| {
let query = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
.filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
#[derive(AsChangeset)]
#[diesel(table_name = crate::schema::tenant_shards)]
struct ShardUpdate {
generation: Option<i32>,
placement_policy: Option<String>,
config: Option<String>,
scheduling_policy: Option<String>,
if let Some(input_generation) = input_generation {
// Update includes generation column
query
.set((
generation.eq(Some(input_generation.into().unwrap() as i32)),
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
} else {
// Update does not include generation column
query
.set((
placement_policy
.eq(serde_json::to_string(&input_placement_policy).unwrap()),
config.eq(serde_json::to_string(&input_config).unwrap()),
))
.execute(conn)?;
}
let update = ShardUpdate {
generation: input_generation.map(|g| g.into().unwrap() as i32),
placement_policy: input_placement_policy
.map(|p| serde_json::to_string(&p).unwrap()),
config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
scheduling_policy: input_scheduling_policy
.map(|p| serde_json::to_string(&p).unwrap()),
};
Ok(())
})
.await?;
query.set(update).execute(conn)?;
Ok(())
}
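The reason ShardUpdate's fields are all Option: with Diesel's AsChangeset derive, a field whose value is None is omitted from the generated SET clause (unless treat_none_as_null is enabled), so a single changeset struct can express any partial update. A minimal sketch, with an assumed literal policy value:

// Only placement_policy appears in the SET clause; the other columns are untouched.
let update = ShardUpdate {
    generation: None,
    placement_policy: Some("{\"Attached\":0}".to_string()),
    config: None,
    scheduling_policy: None,
};
query.set(update).execute(conn)?;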
pub(crate) async fn update_tenant_config(
&self,
input_tenant_id: TenantId,
input_config: TenantConfig,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_conn(move |conn| {
diesel::update(tenant_shards)
.filter(tenant_id.eq(input_tenant_id.to_string()))
.set((config.eq(serde_json::to_string(&input_config).unwrap()),))
.execute(conn)?;
Ok(())
})
@@ -524,7 +454,7 @@ impl Persistence {
pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
self.with_conn(move |conn| {
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
.filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -554,7 +484,7 @@ impl Persistence {
parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> DatabaseResult<()> {
// Mark parent shards as splitting
@@ -618,83 +548,31 @@ impl Persistence {
old_shard_count: ShardCount,
) -> DatabaseResult<()> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::CompleteShardSplit,
move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
self.with_conn(move |conn| -> DatabaseResult<()> {
conn.transaction(|conn| -> QueryResult<()> {
// Drop parent shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(old_shard_count.literal() as i32))
.execute(conn)?;
// Clear sharding flag
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),))
.execute(conn)?;
debug_assert!(updated > 0);
Ok(())
})?;
// Clear sharding flag
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.set((splitting.eq(0),))
.execute(conn)?;
debug_assert!(updated > 0);
Ok(())
},
)
.await
}
})?;
/// Used when the remote part of a shard split failed: we will revert the database state to have only
/// the parent shards, with SplitState::Idle.
pub(crate) async fn abort_shard_split(
&self,
split_tenant_id: TenantId,
new_shard_count: ShardCount,
) -> DatabaseResult<AbortShardSplitStatus> {
use crate::schema::tenant_shards::dsl::*;
self.with_measured_conn(
DatabaseOperation::AbortShardSplit,
move |conn| -> DatabaseResult<AbortShardSplitStatus> {
let aborted =
conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
// Clear the splitting state on parent shards
let updated = diesel::update(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.ne(new_shard_count.literal() as i32))
.set((splitting.eq(0),))
.execute(conn)?;
// Parent shards are already gone: we cannot abort.
if updated == 0 {
return Ok(AbortShardSplitStatus::Complete);
}
// Sanity check: if parent shards were present, their cardinality should
// be less than the number of child shards.
if updated >= new_shard_count.count() as usize {
return Err(DatabaseError::Logical(format!(
"Unexpected parent shard count {updated} while aborting split to \
count {new_shard_count:?} on tenant {split_tenant_id}"
)));
}
// Erase child shards
diesel::delete(tenant_shards)
.filter(tenant_id.eq(split_tenant_id.to_string()))
.filter(shard_count.eq(new_shard_count.literal() as i32))
.execute(conn)?;
Ok(AbortShardSplitStatus::Aborted)
})?;
Ok(aborted)
},
)
Ok(())
})
.await
}
}
/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
#[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
#[diesel(table_name = crate::schema::tenant_shards)]
pub(crate) struct TenantShardPersistence {
@@ -724,30 +602,6 @@ pub(crate) struct TenantShardPersistence {
pub(crate) splitting: SplitState,
#[serde(default)]
pub(crate) config: String,
#[serde(default)]
pub(crate) scheduling_policy: String,
}
impl TenantShardPersistence {
pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
if self.shard_count == 0 {
Ok(ShardIdentity::unsharded())
} else {
Ok(ShardIdentity::new(
ShardNumber(self.shard_number as u8),
ShardCount::new(self.shard_count as u8),
ShardStripeSize(self.shard_stripe_size as u32),
)?)
}
}
pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
Ok(TenantShardId {
tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
shard_number: ShardNumber(self.shard_number as u8),
shard_count: ShardCount::new(self.shard_count as u8),
})
}
}
/// Parts of [`crate::node::Node`] that are stored durably

View File

@@ -1,7 +1,5 @@
use crate::pageserver_client::PageserverClient;
use crate::persistence::Persistence;
use crate::service;
use hyper::StatusCode;
use pageserver_api::models::{
LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
};
@@ -9,7 +7,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
use pageserver_client::mgmt_api;
use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, Instant};
use std::time::Duration;
use tokio_util::sync::CancellationToken;
use utils::generation::Generation;
use utils::id::{NodeId, TimelineId};
@@ -18,14 +16,12 @@ use utils::sync::gate::GateGuard;
use crate::compute_hook::{ComputeHook, NotifyError};
use crate::node::Node;
use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
const DEFAULT_HEATMAP_PERIOD: &str = "60s";
use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
/// Object with the lifetime of the background reconcile task that is created
/// for tenants which have a difference between their intent and observed states.
pub(super) struct Reconciler {
/// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
/// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
/// of a tenant's state from when we spawned a reconcile task.
pub(super) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
@@ -48,11 +44,11 @@ pub(super) struct Reconciler {
/// To avoid stalling if the cloud control plane is unavailable, we may proceed
/// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
/// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
/// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
pub(crate) compute_notify_failure: bool,
/// A means to abort background reconciliation: it is essential to
/// call this when something changes in the original TenantShard that
/// call this when something changes in the original TenantState that
/// will make this reconciliation impossible or unnecessary, for
/// example when a pageserver node goes offline, or the PlacementPolicy for
/// the tenant is changed.
@@ -66,7 +62,7 @@ pub(super) struct Reconciler {
pub(crate) persistence: Arc<Persistence>,
}
/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
/// reference counting for Scheduler. The IntentState is what the scheduler works with,
/// and the TargetState is just the instruction for a particular Reconciler run.
#[derive(Debug)]
@@ -118,15 +114,6 @@ impl Reconciler {
flush_ms: Option<Duration>,
lazy: bool,
) -> Result<(), ReconcileError> {
if !node.is_available() && config.mode == LocationConfigMode::Detached {
            // Attempts to detach from offline nodes may be simulated without doing I/O: a node which is offline
// will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
// what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
self.observed.locations.remove(&node.get_id());
return Ok(());
}
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -159,16 +146,9 @@ impl Reconciler {
};
tracing::info!("location_config({node}) complete: {:?}", config);
match config.mode {
LocationConfigMode::Detached => {
self.observed.locations.remove(&node.get_id());
}
_ => {
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
}
}
self.observed
.locations
.insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
Ok(())
}
@@ -260,11 +240,8 @@ impl Reconciler {
tenant_shard_id: TenantShardId,
node: &Node,
) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
let client = PageserverClient::new(
node.get_id(),
node.base_url(),
self.service_config.jwt_token.as_deref(),
);
let client =
mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
let timelines = client.timeline_list(&tenant_shard_id).await?;
Ok(timelines
@@ -278,81 +255,22 @@ impl Reconciler {
tenant_shard_id: TenantShardId,
node: &Node,
) -> Result<(), ReconcileError> {
        // This is not the timeout for a request, but the total amount of time we're willing to wait
        // for a secondary location to get up to date before giving up and proceeding with the migration anyway.
const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
        // This is the long-polling interval for the secondary download requests we send to the destination pageserver
// during a migration.
const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
let started_at = Instant::now();
loop {
let (status, progress) = match node
.with_client_retries(
|client| async move {
client
.tenant_secondary_download(
tenant_shard_id,
Some(REQUEST_DOWNLOAD_TIMEOUT),
)
.await
},
&self.service_config.jwt_token,
1,
3,
REQUEST_DOWNLOAD_TIMEOUT * 2,
&self.cancel,
)
.await
{
None => Err(ReconcileError::Cancel),
Some(Ok(v)) => Ok(v),
Some(Err(e)) => {
// Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
// attaching, but we should not let an issue with a secondary location stop us proceeding
// with a live migration.
tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
return Ok(());
}
}?;
if status == StatusCode::OK {
tracing::info!(
"Downloads to {} complete: {}/{} layers, {}/{} bytes",
node,
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
return Ok(());
} else if status == StatusCode::ACCEPTED {
let total_runtime = started_at.elapsed();
if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
tracing::warn!("Timed out after {}ms downloading layers to {node}. Progress so far: {}/{} layers, {}/{} bytes",
total_runtime.as_millis(),
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
// Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
// it just makes the I/O performance for users less good.
return Ok(());
}
// Log and proceed around the loop to retry. We don't sleep between requests, because our HTTP call
// to the pageserver is a long-poll.
tracing::info!(
"Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
node,
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match node
.with_client_retries(
|client| async move { client.tenant_secondary_download(tenant_shard_id).await },
&self.service_config.jwt_token,
1,
1,
Duration::from_secs(60),
&self.cancel,
)
.await
{
None => Err(ReconcileError::Cancel),
Some(Ok(_)) => Ok(()),
Some(Err(e)) => {
tracing::info!(" (skipping destination download: {})", e);
Ok(())
}
}
}
@@ -487,7 +405,6 @@ impl Reconciler {
while let Err(e) = self.compute_notify().await {
match e {
NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
_ => {
tracing::warn!(
"Live migration blocked by compute notification error, retrying: {e}"
@@ -496,7 +413,7 @@ impl Reconciler {
}
}
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Attached(0), then
// Downgrade the origin to secondary. If the tenant's policy is PlacementPolicy::Single, then
// this location will be deleted in the general case reconciliation that runs after this.
let origin_secondary_conf = build_location_config(
&self.shard,
@@ -568,29 +485,17 @@ impl Reconciler {
)
.await
{
Some(Ok(observed)) => Some(observed),
Some(Err(mgmt_api::Error::ApiError(status, _msg)))
if status == StatusCode::NOT_FOUND =>
{
None
}
Some(Ok(observed)) => observed,
Some(Err(e)) => return Err(e.into()),
None => return Err(ReconcileError::Cancel),
};
tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
match observed_conf {
Some(conf) => {
// Pageserver returned a state: update it in observed. This may still be an indeterminate (None) state,
// if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
self.observed
.locations
.insert(attached_node.get_id(), ObservedStateLocation { conf });
}
None => {
// Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
self.observed.locations.remove(&attached_node.get_id());
}
}
self.observed.locations.insert(
attached_node.get_id(),
ObservedStateLocation {
conf: observed_conf,
},
);
}
Ok(())
@@ -620,12 +525,7 @@ impl Reconciler {
)));
};
let mut wanted_conf = attached_location_conf(
generation,
&self.shard,
&self.config,
!self.intent.secondary.is_empty(),
);
let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
match self.observed.locations.get(&node.get_id()) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
// Nothing to do
@@ -762,26 +662,10 @@ impl Reconciler {
}
}
/// We tweak the externally-set TenantConfig while configuring
/// locations, using our awareness of whether secondary locations
/// are in use to automatically enable/disable heatmap uploads.
fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
let mut config = config.clone();
if has_secondaries {
if config.heatmap_period.is_none() {
config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
}
} else {
config.heatmap_period = None;
}
config
}
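A small illustrative exercise of the tweak above (assuming TenantConfig::default() leaves heatmap_period unset):

let base = TenantConfig::default();
let with_secondary = ha_aware_config(&base, true);
assert_eq!(with_secondary.heatmap_period.as_deref(), Some("60s")); // DEFAULT_HEATMAP_PERIOD
let attached_only = ha_aware_config(&base, false);
assert!(attached_only.heatmap_period.is_none()); // uploads disabled without secondaries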
pub(crate) fn attached_location_conf(
generation: Generation,
shard: &ShardIdentity,
config: &TenantConfig,
has_secondaries: bool,
) -> LocationConfig {
LocationConfig {
mode: LocationConfigMode::AttachedSingle,
@@ -790,7 +674,7 @@ pub(crate) fn attached_location_conf(
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_stripe_size: shard.stripe_size.0,
tenant_conf: ha_aware_config(config, has_secondaries),
tenant_conf: config.clone(),
}
}
@@ -805,6 +689,6 @@ pub(crate) fn secondary_location_conf(
shard_number: shard.number.0,
shard_count: shard.count.literal(),
shard_stripe_size: shard.stripe_size.0,
tenant_conf: ha_aware_config(config, true),
tenant_conf: config.clone(),
}
}

View File

@@ -1,5 +1,4 @@
use crate::{node::Node, tenant_shard::TenantShard};
use pageserver_api::controller_api::UtilizationScore;
use crate::{node::Node, tenant_state::TenantState};
use serde::Serialize;
use std::collections::HashMap;
use utils::{http::error::ApiError, id::NodeId};
@@ -20,34 +19,15 @@ impl From<ScheduleError> for ApiError {
}
#[derive(Serialize, Eq, PartialEq)]
pub enum MaySchedule {
Yes(UtilizationScore),
No,
}
#[derive(Serialize)]
struct SchedulerNode {
/// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
/// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
shard_count: usize,
    /// Whether this node is currently eligible to have new shards scheduled (this is derived
/// from a node's availability state and scheduling policy).
may_schedule: MaySchedule,
may_schedule: bool,
}
impl PartialEq for SchedulerNode {
fn eq(&self, other: &Self) -> bool {
let may_schedule_matches = matches!(
(&self.may_schedule, &other.may_schedule),
(MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
);
may_schedule_matches && self.shard_count == other.shard_count
}
}
impl Eq for SchedulerNode {}
/// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
/// on which to run.
///
@@ -58,86 +38,6 @@ pub(crate) struct Scheduler {
nodes: HashMap<NodeId, SchedulerNode>,
}
/// Score for soft constraint scheduling: lower scores are preferred to higher scores.
///
/// For example, we may set an affinity score based on the number of shards from the same
/// tenant already on a node, to implicitly prefer to balance out shards.
#[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
pub(crate) struct AffinityScore(pub(crate) usize);
impl AffinityScore {
/// If we have no anti-affinity at all toward a node, this is its score. It means
/// the scheduler has a free choice amongst nodes with this score, and may pick a node
/// based on other information such as total utilization.
pub(crate) const FREE: Self = Self(0);
pub(crate) fn inc(&mut self) {
self.0 += 1;
}
}
impl std::ops::Add for AffinityScore {
type Output = Self;
fn add(self, rhs: Self) -> Self::Output {
Self(self.0 + rhs.0)
}
}
/// Hint for whether this is a sincere attempt to schedule, or a speculative
/// check for where we _would_ schedule (done during optimization)
#[derive(Debug)]
pub(crate) enum ScheduleMode {
Normal,
Speculative,
}
impl Default for ScheduleMode {
fn default() -> Self {
Self::Normal
}
}
// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
// it for many shards in the same tenant.
#[derive(Debug, Default)]
pub(crate) struct ScheduleContext {
/// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
pub(crate) nodes: HashMap<NodeId, AffinityScore>,
/// Specifically how many _attached_ locations are on each node
pub(crate) attached_nodes: HashMap<NodeId, usize>,
pub(crate) mode: ScheduleMode,
}
impl ScheduleContext {
/// Input is a list of nodes we would like to avoid using again within this context. The more
/// times a node is passed into this call, the less inclined we are to use it.
pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
for node_id in nodes {
let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
entry.inc()
}
}
pub(crate) fn push_attached(&mut self, node_id: NodeId) {
let entry = self.attached_nodes.entry(node_id).or_default();
*entry += 1;
}
pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
self.nodes
.get(&node_id)
.copied()
.unwrap_or(AffinityScore::FREE)
}
pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
self.attached_nodes.get(&node_id).copied().unwrap_or(0)
}
}
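A usage sketch of the anti-affinity mechanism (scheduler construction assumed): feeding the first shard's placement back into the context raises that node's affinity score, so a sibling shard prefers a different node while both remain schedulable.

let mut context = ScheduleContext::default();
let first = scheduler.schedule_shard(&[], &context)?;
context.avoid(&[first]);   // raise anti-affinity for the chosen node
let second = scheduler.schedule_shard(&[], &context)?;
assert_ne!(first, second); // holds while another node may schedule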
impl Scheduler {
pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
let mut scheduler_nodes = HashMap::new();
@@ -163,7 +63,7 @@ impl Scheduler {
pub(crate) fn consistency_check<'a>(
&self,
nodes: impl Iterator<Item = &'a Node>,
shards: impl Iterator<Item = &'a TenantShard>,
shards: impl Iterator<Item = &'a TenantState>,
) -> anyhow::Result<()> {
let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
for node in nodes {
@@ -286,15 +186,13 @@ impl Scheduler {
return None;
}
// TODO: When the utilization score returned by the pageserver becomes meaningful,
// schedule based on that instead of the shard count.
let node = nodes
.iter()
.map(|node_id| {
let may_schedule = self
.nodes
.get(node_id)
.map(|n| n.may_schedule != MaySchedule::No)
.map(|n| n.may_schedule)
.unwrap_or(false);
(*node_id, may_schedule)
})
@@ -304,94 +202,59 @@ impl Scheduler {
node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
}
    /// hard_exclude: it is forbidden to use nodes in this list, typically because they
/// are already in use by this shard -- we use this to avoid picking the same node
/// as both attached and secondary location. This is a hard constraint: if we cannot
/// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
///
/// context: we prefer to avoid using nodes identified in the context, according
    /// to their anti-affinity score. We use this to prefer to avoid placing shards in
/// the same tenant on the same node. This is a soft constraint: the context will never
/// cause us to fail to schedule a shard.
pub(crate) fn schedule_shard(
&self,
hard_exclude: &[NodeId],
context: &ScheduleContext,
) -> Result<NodeId, ScheduleError> {
pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
if self.nodes.is_empty() {
return Err(ScheduleError::NoPageservers);
}
let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
let mut tenant_counts: Vec<(NodeId, usize)> = self
.nodes
.iter()
.filter_map(|(k, v)| {
if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
if hard_exclude.contains(k) || !v.may_schedule {
None
} else {
Some((
*k,
context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
v.shard_count,
))
Some((*k, v.shard_count))
}
})
.collect();
// Sort by, in order of precedence:
// 1st: Affinity score. We should never pick a higher-score node if a lower-score node is available
// 2nd: Utilization. Within nodes with the same affinity, use the least loaded nodes.
// 3rd: Node ID. This is a convenience to make selection deterministic in tests and empty systems.
scores.sort_by_key(|i| (i.1, i.2, i.0));
// Sort by tenant count. Nodes with the same tenant count are sorted by ID.
tenant_counts.sort_by_key(|i| (i.1, i.0));
if scores.is_empty() {
// After applying constraints, no pageservers were left.
if !matches!(context.mode, ScheduleMode::Speculative) {
// If this was not a speculative attempt, log details to understand why we couldn't
// schedule: this may help an engineer understand if some nodes are marked offline
// in a way that's preventing progress.
if tenant_counts.is_empty() {
// After applying constraints, no pageservers were left. We log some detail about
// the state of nodes to help understand why this happened. This is not logged as an error because
// it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
for (node_id, node) in &self.nodes {
tracing::info!(
"Scheduling failure, while excluding {hard_exclude:?}, node states:"
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule,
node.shard_count
);
for (node_id, node) in &self.nodes {
tracing::info!(
"Node {node_id}: may_schedule={} shards={}",
node.may_schedule != MaySchedule::No,
node.shard_count
);
}
}
return Err(ScheduleError::ImpossibleConstraint);
}
// Lowest score wins
let node_id = scores.first().unwrap().0;
if !matches!(context.mode, ScheduleMode::Speculative) {
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
let node_id = tenant_counts.first().unwrap().0;
tracing::info!(
"scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
);
}
// Note that we do not update shard count here to reflect the scheduling: that
// is IntentState's job when the scheduled location is used.
Ok(node_id)
}
/// Unit test access to internal state
#[cfg(test)]
pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
self.nodes.get(&node_id).unwrap().shard_count
}
}
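To make the three-level sort precedence concrete, here is a hedged, stand-alone sketch of the same sort_by_key call on hand-built score tuples (the node IDs, scores, and counts are invented for illustration):
// Lowest affinity wins; shard count breaks affinity ties; node ID breaks the rest.
let mut scores = vec![
    (NodeId(2), AffinityScore(1), 10usize),
    (NodeId(1), AffinityScore(2), 0usize),
    (NodeId(3), AffinityScore(1), 3usize),
];
scores.sort_by_key(|i| (i.1, i.2, i.0));
// NodeId(3) ties NodeId(2) on affinity but carries fewer shards; NodeId(1) is
// the least loaded overall yet loses on the higher-precedence affinity score.
assert_eq!(scores.first().unwrap().0, NodeId(3));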
#[cfg(test)]
pub(crate) mod test_utils {
use crate::node::Node;
use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
use std::collections::HashMap;
use utils::id::NodeId;
/// Test helper: synthesize the requested number of nodes, all in active state.
@@ -401,14 +264,13 @@ pub(crate) mod test_utils {
(1..n + 1)
.map(|i| {
(NodeId(i), {
let mut node = Node::new(
let node = Node::new(
NodeId(i),
format!("httphost-{i}"),
80 + i as u16,
format!("pghost-{i}"),
5432 + i as u16,
);
node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
assert!(node.is_available());
node
})
@@ -421,7 +283,7 @@ pub(crate) mod test_utils {
mod tests {
use super::*;
use crate::tenant_shard::IntentState;
use crate::tenant_state::IntentState;
#[test]
fn scheduler_basic() -> anyhow::Result<()> {
let nodes = test_utils::make_test_nodes(2);
@@ -430,17 +292,15 @@ mod tests {
let mut t1_intent = IntentState::new();
let mut t2_intent = IntentState::new();
let context = ScheduleContext::default();
let scheduled = scheduler.schedule_shard(&[], &context)?;
let scheduled = scheduler.schedule_shard(&[])?;
t1_intent.set_attached(&mut scheduler, Some(scheduled));
let scheduled = scheduler.schedule_shard(&[], &context)?;
let scheduled = scheduler.schedule_shard(&[])?;
t2_intent.set_attached(&mut scheduler, Some(scheduled));
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
t1_intent.push_secondary(&mut scheduler, scheduled);
assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);

View File

@@ -22,7 +22,6 @@ diesel::table! {
placement_policy -> Varchar,
splitting -> Int2,
config -> Text,
scheduling_policy -> Varchar,
}
}

View File

@@ -4,12 +4,8 @@ use std::{
time::Duration,
};
use crate::{
metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
persistence::TenantShardPersistence,
scheduler::{AffinityScore, MaySchedule, ScheduleContext},
};
use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
use crate::{metrics, persistence::TenantShardPersistence};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::{
models::{LocationConfig, LocationConfigMode, TenantConfig},
shard::{ShardIdentity, TenantShardId},
@@ -50,7 +46,7 @@ where
/// This struct implement Serialize for debugging purposes, but is _not_ persisted
/// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
#[derive(Serialize)]
pub(crate) struct TenantShard {
pub(crate) struct TenantState {
pub(crate) tenant_shard_id: TenantShardId,
pub(crate) shard: ShardIdentity,
@@ -117,10 +113,6 @@ pub(crate) struct TenantShard {
/// sending it. This is the mechanism by which compute notifications are included in the scope
/// of state that we publish externally in an eventually consistent way.
pub(crate) pending_compute_notification: bool,
// Support/debug tool: if something is going wrong or flapping with scheduling, this may
// be set to a non-active state to avoid making changes while the issue is fixed.
scheduling_policy: ShardSchedulingPolicy,
}
#[derive(Default, Clone, Debug, Serialize)]
@@ -251,13 +243,8 @@ impl IntentState {
impl Drop for IntentState {
fn drop(&mut self) {
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
// We do not check this while panicking, to avoid polluting unit test failures or
// other assertions with this assertion's output. It's still wrong to leak these,
// but if we already have a panic then we don't need to independently flag this case.
if !(std::thread::panicking()) {
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
// Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
debug_assert!(self.attached.is_none() && self.secondary.is_empty());
}
}
@@ -302,26 +289,6 @@ pub enum ReconcileWaitError {
Failed(TenantShardId, String),
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) struct ReplaceSecondary {
old_node_id: NodeId,
new_node_id: NodeId,
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) struct MigrateAttachment {
old_attached_node_id: NodeId,
new_attached_node_id: NodeId,
}
#[derive(Eq, PartialEq, Debug)]
pub(crate) enum ScheduleOptimization {
// Replace one of our secondary locations with a different node
ReplaceSecondary(ReplaceSecondary),
// Migrate attachment to an existing secondary location
MigrateAttachment(MigrateAttachment),
}
impl ReconcilerWaiter {
pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
tokio::select! {
@@ -354,7 +321,7 @@ pub(crate) struct ReconcilerHandle {
}
/// When a reconcile task completes, it sends this result object
/// to be applied to the primary TenantShard.
/// to be applied to the primary TenantState.
pub(crate) struct ReconcileResult {
pub(crate) sequence: Sequence,
/// On errors, `observed` should be treated as an incomplete description
@@ -367,7 +334,7 @@ pub(crate) struct ReconcileResult {
pub(crate) generation: Option<Generation>,
pub(crate) observed: ObservedState,
/// Set [`TenantShard::pending_compute_notification`] from this flag
/// Set [`TenantState::pending_compute_notification`] from this flag
pub(crate) pending_compute_notification: bool,
}
@@ -379,7 +346,7 @@ impl ObservedState {
}
}
impl TenantShard {
impl TenantState {
pub(crate) fn new(
tenant_shard_id: TenantShardId,
shard: ShardIdentity,
@@ -400,7 +367,6 @@ impl TenantShard {
error_waiter: Arc::new(SeqWait::new(Sequence(0))),
last_error: Arc::default(),
pending_compute_notification: false,
scheduling_policy: ShardSchedulingPolicy::default(),
}
}
@@ -456,7 +422,6 @@ impl TenantShard {
fn schedule_attached(
&mut self,
scheduler: &mut Scheduler,
context: &ScheduleContext,
) -> Result<(bool, NodeId), ScheduleError> {
// No work to do if we already have an attached tenant
if let Some(node_id) = self.intent.attached {
@@ -470,33 +435,14 @@ impl TenantShard {
Ok((true, promote_secondary))
} else {
// Pick a fresh node: either we had no secondaries or none were schedulable
let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
tracing::debug!("Selected {} as attached", node_id);
self.intent.set_attached(scheduler, Some(node_id));
Ok((true, node_id))
}
}
pub(crate) fn schedule(
&mut self,
scheduler: &mut Scheduler,
context: &mut ScheduleContext,
) -> Result<(), ScheduleError> {
let r = self.do_schedule(scheduler, context);
context.avoid(&self.intent.all_pageservers());
if let Some(attached) = self.intent.get_attached() {
context.push_attached(*attached);
}
r
}
pub(crate) fn do_schedule(
&mut self,
scheduler: &mut Scheduler,
context: &ScheduleContext,
) -> Result<(), ScheduleError> {
pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
// TODO: before scheduling new nodes, check if any existing content in
// self.intent refers to pageservers that are offline, and pick other
// pageservers if so.
@@ -504,16 +450,6 @@ impl TenantShard {
// TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
// change their attach location.
match self.scheduling_policy {
ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
// Warn to make it obvious why other things aren't happening/working, if we skip scheduling
tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
"Scheduling is disabled by policy {:?}", self.scheduling_policy);
return Ok(());
}
}
// Build the set of pageservers already in use by this tenant, to avoid scheduling
// more work on the same pageservers we're already using.
let mut modified = false;
@@ -521,7 +457,22 @@ impl TenantShard {
// Add/remove nodes to fulfil policy
use PlacementPolicy::*;
match self.policy {
Attached(secondary_count) => {
Single => {
// Should have exactly one attached, and zero secondaries
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
if !self.intent.secondary.is_empty() {
self.intent.clear_secondary(scheduler);
modified = true;
}
}
Double(secondary_count) => {
let retain_secondaries = if self.intent.attached.is_none()
&& scheduler.node_preferred(&self.intent.secondary).is_some()
{
@@ -540,13 +491,12 @@ impl TenantShard {
}
// Should have exactly one attached, and N secondaries
let (modified_attached, attached_node_id) =
self.schedule_attached(scheduler, context)?;
let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
modified |= modified_attached;
let mut used_pageservers = vec![attached_node_id];
while self.intent.secondary.len() < secondary_count {
let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
let node_id = scheduler.schedule_shard(&used_pageservers)?;
self.intent.push_secondary(scheduler, node_id);
used_pageservers.push(node_id);
modified = true;
@@ -559,7 +509,7 @@ impl TenantShard {
modified = true;
} else if self.intent.secondary.is_empty() {
// Populate secondary by scheduling a fresh node
let node_id = scheduler.schedule_shard(&[], context)?;
let node_id = scheduler.schedule_shard(&[])?;
self.intent.push_secondary(scheduler, node_id);
modified = true;
}
@@ -586,167 +536,6 @@ impl TenantShard {
Ok(())
}
/// Optimize attachments: if a shard has a secondary location that is preferable to
/// its primary location based on soft constraints, switch that secondary location
/// to be attached.
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn optimize_attachment(
&self,
nodes: &HashMap<NodeId, Node>,
schedule_context: &ScheduleContext,
) -> Option<ScheduleOptimization> {
let attached = (*self.intent.get_attached())?;
if self.intent.secondary.is_empty() {
// We can only do useful work if we have both attached and secondary locations: this
// function doesn't schedule new locations, only swaps between attached and secondaries.
return None;
}
let current_affinity_score = schedule_context.get_node_affinity(attached);
let current_attachment_count = schedule_context.get_node_attachments(attached);
// Generate score for each node, dropping any un-schedulable nodes.
let all_pageservers = self.intent.all_pageservers();
let mut scores = all_pageservers
.iter()
.flat_map(|node_id| {
if matches!(
nodes
.get(node_id)
.map(|n| n.may_schedule())
.unwrap_or(MaySchedule::No),
MaySchedule::No
) {
None
} else {
let affinity_score = schedule_context.get_node_affinity(*node_id);
let attachment_count = schedule_context.get_node_attachments(*node_id);
Some((*node_id, affinity_score, attachment_count))
}
})
.collect::<Vec<_>>();
// Sort precedence:
// 1st - prefer nodes with the lowest total affinity score
// 2nd - prefer nodes with the lowest number of attachments in this context
// 3rd - if all else is equal, sort by node ID for determinism in tests.
scores.sort_by_key(|i| (i.1, i.2, i.0));
if let Some((preferred_node, preferred_affinity_score, preferred_attachment_count)) =
scores.first()
{
if attached != *preferred_node {
// The best alternative must be more than 1 better than us, otherwise we could end
// up flapping back next time we're called (e.g. there's no point migrating from
// a location with score 1 to a score zero, because after the move the situation
// would be the same, but in reverse).
if current_affinity_score > *preferred_affinity_score + AffinityScore(1)
|| current_attachment_count > *preferred_attachment_count + 1
{
tracing::info!(
"Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
self.intent.get_secondary()
);
return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: attached,
new_attached_node_id: *preferred_node,
}));
}
} else {
tracing::debug!(
"Node {} is already preferred (score {:?})",
preferred_node,
preferred_affinity_score
);
}
}
// Fall-through: we didn't find an optimization
None
}
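The `+ AffinityScore(1)` margin in the comparison above is deliberate hysteresis. A worked check with invented numbers (hedged, illustrative only) shows why a one-point gap is never acted on:
// Current affinity 1 vs. best alternative 0: 1 > 0 + 1 is false, so no migration
// is proposed -- moving would merely swap the two scores and suggest the reverse
// move on the next pass. A two-point gap does clear the bar.
assert!(!(AffinityScore(1) > AffinityScore(0) + AffinityScore(1)));
assert!(AffinityScore(2) > AffinityScore(0) + AffinityScore(1));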
#[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
pub(crate) fn optimize_secondary(
&self,
scheduler: &Scheduler,
schedule_context: &ScheduleContext,
) -> Option<ScheduleOptimization> {
if self.intent.secondary.is_empty() {
// We can only do useful work if we have both attached and secondary locations: this
// function doesn't schedule new locations, only swaps between attached and secondaries.
return None;
}
for secondary in self.intent.get_secondary() {
let Some(affinity_score) = schedule_context.nodes.get(secondary) else {
// We're already on a node unaffected by any affinity constraints,
// so we won't change it.
continue;
};
// Let the scheduler suggest a node: where it would put us if we were scheduling afresh.
// This implicitly limits the choice to nodes that are available, and prefers nodes
// with lower utilization.
let Ok(candidate_node) =
scheduler.schedule_shard(&self.intent.all_pageservers(), schedule_context)
else {
// A scheduling error means we have no possible candidate replacements
continue;
};
let candidate_affinity_score = schedule_context
.nodes
.get(&candidate_node)
.unwrap_or(&AffinityScore::FREE);
// The best alternative must be more than 1 better than us, otherwise we could end
// up flapping back next time we're called.
if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
// If some other node is available and has a lower score than this node, then
// that other node is a good place to migrate to.
tracing::info!(
"Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
self.intent.get_secondary()
);
return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id: *secondary,
new_node_id: candidate_node,
}));
}
}
None
}
pub(crate) fn apply_optimization(
&mut self,
scheduler: &mut Scheduler,
optimization: ScheduleOptimization,
) {
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_schedule_optimization
.inc();
match optimization {
ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id,
new_attached_node_id,
}) => {
self.intent.demote_attached(old_attached_node_id);
self.intent
.promote_attached(scheduler, new_attached_node_id);
}
ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id,
new_node_id,
}) => {
self.intent.remove_secondary(scheduler, old_node_id);
self.intent.push_secondary(scheduler, new_node_id);
}
}
}
/// Query whether the tenant's observed state for the attached node matches its intent state, and if so,
/// yield the node ID. This is appropriate for emitting compute hook notifications: we are checking that
/// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
@@ -788,12 +577,7 @@ impl TenantShard {
.generation
.expect("Attempted to enter attached state without a generation");
let wanted_conf = attached_location_conf(
generation,
&self.shard,
&self.config,
!self.intent.secondary.is_empty(),
);
let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
match self.observed.locations.get(&node_id) {
Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
Some(_) | None => {
@@ -891,19 +675,6 @@ impl TenantShard {
}
}
// Pre-checks done: finally check whether we may actually do the work
match self.scheduling_policy {
ShardSchedulingPolicy::Active
| ShardSchedulingPolicy::Essential
| ShardSchedulingPolicy::Pause => {}
ShardSchedulingPolicy::Stop => {
// We only reach this point if there is work to do and we're going to skip
// doing it: warn to make it obvious why this tenant isn't doing what it ought to.
tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
return None;
}
}
// Build list of nodes from which the reconciler should detach
let mut detach = Vec::new();
for node_id in self.observed.locations.keys() {
@@ -957,10 +728,7 @@ impl TenantShard {
let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
tenant_id=%reconciler.tenant_shard_id.tenant_id,
shard_id=%reconciler.tenant_shard_id.shard_slug());
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_reconcile_spawn
.inc();
metrics::RECONCILER.spawned.inc();
let result_tx = result_tx.clone();
let join_handle = tokio::task::spawn(
async move {
@@ -978,12 +746,10 @@ impl TenantShard {
// TODO: wrap all remote API operations in cancellation check
// as well.
if reconciler.cancel.is_cancelled() {
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_reconcile_complete
.inc(ReconcileCompleteLabelGroup {
status: ReconcileOutcome::Cancel,
});
metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
.inc();
return;
}
@@ -998,18 +764,18 @@ impl TenantShard {
}
// Update result counter
let outcome_label = match &result {
Ok(_) => ReconcileOutcome::Success,
Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
Err(_) => ReconcileOutcome::Error,
};
metrics::METRICS_REGISTRY
.metrics_group
.storage_controller_reconcile_complete
.inc(ReconcileCompleteLabelGroup {
status: outcome_label,
});
match &result {
Ok(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
Err(ReconcileError::Cancel) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
Err(_) => metrics::RECONCILER
.complete
.with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
}
.inc();
result_tx
.send(ReconcileResult {
@@ -1040,22 +806,6 @@ impl TenantShard {
})
}
/// Get a waiter for any reconciliation in flight, but do not start reconciliation
/// if it is not already running
pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
if self.reconciler.is_some() {
Some(ReconcilerWaiter {
tenant_shard_id: self.tenant_shard_id,
seq_wait: self.waiter.clone(),
error_seq_wait: self.error_waiter.clone(),
error: self.last_error.clone(),
seq: self.sequence,
})
} else {
None
}
}
/// Called when a ReconcileResult has been emitted and the service is updating
/// our state: if the result is from a sequence >= my ReconcileHandle, then drop
/// the handle to indicate there is no longer a reconciliation in progress.
@@ -1081,40 +831,6 @@ impl TenantShard {
debug_assert!(!self.intent.all_pageservers().contains(&node_id));
}
pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
self.scheduling_policy = p;
}
pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
&self.scheduling_policy
}
pub(crate) fn from_persistent(
tsp: TenantShardPersistence,
intent: IntentState,
) -> anyhow::Result<Self> {
let tenant_shard_id = tsp.get_tenant_shard_id()?;
let shard_identity = tsp.get_shard_identity()?;
Ok(Self {
tenant_shard_id,
shard: shard_identity,
sequence: Sequence::initial(),
generation: tsp.generation.map(|g| Generation::new(g as u32)),
policy: serde_json::from_str(&tsp.placement_policy).unwrap(),
intent,
observed: ObservedState::new(),
config: serde_json::from_str(&tsp.config).unwrap(),
reconciler: None,
splitting: tsp.splitting,
waiter: Arc::new(SeqWait::new(Sequence::initial())),
error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
last_error: Arc::default(),
pending_compute_notification: false,
scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
})
}
pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
TenantShardPersistence {
tenant_id: self.tenant_shard_id.tenant_id.to_string(),
@@ -1126,7 +842,6 @@ impl TenantShard {
placement_policy: serde_json::to_string(&self.policy).unwrap(),
config: serde_json::to_string(&self.config).unwrap(),
splitting: SplitState::default(),
scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
}
}
}
@@ -1143,7 +858,7 @@ pub(crate) mod tests {
use super::*;
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
let tenant_id = TenantId::generate();
let shard_number = ShardNumber(0);
let shard_count = ShardCount::new(1);
@@ -1153,7 +868,7 @@ pub(crate) mod tests {
shard_number,
shard_count,
};
TenantShard::new(
TenantState::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
@@ -1165,32 +880,6 @@ pub(crate) mod tests {
)
}
fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
let tenant_id = TenantId::generate();
(0..shard_count.count())
.map(|i| {
let shard_number = ShardNumber(i);
let tenant_shard_id = TenantShardId {
tenant_id,
shard_number,
shard_count,
};
TenantShard::new(
tenant_shard_id,
ShardIdentity::new(
shard_number,
shard_count,
pageserver_api::shard::ShardStripeSize(32768),
)
.unwrap(),
policy.clone(),
)
})
.collect()
}
/// Test the scheduling behaviors used when a tenant configured for HA is subject
/// to nodes being marked offline.
#[test]
@@ -1200,26 +889,25 @@ pub(crate) mod tests {
let mut nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut context = ScheduleContext::default();
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
tenant_shard
.schedule(&mut scheduler, &mut context)
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
tenant_state
.schedule(&mut scheduler)
.expect("we have enough nodes, scheduling should work");
// Expect to initially be schedule on to different nodes
assert_eq!(tenant_shard.intent.secondary.len(), 1);
assert!(tenant_shard.intent.attached.is_some());
assert_eq!(tenant_state.intent.secondary.len(), 1);
assert!(tenant_state.intent.attached.is_some());
let attached_node_id = tenant_shard.intent.attached.unwrap();
let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
let attached_node_id = tenant_state.intent.attached.unwrap();
let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
assert_ne!(attached_node_id, secondary_node_id);
// Notifying the attached node is offline should demote it to a secondary
let changed = tenant_shard.intent.demote_attached(attached_node_id);
let changed = tenant_state.intent.demote_attached(attached_node_id);
assert!(changed);
assert!(tenant_shard.intent.attached.is_none());
assert_eq!(tenant_shard.intent.secondary.len(), 2);
assert!(tenant_state.intent.attached.is_none());
assert_eq!(tenant_state.intent.secondary.len(), 2);
// Update the scheduler state to indicate the node is offline
nodes
@@ -1229,18 +917,18 @@ pub(crate) mod tests {
scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
// Scheduling the node should promote the still-available secondary node to attached
tenant_shard
.schedule(&mut scheduler, &mut context)
tenant_state
.schedule(&mut scheduler)
.expect("active nodes are available");
assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
// The original attached node should have been retained as a secondary
assert_eq!(
*tenant_shard.intent.secondary.iter().last().unwrap(),
*tenant_state.intent.secondary.iter().last().unwrap(),
attached_node_id
);
tenant_shard.intent.clear(&mut scheduler);
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
@@ -1250,263 +938,48 @@ pub(crate) mod tests {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
tenant_shard.observed.locations.insert(
tenant_state.observed.locations.insert(
NodeId(3),
ObservedStateLocation {
conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedMulti,
generation: Some(2),
secondary_conf: None,
shard_number: tenant_shard.shard.number.0,
shard_count: tenant_shard.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0,
shard_number: tenant_state.shard.number.0,
shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(),
}),
},
);
tenant_shard.observed.locations.insert(
tenant_state.observed.locations.insert(
NodeId(2),
ObservedStateLocation {
conf: Some(LocationConfig {
mode: LocationConfigMode::AttachedStale,
generation: Some(1),
secondary_conf: None,
shard_number: tenant_shard.shard.number.0,
shard_count: tenant_shard.shard.count.literal(),
shard_stripe_size: tenant_shard.shard.stripe_size.0,
shard_number: tenant_state.shard.number.0,
shard_count: tenant_state.shard.count.literal(),
shard_stripe_size: tenant_state.shard.stripe_size.0,
tenant_conf: TenantConfig::default(),
}),
},
);
tenant_shard.intent_from_observed(&mut scheduler);
tenant_state.intent_from_observed(&mut scheduler);
// The highest generationed attached location gets used as attached
assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
// Other locations get used as secondary
assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
tenant_shard.intent.clear(&mut scheduler);
Ok(())
}
#[test]
fn scheduling_mode() -> anyhow::Result<()> {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
// In pause mode, schedule() shouldn't do anything
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
assert!(tenant_shard
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(tenant_shard.intent.all_pageservers().is_empty());
// In active mode, schedule() works
tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
assert!(tenant_shard
.schedule(&mut scheduler, &mut ScheduleContext::default())
.is_ok());
assert!(!tenant_shard.intent.all_pageservers().is_empty());
tenant_shard.intent.clear(&mut scheduler);
Ok(())
}
#[test]
fn optimize_attachment() -> anyhow::Result<()> {
let nodes = make_test_nodes(3);
let mut scheduler = Scheduler::new(nodes.values());
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
// Initially: both shards are attached to node 1, and each has a secondary location
// on a different node.
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
// Either shard should recognize that it has the option to switch to a secondary location where there
// would be no other shards from the same tenant, and request to do so.
assert_eq!(
optimization_a,
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: NodeId(1),
new_attached_node_id: NodeId(2)
}))
);
// Note that optimizing two shards in the same tenant with the same ScheduleContext is
// mutually exclusive (the optimization of one invalidates the other's stats): it is the
// responsibility of [`Service::optimize_all`] to avoid trying to do optimizations for
// multiple shards in the same tenant at the same time. Generating both optimizations
// here is done only for test purposes.
let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
assert_eq!(
optimization_b,
Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
old_attached_node_id: NodeId(1),
new_attached_node_id: NodeId(3)
}))
);
// Applying these optimizations should result in the end state proposed
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
shard_a.intent.clear(&mut scheduler);
shard_b.intent.clear(&mut scheduler);
Ok(())
}
#[test]
fn optimize_secondary() -> anyhow::Result<()> {
let nodes = make_test_nodes(4);
let mut scheduler = Scheduler::new(nodes.values());
let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
// Initially: the shards are attached to two different nodes, and both have their
// secondary location on the same node.
shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
let mut schedule_context = ScheduleContext::default();
schedule_context.avoid(&shard_a.intent.all_pageservers());
schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
schedule_context.avoid(&shard_b.intent.all_pageservers());
schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
// Since there is a node with no locations available, the node with two locations for the
// same tenant should generate an optimization to move one away
assert_eq!(
optimization_a,
Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
old_node_id: NodeId(3),
new_node_id: NodeId(4)
}))
);
shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
shard_a.intent.clear(&mut scheduler);
shard_b.intent.clear(&mut scheduler);
Ok(())
}
// Optimize til quiescent: this emulates what Service::optimize_all does, when
// called repeatedly in the background.
fn optimize_til_idle(
nodes: &HashMap<NodeId, Node>,
scheduler: &mut Scheduler,
shards: &mut [TenantShard],
) {
let mut loop_n = 0;
loop {
let mut schedule_context = ScheduleContext::default();
let mut any_changed = false;
for shard in shards.iter() {
schedule_context.avoid(&shard.intent.all_pageservers());
if let Some(attached) = shard.intent.get_attached() {
schedule_context.push_attached(*attached);
}
}
for shard in shards.iter_mut() {
let optimization = shard.optimize_attachment(nodes, &schedule_context);
if let Some(optimization) = optimization {
shard.apply_optimization(scheduler, optimization);
any_changed = true;
break;
}
let optimization = shard.optimize_secondary(scheduler, &schedule_context);
if let Some(optimization) = optimization {
shard.apply_optimization(scheduler, optimization);
any_changed = true;
break;
}
}
if !any_changed {
break;
}
// Assert no infinite loop
loop_n += 1;
assert!(loop_n < 1000);
}
}
/// Test the balancing behavior of shard scheduling: that it achieves a balance, and
/// that it converges.
#[test]
fn optimize_add_nodes() -> anyhow::Result<()> {
let nodes = make_test_nodes(4);
// Only show the scheduler a couple of nodes
let mut scheduler = Scheduler::new([].iter());
scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
let mut schedule_context = ScheduleContext::default();
for shard in &mut shards {
assert!(shard
.schedule(&mut scheduler, &mut schedule_context)
.is_ok());
}
// We should see equal number of locations on the two nodes.
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
// Add another two nodes: we should see the shards spread out when their optimize
// methods are called
scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
optimize_til_idle(&nodes, &mut scheduler, &mut shards);
assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
for shard in shards.iter_mut() {
shard.intent.clear(&mut scheduler);
}
scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
tenant_state.intent.clear(&mut scheduler);
Ok(())
}
}

View File

@@ -86,10 +86,7 @@ where
.stdout(process_log_file)
.stderr(same_file_for_stderr)
.args(args);
let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
fill_rust_env_vars(background_command),
));
let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
filled_cmd.envs(envs);
let pid_file_to_check = match &initial_pid_file {
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
cmd
}
fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
for (var, val) in std::env::vars() {
if var.starts_with("NEON_PAGESERVER_") {
cmd = cmd.env(var, val);
}
}
cmd
}
/// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
/// 1. Claims a pidfile with a fcntl lock on it and
/// 2. Sets up the pidfile's file descriptor so that it (and the lock)
@@ -306,7 +294,7 @@ where
// is in state 'taken' but the thread that would unlock it is
// not there.
// 2. A rust object that represented some external resource in the
// parent now got implicitly copied by the fork, even though
// parent now got implicitly copied by the the fork, even though
// the object's type is not `Copy`. The parent program may use
// non-copyability as way to enforce unique ownership of an
// external resource in the typesystem. The fork breaks that

View File

@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
use control_plane::safekeeper::SafekeeperNode;
use control_plane::storage_controller::StorageController;
use control_plane::{broker, local_env};
use pageserver_api::controller_api::PlacementPolicy;
use pageserver_api::controller_api::{
NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
};
use pageserver_api::models::{
ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
};
@@ -435,7 +437,7 @@ async fn handle_tenant(
let placement_policy = match create_match.get_one::<String>("placement-policy") {
Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
_ => PlacementPolicy::Attached(0),
_ => PlacementPolicy::Single,
};
let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
@@ -521,6 +523,88 @@ async fn handle_tenant(
.with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
println!("tenant {tenant_id} successfully configured on the pageserver");
}
Some(("migrate", matches)) => {
let tenant_shard_id = get_tenant_shard_id(matches, env)?;
let new_pageserver = get_pageserver(env, matches)?;
let new_pageserver_id = new_pageserver.conf.id;
let storage_controller = StorageController::from_env(env);
storage_controller
.tenant_migrate(tenant_shard_id, new_pageserver_id)
.await?;
println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
}
Some(("status", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let mut shard_table = comfy_table::Table::new();
shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
let mut tenant_synthetic_size = None;
let storage_controller = StorageController::from_env(env);
for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
let pageserver =
PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
let size = pageserver
.http_client
.tenant_details(shard.shard_id)
.await?
.tenant_info
.current_physical_size
.unwrap();
shard_table.add_row([
format!("{}", shard.shard_id.shard_slug()),
format!("{}", shard.node_id.0),
format!("{} MiB", size / (1024 * 1024)),
]);
if shard.shard_id.is_zero() {
tenant_synthetic_size =
Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
}
}
let Some(synthetic_size) = tenant_synthetic_size else {
bail!("Shard 0 not found")
};
let mut tenant_table = comfy_table::Table::new();
tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
tenant_table.add_row([
"Synthetic size".to_string(),
format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
]);
println!("{tenant_table}");
println!("{shard_table}");
}
Some(("shard-split", matches)) => {
let tenant_id = get_tenant_id(matches, env)?;
let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
let shard_stripe_size: Option<ShardStripeSize> = matches
.get_one::<Option<ShardStripeSize>>("shard-stripe-size")
.cloned()
.unwrap();
let storage_controller = StorageController::from_env(env);
let result = storage_controller
.tenant_split(tenant_id, shard_count, shard_stripe_size)
.await?;
println!(
"Split tenant {} into shards {}",
tenant_id,
result
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
}
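For orientation, a hedged usage sketch of the new subcommands through neon_local; the `cargo neon` alias and the placeholder tenant ID are assumptions, while the flag names match the definitions added further down:
# Split a tenant into 4 shards, then inspect the resulting layout
cargo neon tenant shard-split --tenant-id <tenant_id> --shard-count 4
cargo neon tenant status --tenant-id <tenant_id>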
Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
None => bail!("no tenant subcommand provided"),
@@ -1058,6 +1142,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
}
}
Some(("set-state", subcommand_args)) => {
let pageserver = get_pageserver(env, subcommand_args)?;
let scheduling = subcommand_args.get_one("scheduling");
let availability = subcommand_args.get_one("availability");
let storage_controller = StorageController::from_env(env);
storage_controller
.node_configure(NodeConfigureRequest {
node_id: pageserver.conf.id,
scheduling: scheduling.cloned(),
availability: availability.cloned(),
})
.await?;
}
Some(("status", subcommand_args)) => {
match get_pageserver(env, subcommand_args)?.check_status().await {
Ok(_) => println!("Page server is up and running"),
@@ -1231,7 +1330,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
match ComputeControlPlane::load(env.clone()) {
Ok(cplane) => {
for (_k, node) in cplane.endpoints {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
eprintln!("postgres stop failed: {e:#}");
}
}
@@ -1417,7 +1516,6 @@ fn cli() -> Command {
.subcommand(
Command::new("timeline")
.about("Manage timelines")
.arg_required_else_help(true)
.subcommand(Command::new("list")
.about("List all timelines, available to this pageserver")
.arg(tenant_id_arg.clone()))
@@ -1480,6 +1578,19 @@ fn cli() -> Command {
.subcommand(Command::new("config")
.arg(tenant_id_arg.clone())
.arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
.subcommand(Command::new("migrate")
.about("Migrate a tenant from one pageserver to another")
.arg(tenant_id_arg.clone())
.arg(pageserver_id_arg.clone()))
.subcommand(Command::new("status")
.about("Human readable summary of the tenant's shards and attachment locations")
.arg(tenant_id_arg.clone()))
.subcommand(Command::new("shard-split")
.about("Increase the number of shards in the tenant")
.arg(tenant_id_arg.clone())
.arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
.arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
)
)
.subcommand(
Command::new("pageserver")
@@ -1499,6 +1610,12 @@ fn cli() -> Command {
.about("Restart local pageserver")
.arg(pageserver_config_args.clone())
)
.subcommand(Command::new("set-state")
.arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
.arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
.about("Set scheduling or availability state of pageserver node")
.arg(pageserver_config_args.clone())
)
)
.subcommand(
Command::new("storage_controller")

View File

@@ -12,7 +12,7 @@
//!
//! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
//! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
//! the basebackup from the pageserver to initialize the data directory, and
//! the basebackup from the pageserver to initialize the the data directory, and
//! finally launches the PostgreSQL process. It watches the PostgreSQL process
//! until it exits.
//!

View File

@@ -114,7 +114,7 @@ impl NeonBroker {
}
#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
#[serde(default, deny_unknown_fields)]
#[serde(default)]
pub struct PageServerConf {
// node id
pub id: NodeId,
@@ -126,9 +126,6 @@ pub struct PageServerConf {
// auth type used for the PG and HTTP ports
pub pg_auth_type: AuthType,
pub http_auth_type: AuthType,
pub(crate) virtual_file_io_engine: Option<String>,
pub(crate) get_vectored_impl: Option<String>,
}
impl Default for PageServerConf {
@@ -139,8 +136,6 @@ impl Default for PageServerConf {
listen_http_addr: String::new(),
pg_auth_type: AuthType::Trust,
http_auth_type: AuthType::Trust,
virtual_file_io_engine: None,
get_vectored_impl: None,
}
}
}
@@ -156,7 +151,6 @@ pub struct SafekeeperConf {
pub remote_storage: Option<String>,
pub backup_threads: Option<u32>,
pub auth_enabled: bool,
pub listen_addr: Option<String>,
}
impl Default for SafekeeperConf {
@@ -170,7 +164,6 @@ impl Default for SafekeeperConf {
remote_storage: None,
backup_threads: None,
auth_enabled: false,
listen_addr: None,
}
}
}

View File

@@ -78,39 +78,18 @@ impl PageServerNode {
///
/// These all end up on the command line of the `pageserver` binary.
fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
let id = format!("id={}", self.conf.id);
// FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
let pg_distrib_dir_param = format!(
"pg_distrib_dir='{}'",
self.env.pg_distrib_dir_raw().display()
);
let PageServerConf {
id,
listen_pg_addr,
listen_http_addr,
pg_auth_type,
http_auth_type,
virtual_file_io_engine,
get_vectored_impl,
} = &self.conf;
let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
let id = format!("id={}", id);
let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
format!("virtual_file_io_engine='{virtual_file_io_engine}'")
} else {
String::new()
};
let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
format!("get_vectored_impl='{get_vectored_impl}'")
} else {
String::new()
};
let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -122,8 +101,6 @@ impl PageServerNode {
listen_http_addr_param,
listen_pg_addr_param,
broker_endpoint_param,
virtual_file_io_engine,
get_vectored_impl,
];
if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -134,7 +111,7 @@ impl PageServerNode {
// Storage controller uses the same auth as pageserver: if JWT is enabled
// for us, we will also need it to talk to them.
if matches!(http_auth_type, AuthType::NeonJWT) {
if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
let jwt_token = self
.env
.generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -152,7 +129,8 @@ impl PageServerNode {
));
}
if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
{
// Keys are generated in the toplevel repo dir, pageservers' workdirs
// are one level below that, so refer to keys with ../
overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
@@ -389,10 +367,6 @@ impl PageServerNode {
.remove("image_creation_threshold")
.map(|x| x.parse::<usize>())
.transpose()?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
@@ -410,6 +384,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
image_layer_compression: settings
.remove("image_layer_compression")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'image_layer_compression' json")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
@@ -505,12 +484,6 @@ impl PageServerNode {
.map(|x| x.parse::<usize>())
.transpose()
.context("Failed to parse 'image_creation_threshold' as non zero integer")?,
image_layer_creation_check_threshold: settings
.remove("image_layer_creation_check_threshold")
.map(|x| x.parse::<u8>())
.transpose()
.context("Failed to parse 'image_creation_check_threshold' as integer")?,
pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
walreceiver_connect_timeout: settings
.remove("walreceiver_connect_timeout")
@@ -528,6 +501,11 @@ impl PageServerNode {
.map(|x| x.parse::<bool>())
.transpose()
.context("Failed to parse 'trace_read_requests' as bool")?,
image_layer_compression: settings
.remove("image_layer_compression")
.map(serde_json::from_str)
.transpose()
.context("Failed to parse 'image_layer_compression' json")?,
eviction_policy: settings
.remove("eviction_policy")
.map(serde_json::from_str)
@@ -586,6 +564,13 @@ impl PageServerNode {
Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
}
pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
Ok(self
.http_client
.tenant_secondary_download(*tenant_id)
.await?)
}
pub async fn timeline_create(
&self,
tenant_shard_id: TenantShardId,

View File

@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
pub pg_connection_config: PgConnectionConfig,
pub env: LocalEnv,
pub http_client: reqwest::Client,
pub listen_addr: String,
pub http_base_url: String,
}
impl SafekeeperNode {
pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
listen_addr.clone()
} else {
"127.0.0.1".to_string()
};
SafekeeperNode {
id: conf.id,
conf: conf.clone(),
pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
env: env.clone(),
http_client: reqwest::Client::new(),
http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
listen_addr,
http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
}
}
/// Construct libpq connection string for connecting to this safekeeper.
fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
}
pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -118,8 +111,8 @@ impl SafekeeperNode {
);
io::stdout().flush().unwrap();
let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
let id = self.id;
let datadir = self.datadir_path();
@@ -146,7 +139,7 @@ impl SafekeeperNode {
availability_zone,
];
if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
}
if !self.conf.sync {

View File

@@ -38,9 +38,6 @@ const COMMAND: &str = "storage_controller";
const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
// Use a shorter pageserver unavailability interval than the default to speed up tests.
const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
#[derive(Serialize, Deserialize)]
pub struct AttachHookRequest {
pub tenant_shard_id: TenantShardId,
@@ -272,18 +269,13 @@ impl StorageController {
// Run migrations on every startup, in case something changed.
let database_url = self.setup_database().await?;
let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
let mut args = vec![
"-l",
&self.listen,
"-p",
self.path.as_ref(),
"--dev",
"--database-url",
&database_url,
"--max-unavailable-interval",
&max_unavailable.to_string(),
]
.into_iter()
.map(|s| s.to_string())
@@ -476,7 +468,7 @@ impl StorageController {
pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
self.dispatch::<(), _>(
Method::GET,
format!("debug/v1/tenant/{tenant_id}/locate"),
format!("control/v1/tenant/{tenant_id}/locate"),
None,
)
.await

View File

@@ -1,23 +0,0 @@
[package]
name = "storcon_cli"
version = "0.1.0"
edition.workspace = true
license.workspace = true
[dependencies]
anyhow.workspace = true
clap.workspace = true
comfy-table.workspace = true
hyper.workspace = true
pageserver_api.workspace = true
pageserver_client.workspace = true
reqwest.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["raw_value"] }
thiserror.workspace = true
tokio.workspace = true
tracing.workspace = true
utils.workspace = true
workspace_hack.workspace = true

View File

@@ -1,681 +0,0 @@
use std::{collections::HashMap, str::FromStr, time::Duration};
use clap::{Parser, Subcommand};
use hyper::{Method, StatusCode};
use pageserver_api::{
controller_api::{
NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
TenantDescribeResponse, TenantPolicyRequest,
},
models::{
LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
},
shard::{ShardStripeSize, TenantShardId},
};
use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
use reqwest::Url;
use serde::{de::DeserializeOwned, Serialize};
use utils::id::{NodeId, TenantId};
use pageserver_api::controller_api::{
NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
};
#[derive(Subcommand, Debug)]
enum Command {
/// Register a pageserver with the storage controller. This shouldn't usually be necessary,
/// since pageservers auto-register when they start up
NodeRegister {
#[arg(long)]
node_id: NodeId,
#[arg(long)]
listen_pg_addr: String,
#[arg(long)]
listen_pg_port: u16,
#[arg(long)]
listen_http_addr: String,
#[arg(long)]
listen_http_port: u16,
},
/// Modify a node's configuration in the storage controller
NodeConfigure {
#[arg(long)]
node_id: NodeId,
/// Availability is usually auto-detected based on heartbeats. Set 'offline' here to
/// manually mark a node offline
#[arg(long)]
availability: Option<NodeAvailabilityArg>,
/// Scheduling policy controls whether tenant shards may be scheduled onto this node.
#[arg(long)]
scheduling: Option<NodeSchedulingPolicy>,
},
/// Modify a tenant's policies in the storage controller
TenantPolicy {
#[arg(long)]
tenant_id: TenantId,
/// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
/// or is in the normal attached state with N secondary locations (`attached:N`)
#[arg(long)]
placement: Option<PlacementPolicyArg>,
/// Scheduling policy enables pausing the controller's scheduling activity involving this tenant. `active` is normal,
/// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
/// all reconciliation activity including for scheduling changes already made. `pause` and `stop` can make a tenant
/// unavailable, and are only for use in emergencies.
#[arg(long)]
scheduling: Option<ShardSchedulingPolicyArg>,
},
/// List nodes known to the storage controller
Nodes {},
/// List tenants known to the storage controller
Tenants {},
/// Create a new tenant in the storage controller, and by extension on pageservers.
TenantCreate {
#[arg(long)]
tenant_id: TenantId,
},
/// Delete a tenant in the storage controller, and by extension on pageservers.
TenantDelete {
#[arg(long)]
tenant_id: TenantId,
},
/// Split an existing tenant into a higher number of shards than its current shard count.
TenantShardSplit {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
shard_count: u8,
/// Optional, in 8kiB pages. e.g. set 2048 for 16MB stripes.
#[arg(long)]
stripe_size: Option<u32>,
},
/// Migrate the attached location for a tenant shard to a specific pageserver.
TenantShardMigrate {
#[arg(long)]
tenant_shard_id: TenantShardId,
#[arg(long)]
node: NodeId,
},
/// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
/// that is passed through to pageservers, and does not affect storage controller behavior.
TenantConfig {
#[arg(long)]
tenant_id: TenantId,
#[arg(long)]
config: String,
},
/// Attempt to balance the locations for a tenant across pageservers. This is a client-side
/// alternative to the storage controller's scheduling optimization behavior.
TenantScatter {
#[arg(long)]
tenant_id: TenantId,
},
/// Print details about a particular tenant, including all its shards' states.
TenantDescribe {
#[arg(long)]
tenant_id: TenantId,
},
/// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
/// mode so that it can warm up content on a pageserver.
TenantWarmup {
#[arg(long)]
tenant_id: TenantId,
},
}
#[derive(Parser)]
#[command(
author,
version,
about,
long_about = "CLI for Storage Controller Support/Debug"
)]
#[command(arg_required_else_help(true))]
struct Cli {
#[arg(long)]
/// URL to storage controller. e.g. http://127.0.0.1:1234 when using `neon_local`
api: Url,
#[arg(long)]
/// JWT token for authenticating with storage controller. Depending on the API used, this
/// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
/// a token with both scopes to use with this tool.
jwt: Option<String>,
#[command(subcommand)]
command: Command,
}
#[derive(Debug, Clone)]
struct PlacementPolicyArg(PlacementPolicy);
impl FromStr for PlacementPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"detached" => Ok(Self(PlacementPolicy::Detached)),
"secondary" => Ok(Self(PlacementPolicy::Secondary)),
_ if s.starts_with("attached:") => {
let mut splitter = s.split(':');
let _prefix = splitter.next().unwrap();
match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
None => Err(anyhow::anyhow!(
"Invalid format '{s}', a valid example is 'attached:1'"
)),
}
}
_ => Err(anyhow::anyhow!(
"Unknown placement policy '{s}', try detached,secondary,attached:<n>"
)),
}
}
}
#[derive(Debug, Clone)]
struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
impl FromStr for ShardSchedulingPolicyArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self(ShardSchedulingPolicy::Active)),
"essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
"pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
"stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
_ => Err(anyhow::anyhow!(
"Unknown scheduling policy '{s}', try active,essential,pause,stop"
)),
}
}
}
#[derive(Debug, Clone)]
struct NodeAvailabilityArg(NodeAvailabilityWrapper);
impl FromStr for NodeAvailabilityArg {
type Err = anyhow::Error;
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self(NodeAvailabilityWrapper::Active)),
"offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
struct Client {
base_url: Url,
jwt_token: Option<String>,
client: reqwest::Client,
}
impl Client {
fn new(base_url: Url, jwt_token: Option<String>) -> Self {
Self {
base_url,
jwt_token,
client: reqwest::ClientBuilder::new()
.build()
.expect("Failed to construct http client"),
}
}
/// Simple HTTP request wrapper for calling into storage controller
async fn dispatch<RQ, RS>(
&self,
method: hyper::Method,
path: String,
body: Option<RQ>,
) -> mgmt_api::Result<RS>
where
RQ: Serialize + Sized,
RS: DeserializeOwned + Sized,
{
// The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
// for general purpose API access.
let url = Url::from_str(&format!(
"http://{}:{}/{path}",
self.base_url.host_str().unwrap(),
self.base_url.port().unwrap()
))
.unwrap();
let mut builder = self.client.request(method, url);
if let Some(body) = body {
builder = builder.json(&body)
}
if let Some(jwt_token) = &self.jwt_token {
builder = builder.header(
reqwest::header::AUTHORIZATION,
format!("Bearer {jwt_token}"),
);
}
let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
let response = response.error_from_body().await?;
response
.json()
.await
.map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
}
}
#[tokio::main]
async fn main() -> anyhow::Result<()> {
let cli = Cli::parse();
let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
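// Url::to_string() renders the URL with a trailing '/': drop it so the
// management API client gets a clean base URL.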
let mut trimmed = cli.api.to_string();
trimmed.pop();
let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
match cli.command {
Command::NodeRegister {
node_id,
listen_pg_addr,
listen_pg_port,
listen_http_addr,
listen_http_port,
} => {
storcon_client
.dispatch::<_, ()>(
Method::POST,
"control/v1/node".to_string(),
Some(NodeRegisterRequest {
node_id,
listen_pg_addr,
listen_pg_port,
listen_http_addr,
listen_http_port,
}),
)
.await?;
}
Command::TenantCreate { tenant_id } => {
vps_client
.tenant_create(&TenantCreateRequest {
new_tenant_id: TenantShardId::unsharded(tenant_id),
generation: None,
shard_parameters: ShardParameters::default(),
placement_policy: Some(PlacementPolicy::Attached(1)),
config: TenantConfig::default(),
})
.await?;
}
Command::TenantDelete { tenant_id } => {
let status = vps_client
.tenant_delete(TenantShardId::unsharded(tenant_id))
.await?;
tracing::info!("Delete status: {}", status);
}
Command::Nodes {} => {
let resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
let mut table = comfy_table::Table::new();
table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
for node in resp {
table.add_row([
format!("{}", node.id),
node.listen_http_addr,
format!("{:?}", node.scheduling),
format!("{:?}", node.availability),
]);
}
println!("{table}");
}
Command::NodeConfigure {
node_id,
availability,
scheduling,
} => {
let req = NodeConfigureRequest {
node_id,
availability: availability.map(|a| a.0),
scheduling,
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/node/{node_id}/config"),
Some(req),
)
.await?;
}
Command::Tenants {} => {
let resp = storcon_client
.dispatch::<(), Vec<TenantDescribeResponse>>(
Method::GET,
"control/v1/tenant".to_string(),
None,
)
.await?;
let mut table = comfy_table::Table::new();
table.set_header([
"TenantId",
"ShardCount",
"StripeSize",
"Placement",
"Scheduling",
]);
for tenant in resp {
let shard_zero = tenant.shards.into_iter().next().unwrap();
table.add_row([
format!("{}", tenant.tenant_id),
format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
format!("{:?}", tenant.stripe_size),
format!("{:?}", tenant.policy),
format!("{:?}", shard_zero.scheduling_policy),
]);
}
println!("{table}");
}
Command::TenantPolicy {
tenant_id,
placement,
scheduling,
} => {
let req = TenantPolicyRequest {
scheduling: scheduling.map(|s| s.0),
placement: placement.map(|p| p.0),
};
storcon_client
.dispatch::<_, ()>(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/policy"),
Some(req),
)
.await?;
}
Command::TenantShardSplit {
tenant_id,
shard_count,
stripe_size,
} => {
let req = TenantShardSplitRequest {
new_shard_count: shard_count,
new_stripe_size: stripe_size.map(ShardStripeSize),
};
let response = storcon_client
.dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_id}/shard_split"),
Some(req),
)
.await?;
println!(
"Split tenant {} into {} shards: {}",
tenant_id,
shard_count,
response
.new_shards
.iter()
.map(|s| format!("{:?}", s))
.collect::<Vec<_>>()
.join(",")
);
}
Command::TenantShardMigrate {
tenant_shard_id,
node,
} => {
let req = TenantShardMigrateRequest {
tenant_shard_id,
node_id: node,
};
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{tenant_shard_id}/migrate"),
Some(req),
)
.await?;
}
Command::TenantConfig { tenant_id, config } => {
let tenant_conf = serde_json::from_str(&config)?;
vps_client
.tenant_config(&TenantConfigRequest {
tenant_id,
config: tenant_conf,
})
.await?;
}
Command::TenantScatter { tenant_id } => {
// Find the shards
let locate_response = storcon_client
.dispatch::<(), TenantLocateResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}/locate"),
None,
)
.await?;
let shards = locate_response.shards;
let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
let shard_count = shards.len();
for s in shards {
let entry = node_to_shards.entry(s.node_id).or_default();
entry.push(s.shard_id);
}
// Load list of available nodes
let nodes_resp = storcon_client
.dispatch::<(), Vec<NodeDescribeResponse>>(
Method::GET,
"control/v1/node".to_string(),
None,
)
.await?;
for node in nodes_resp {
if matches!(node.availability, NodeAvailabilityWrapper::Active) {
node_to_shards.entry(node.id).or_default();
}
}
let max_shard_per_node = shard_count / node_to_shards.len();
loop {
let mut migrate_shard = None;
for shards in node_to_shards.values_mut() {
if shards.len() > max_shard_per_node {
// This node is over its fair share: take one of its shards to migrate away
migrate_shard = Some(shards.pop().unwrap());
}
}
let Some(migrate_shard) = migrate_shard else {
break;
};
// Pick the emptiest node to migrate to
let mut destinations = node_to_shards
.iter()
.map(|(k, v)| (k, v.len()))
.collect::<Vec<_>>();
destinations.sort_by_key(|i| i.1);
let (destination_node, destination_count) = *destinations.first().unwrap();
if destination_count + 1 > max_shard_per_node {
// Even the emptiest destination doesn't have space: we're done
break;
}
let destination_node = *destination_node;
node_to_shards
.get_mut(&destination_node)
.unwrap()
.push(migrate_shard);
println!("Migrate {} -> {} ...", migrate_shard, destination_node);
storcon_client
.dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
Method::PUT,
format!("control/v1/tenant/{migrate_shard}/migrate"),
Some(TenantShardMigrateRequest {
tenant_shard_id: migrate_shard,
node_id: destination_node,
}),
)
.await?;
println!("Migrate {} -> {} OK", migrate_shard, destination_node);
}
// Spread the shards across the nodes
}
Command::TenantDescribe { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let shards = describe_response.shards;
let mut table = comfy_table::Table::new();
table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
for shard in shards {
let secondary = shard
.node_secondary
.iter()
.map(|n| format!("{}", n))
.collect::<Vec<_>>()
.join(",");
let mut status_parts = Vec::new();
if shard.is_reconciling {
status_parts.push("reconciling");
}
if shard.is_pending_compute_notification {
status_parts.push("pending_compute");
}
if shard.is_splitting {
status_parts.push("splitting");
}
let status = status_parts.join(",");
table.add_row([
format!("{}", shard.tenant_shard_id),
shard
.node_attached
.map(|n| format!("{}", n))
.unwrap_or(String::new()),
secondary,
shard.last_error,
status,
]);
}
println!("{table}");
}
Command::TenantWarmup { tenant_id } => {
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await;
match describe_response {
Ok(describe) => {
if matches!(describe.policy, PlacementPolicy::Secondary) {
// Fine: it's already known to controller in secondary mode: calling
// again to put it into secondary mode won't cause problems.
} else {
anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
}
}
Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
// Fine: this tenant isn't known to the storage controller yet.
}
Err(e) => {
// Unexpected API error
return Err(e.into());
}
}
vps_client
.location_config(
TenantShardId::unsharded(tenant_id),
pageserver_api::models::LocationConfig {
mode: pageserver_api::models::LocationConfigMode::Secondary,
generation: None,
secondary_conf: Some(LocationConfigSecondary { warm: true }),
shard_number: 0,
shard_count: 0,
shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
tenant_conf: TenantConfig::default(),
},
None,
true,
)
.await?;
let describe_response = storcon_client
.dispatch::<(), TenantDescribeResponse>(
Method::GET,
format!("control/v1/tenant/{tenant_id}"),
None,
)
.await?;
let secondary_ps_id = describe_response
.shards
.first()
.unwrap()
.node_secondary
.first()
.unwrap();
println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
loop {
let (status, progress) = vps_client
.tenant_secondary_download(
TenantShardId::unsharded(tenant_id),
Some(Duration::from_secs(10)),
)
.await?;
println!(
"Progress: {}/{} layers, {}/{} bytes",
progress.layers_downloaded,
progress.layers_total,
progress.bytes_downloaded,
progress.bytes_total
);
match status {
StatusCode::OK => {
println!("Download complete");
break;
}
StatusCode::ACCEPTED => {
// Loop
}
_ => {
anyhow::bail!("Unexpected download status: {status}");
}
}
}
}
}
Ok(())
}

View File

@@ -2,8 +2,8 @@
# see https://diesel.rs/guides/configuring-diesel-cli
[print_schema]
file = "storage_controller/src/schema.rs"
file = "control_plane/attachment_service/src/schema.rs"
custom_type_derives = ["diesel::query_builder::QueryId"]
[migrations_directory]
dir = "storage_controller/migrations"
dir = "control_plane/attachment_service/migrations"

View File

@@ -1,408 +0,0 @@
# Sharding Phase 1: Static Key-space Sharding
## Summary
To enable databases with sizes approaching the capacity of a pageserver's disk,
it is necessary to break up the storage for the database, or _shard_ it.
Sharding in general is a complex area. This RFC aims to define an initial
capability that will permit creating large-capacity databases using a static configuration
defined at time of Tenant creation.
## Motivation
Currently, all data for a Tenant, including all its timelines, is stored on a single
pageserver. The local storage required may be several times larger than the actual
database size, due to LSM write inflation.
If a database is larger than what one pageserver can hold, then it becomes impossible
for the pageserver to hold it in local storage, as it must do to provide service to
clients.
### Prior art
In Neon:
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
- Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
- Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
Prior art in other distributed systems is too broad to capture here: pretty much
any scale out storage system does something like this.
## Requirements
- Enable creating a large (for example, 16TiB) database without requiring dedicated
pageserver nodes.
- Share read/write bandwidth costs for large databases across pageservers, as well
as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
that disrupt service to other tenants.
- Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
does not write out a single contiguous range of page numbers.
_Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
that a user might create on a current-gen enterprise SSD should also work well on
Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
pageserver backend is not the limiting factor in the database size_.
## Non Goals
- Independently distributing timelines within the same tenant. If a tenant has many
timelines, then sharding may be a less efficient mechanism for distributing load than
sharing out timelines between pageservers.
- Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
based on the idea that separate mechanisms will make sense for each dimension.
## Impacted Components
pageserver, control plane, postgres/smgr
## Terminology
**Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
the page number is the key in that store. `Key` is a literal data type in existing code.
**LSN dimension**: this just means the range of LSNs (history), when talking about the range
of keys and LSNs as a two dimensional space.
## Implementation
### Key sharding vs. LSN sharding
When we think of sharding across the two dimensional key/lsn space, this is an
opportunity to think about how the two dimensions differ:
- Sharding the key space distributes the _write_ workload of ingesting data
and compacting. This work must be carefully managed so that exactly one
node owns a given key.
- Sharding the LSN space distributes the _historical read_ workload. This work
can be done by anyone without any special coordination, as long as they can
see the remote index and layers.
The key sharding is the harder part, and also the more urgent one, to support larger
capacity databases. Because distributing historical LSN read work is a relatively
simpler problem that most users don't have, we defer it to future work. It is anticipated
that some quite simple P2P offload model will enable distributing work for historical
reads: a node which is low on space can call out to a peer to ask it to download and
serve reads from a historical layer.
### Key mapping scheme
Having decided to focus on key sharding, we must next decide how we will map
keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
between data locality and avoiding entire large relations mapping to the same shard.
We will define two spaces:
- Key space: unsigned integer
- Shard space: integer from 0 to N-1, where we have N shards.
### Key -> Shard mapping
Keys are currently defined in the pageserver's getpage@lsn interface as follows:
```
pub struct Key {
pub field1: u8,
pub field2: u32,
pub field3: u32,
pub field4: u32,
pub field5: u8,
pub field6: u32,
}
fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
Key {
field1: 0x00,
field2: rel.spcnode,
field3: rel.dbnode,
field4: rel.relnode,
field5: rel.forknum,
field6: blknum,
}
}
```
_Note: keys for relation metadata are ignored here, as this data will be mirrored to all
shards. For distribution purposes, we only care about user data keys_
The properties we want from our Key->Shard mapping are:
- Locality in `blknum`, such that adjacent `blknum` will usually map to
the same stripe and consequently land on the same shard, even though the overall
collection of blocks in a relation will be spread over many stripes and therefore
many shards.
- Avoid the same blknum on different relations landing on the same stripe, so that
with many small relations we do not end up aliasing data to the same stripe/shard.
- Avoid vulnerability to aliasing in the values of relation identity fields, such that
if there are patterns in the value of `relnode`, these do not manifest as patterns
in data placement.
To accomplish this, the blknum is used to select a stripe, and stripes are
assigned to shards in a pseudorandom order via a hash. The motivation for
pseudo-random distribution (rather than sequential mapping of stripe to shard)
is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
all relations' stripes to touch pageservers in the same order.
To map a `Key` to a shard:
- Hash the `Key` field 4 (relNode).
- Divide field 6 (`blknum`) by the stripe size in pages, and combine the
hash of this with the hash from the previous step.
- The total hash modulo the shard count gives the shard holding this key (see
the sketch below).
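A minimal sketch of this mapping (the struct is abbreviated, and `DefaultHasher` is a
stand-in: a real implementation needs a hash that is stable across processes and versions):
```
use std::collections::hash_map::DefaultHasher;
use std::hash::{Hash, Hasher};

// Illustrative Key: only the fields used by the mapping are shown.
struct Key {
    field4: u32, // relNode
    field6: u32, // blknum
}

// Hash relNode, fold in the stripe index derived from blknum, then take the
// combined hash modulo the shard count.
fn key_to_shard(key: &Key, stripe_size_pages: u32, shard_count: u32) -> u32 {
    let mut hasher = DefaultHasher::new();
    key.field4.hash(&mut hasher);                       // relNode
    (key.field6 / stripe_size_pages).hash(&mut hasher); // stripe index
    (hasher.finish() % shard_count as u64) as u32
}
```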
Why don't we use the other fields in the Key?
- We ignore `forknum` for key mapping, because it distinguishes different classes of data
in the same relation, and we would like to keep the data in a relation together.
- We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
database's blocks differ only by spcNode and dbNode from the original. To enable running
this type of creation without cross-pageserver communication, we must ensure that these
blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
### Data placement examples
For example, consider two extreme cases of postgres data layout for a large database in a system with 8 shards
and a stripe size of 32k pages:
- A single large 1TiB relation: `blknum` division will break the data up into 4096
stripes of 32k pages each, which will be scattered across the shards.
- 4096 relations of 32k pages each: each relation will map to exactly one stripe,
and that stripe will be placed according to the hash of key field 4. The
data placement will be statistically uniform across shards.
Data placement will be more uneven on smaller databases:
- A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
that both relations land on the same shard and no data lands on the other shard.
- A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
the data of the other four shards.
These uneven cases for small amounts of data do not matter, as long as the stripe size
is an order of magnitude smaller than the amount of data we are comfortable holding
in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
a tenant has some shards with 256MB size and some shards with 512MB size, even though
the standard deviation of shard size within the tenant is very high. Our key mapping
scheme provides a statistical guarantee that as the tenant's overall data size increases,
uniformity of placement will improve.
### Important Types
#### `ShardIdentity`
Provides the information needed to know whether a particular key belongs
to a particular shard:
- Layout version
- Stripe size
- Shard count
- Shard index
This structure's size is constant. Note that if we had used a different key
mapping scheme such as consistent hashing with explicit hash ranges assigned
to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
key mapping scheme used here enables a small fixed size ShardIdentity.
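Illustratively, the structure and its ownership check might look like the following
sketch, which reuses the hypothetical `key_to_shard` from the key mapping section
(field names and types are guesses, not the real pageserver definition):
```
struct ShardIdentity {
    layout_version: u8,
    stripe_size_pages: u32, // stripe size, in 8kiB pages
    shard_count: u32,
    shard_number: u32,
}

impl ShardIdentity {
    // A key is local iff the key mapping resolves to this shard's number.
    fn is_key_local(&self, key: &Key) -> bool {
        key_to_shard(key, self.stripe_size_pages, self.shard_count) == self.shard_number
    }
}
```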
### Pageserver changes
#### Structural
Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
`TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
covers the whole keyspace.
When the pageserver writes layers and index_part.json to remote storage, it must
include the shard index & count in the name, to avoid collisions (the count is
necessary for future-proofing: the count will vary in time). These keys
will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
exactly the same for TenantShards as it does for Tenants today: each shard will have
its own generation number.
#### Storage Format: Keys
For tenants with >1 shard, layer files implicitly become sparse: within the key
range described in the layer name, the layer file for a shard will only hold the
content relevant to stripes assigned to the shard.
For this reason, the LayerFileName within a tenant is no longer unique: different shards
may use the same LayerFileName to refer to different data. We may solve this simply
by including the shard number in the keys used for layers.
The shard number will be included as a prefix (as part of tenant ID), like this:
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
`pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
Reasons for this particular format:
- Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
we construct a layer file name), and enables efficient listing of index_parts within
a particular shard-timeline prefix.
- Including the shard _count_ as well as shard number means that in future when we implement
shard splitting, it will be possible for a parent shard and one of its children to write
the same layer file without a name collision. For example, a parent shard 0_1 might split
into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
that is distinct from what shard 0_1 would have written at the same place.
In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
for example a single-shard tenant's prefix will be `0001`.
For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
and use this as a cue to construct paths with no prefix at all.
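A sketch of the prefix construction described above (a hypothetical helper, not actual
pageserver code):
```
// A single-shard tenant (number 0, count 1) yields "<tenant_id>-0001"; the
// special legacy identity with shard_count == 0 gets no suffix at all.
fn tenant_remote_prefix(tenant_id: &str, shard_number: u8, shard_count: u8) -> String {
    if shard_count == 0 {
        format!("pageserver/v1/tenants/{tenant_id}")
    } else {
        format!("pageserver/v1/tenants/{tenant_id}-{shard_number:02X}{shard_count:02X}")
    }
}
```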
#### Storage Format: Indices
In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
when we implement shard splitting in future, it will be useful to enable shards to reference layers
written by other shards (specifically the parent shard during a split), so that shards don't
have to exhaustively copy all data into their own shard-prefixed keys.
To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
tuple on each layer, such that it can construct paths for layers written by other shards. This
naturally raises the question of who "owns" such layers written by ancestral shards: this problem
will be addressed in phase 2.
For backward compatibility, any index entry without shard information will be assumed to be
in the legacy `ShardIdentity`.
#### WAL Ingest
In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
it down to the pages relevant to their shard:
- For ordinary user data writes, only retain a write if it matches the ShardIdentity
- For metadata describing relations etc, all shards retain these writes.
The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
expensive: if the safekeeper can be made shard-aware then it could be taught to use
the max() of all shards' remote_consistent_lsns to decide when to trim the WAL.
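A sketch of the per-shard ingest filter, with a deliberately simplified record taxonomy
and the hypothetical `is_key_local` check from the `ShardIdentity` sketch above:
```
// Simplified WAL record taxonomy for illustration only.
enum WalRecord {
    PageWrite { key: Key }, // ordinary user data write
    RelationMetadata,       // relation creation, sizes, etc.
}

fn shard_should_ingest(identity: &ShardIdentity, record: &WalRecord) -> bool {
    match record {
        // Metadata is mirrored to all shards.
        WalRecord::RelationMetadata => true,
        // User data is retained only by the shard that owns the key's stripe.
        WalRecord::PageWrite { key } => identity.is_key_local(key),
    }
}
```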
#### Compaction/GC
No changes needed.
The pageserver doesn't have to do anything special during compaction
or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
This will result in sparse layer files, containing keys only in the stripes that this
shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
the key range, these should be updated to ignore gaps that are due to sharding, to
avoid spuriously splitting up layers into stripe-sized pieces.
### Compute Endpoints
Compute endpoints will need to:
- Accept a vector of connection strings as part of their configuration from the control plane
- Route pageserver requests by mapping the hash of the key to the correct
entry in the vector of connection strings, as sketched below.
Doing this in compute rather than routing requests via a single pageserver is
necessary to enable sharding tenants without adding latency from extra hops.
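A sketch of that routing, reusing the hypothetical `key_to_shard` mapping from earlier
(the connection string vector is assumed to have one entry per shard):
```
// Pick the pageserver connection string for a page request.
fn pageserver_for_key<'a>(
    conn_strings: &'a [String], // one entry per shard, from the control plane
    key: &Key,
    stripe_size_pages: u32,
) -> &'a str {
    let shard = key_to_shard(key, stripe_size_pages, conn_strings.len() as u32);
    &conn_strings[shard as usize]
}
```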
### Control Plane
Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
tenants.
Tenant lifecycle operations like deletion will require fanning-out to all the shards
in the tenant. The same goes for timeline creation and deletion: a timeline should
not be considered created until it has been created in all shards.
#### Selectively enabling sharding for large tenants
Initially, we will explicitly enable sharding for large tenants only.
In future, this hint mechanism will become optional when we implement automatic
re-sharding of tenants.
## Future Phases
This section exists to indicate what will likely come next after this phase.
Phases 2a and 2b are amenable to execution in parallel.
### Phase 2a: WAL fan-out
**Problem**: when all shards consume the whole WAL, the network bandwidth used
for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
of the shard count.
Network bandwidth is not our most pressing bottleneck, but it is likely to become
a problem if we set a modest shard count (~8) on a significant number of tenants,
especially as those larger tenants which we shard are also likely to have higher
write bandwidth than average.
### Phase 2b: Shard Splitting
**Problem**: the number of shards in a tenant is defined at creation time and cannot
be changed. This causes excessive sharding for most small tenants, and an upper
bound on scale for very large tenants.
To address this, a _splitting_ feature will later be added. One shard can split its
data into a number of children by doing a special compaction operation to generate
image layers broken up child-shard-wise, and then writing out an `index_part.json` for
each child. This will then require external coordination (by the control plane) to
safely attach these new child shards and then move them around to distribute work.
The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
the risk/complexity of implementing such a rarely-encountered scenario.
### Phase N (future): distributed historical reads
**Problem**: while sharding based on key is good for handling changes in overall
database size, it is less suitable for spiky/unpredictable changes in the read
workload to historical layers. Sudden increases in historical reads could result
in sudden increases in local disk capacity required for a TenantShard.
Example: the extreme case of this would be to run a tenant for a year, then create branches
with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
the on-disk capacity footprint of a TenantShard, since it would be serving reads
from all those disparate historical layers.
If we can respond fast enough, then key-sharding a tenant more finely can help with
this, but splitting may be a relatively expensive operation and the increased historical
read load may be transient.
A separate mechanism for handling heavy historical reads could be something like
a gossip mechanism for pageservers to communicate
about their workload, and then a getpageatlsn offload mechanism where one pageserver can
ask another to go read the necessary layers from remote storage to serve the read. This
requires relatively little coordination because it is read-only: any node can service any
read. All reads to a particular shard would still flow through one node, but the
disk capacity & I/O impact of servicing the read would be distributed.
## FAQ/Alternatives
### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
When a database is growing under a write workload, writes may predominantly hit the
end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
is intensively re-writing a particular relation, if that relation lived in a particular
shard then it would not achieve our goal of distributing the write work across shards.
### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
database would still cause a load hotspot on the pageserver routing its read requests.
2. The additional hop through the "proxy" pageserver would add latency and overall
resource cost (CPU, network bandwidth)
### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
In this model, there would be no explicit sharding of work, but the pageserver to which
a tenant is attached would not hold all layers on its disk: instead, it would call out
to peers to have them store some layers, and call out to those peers to request reads
in those layers.
This mechanism will work well for distributing work in the LSN dimension, but in the key
space dimension it has the major limitation of requiring one node to handle all
incoming writes, and compactions. Even if the write workload for a large database
fits in one pageserver, it will still be a hotspot and such tenants may still
de-facto require their own pageserver.

View File

@@ -1,479 +0,0 @@
# Shard splitting
## Summary
This RFC describes a new pageserver API for splitting an existing tenant shard into
multiple shards, and describes how to use this API to safely increase the total
shard count of a tenant.
## Motivation
In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
tenants beyond the capacity of a single pageserver by breaking up the key space
into stripes, and distributing these stripes across many pageservers. However,
the shard count was defined once at tenant creation time and not varied thereafter.
In practice, the expected size of a database is rarely known at creation time, and
it is inefficient to enable sharding for very small tenants: we need to be
able to create a tenant with a small number of shards (such as 1), and later expand
when it becomes clear that the tenant has grown in size to a point where sharding
is beneficial.
### Prior art
Many distributed systems have the problem of choosing how many shards to create for
tenants that do not specify an expected size up-front. There are a couple of general
approaches:
- Write to a key space in order, and start a new shard when the highest key advances
past some point. This doesn't work well for Neon, because we write to our key space
in many different contiguous ranges (per relation), rather than in one contiguous
range. To adapt to this kind of model, we would need a sharding scheme where each
relation had its own range of shards, which would be inefficient for the common
case of databases with many small relations.
- Monitor the system, and automatically re-shard at some size threshold. For
example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
component monitors the size of each RADOS Pool, and adjusts the number of Placement
Groups (Ceph's shard equivalent).
## Requirements
- A configurable capacity limit per-shard is enforced.
- Changes in shard count do not interrupt service beyond requiring postgres
to reconnect (i.e. milliseconds).
- A human being does not have to choose the shard count
## Non Goals
- Shard splitting is always a tenant-global operation: we will not enable splitting
one shard while leaving others intact.
- The inverse operation (shard merging) is not described in this RFC. This is a lower
priority than splitting, because databases grow more often than they shrink, and
a database with many shards will still work properly if the stored data shrinks, just
with slightly more overhead (e.g. redundant WAL replication)
- Shard splitting is only initiated based on capacity bounds, not load. Splitting
a tenant based on load will make sense for some medium-capacity, high-load workloads,
but is more complex to reason about and likely is not desirable until we have
shard merging to reduce the shard count again if the database becomes less busy.
## Impacted Components
pageserver, storage controller
(the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
## Terminology
**Parent** shards are the shards that exist before a split. **Child** shards are
the new shards created during a split.
**Shard** is synonymous with _tenant shard_.
**Shard Index** is the 2-tuple of shard number and shard count, written in
paths as {:02x}{:02x}, e.g. `0001`.
## Background
In the implementation section, a couple of existing aspects of sharding are important
to remember:
- Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
storage paths, and remote index metadata.
- Remote layer file paths contain the shard index of the shard that created them, and
remote indices contain the same index to enable building the layer file path. A shard's
index may reference layers that were created by another shard.
- Local tenant shard directories include the shard index. All layers downloaded by
a tenant shard are stored in this shard-prefixed path, even if those layers were
initially created by another shard: tenant shards do not read and write one another's
paths.
- The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
This is for historical reasons and will be cleaned up in future, but the existing
name is used here to help comprehension when reading code.
## Implementation
Note: this section focuses on the correctness of the core split process. This will
be fairly inefficient in a naive implementation, and several important optimizations
are described in a later section.
There are broadly two parts to the implementation:
1. The pageserver split API, which splits one shard on one pageserver
2. The overall tenant split process, which is coordinated by the storage controller,
and calls into the pageserver split API as needed.
### Pageserver Split API
The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
that takes the new total shard count in the body.
The pageserver split API operates on one tenant shard, on one pageserver. External
coordination is required to use it safely; this is described in the later
'Split procedure' section.
#### Preparation
First identify the shard indices for the new child shards. These are deterministic,
calculated from the parent shard's index, and the number of children being created (this
is an input to the API, and validated to be a power of two). In a trivial example, splitting
0001 in two always results in 0002 and 0102.
Child shard indices are chosen such that the children's parts of the keyspace will
be subsets of the parent's parts of the keyspace.
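Sketched concretely (a hypothetical helper, not the real calculation):
```
// Children keep the parent's number modulo the old count, so splitting 0001
// (number 0 of count 1) in two yields 0002 and 0102, and each child owns a
// subset of the parent's stripes.
fn child_shard_indices(parent_number: u32, old_count: u32, new_count: u32) -> Vec<(u32, u32)> {
    assert!(parent_number < old_count && new_count > old_count && new_count % old_count == 0);
    (0..new_count)
        .filter(|n| n % old_count == parent_number)
        .map(|n| (n, new_count)) // (shard number, shard count)
        .collect()
}
```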
#### Step 1: write new remote indices
In remote storage, splitting is very simple: we may just write new index_part.json
objects for each child shard, containing exactly the same layers as the parent shard.
The children will have more data than they need, but this avoids any exhaustive
re-writing or copying of layer files.
The index key path includes a generation number: the parent shard's current
attached generation number will also be used for the child shards' indices. This
makes the operation safely retryable: if everything crashes and restarts, we may
call the split API again on the parent shard, and the result will be some new remote
indices for the child shards, under a higher generation number.
#### Step 2: start new `Tenant` objects
A new `Tenant` object may be instantiated for each child shard, while the parent
shard still exists. When calling the tenant_spawn function for this object,
the remote index from step 1 will be read, and the child shard will start
to ingest WAL to catch up from whatever was in the remote storage at step 1.
We now wait for child shards' WAL ingestion to catch up with the parent shard,
so that we can safely tear down the parent shard without risking an availability
gap to clients reading recent LSNs.
#### Step 3: tear down parent `Tenant` object
Once child shards are running and have caught up with WAL ingest, we no longer
need the parent shard. Note that clients may still be using it -- when we
shut it down, any page_service handlers will also shut down, causing clients
to disconnect. When the client reconnects, it will re-lookup the tenant,
and hit the child shard instead of the parent (shard lookup from page_service
should bias toward higher ShardCount shards).
Note that at this stage the page service client has not yet been notified of
any split. In the trivial single split example:
- Shard 0001 is gone: Tenant object torn down
- Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
- Clients will continue to connect to that server thinking that shard 0001 is there,
and all requests will work, because any key that was in shard 0001 is definitely
available in either shard 0002 or shard 0102.
- Eventually, the storage controller (not the pageserver) will decide to migrate
some child shards away: at that point it will do a live migration, ensuring
that the client has an updated configuration before it detaches anything
from the original server.
#### Complete
When we send a 200 response to the split request, we are promising the caller:
- That the child shards are persistent in remote storage
- That the parent shard has been shut down
This enables the caller to proceed with the overall shard split operation, which
may involve other shards on other pageservers.
### Storage Controller Split procedure
Splitting a tenant requires calling the pageserver split API, and tracking
enough state to ensure recovery + completion in the event of any component (pageserver
or storage controller) crashing (or request timing out) during the split.
1. Call the split API on all existing shards. Ensure that the resulting
child shards are pinned to their pageservers until _all_ the split calls are done.
This pinning may be implemented as a "split bit" on the tenant shards, which
blocks any migrations, and also acts as a sign that if we restart, we must go
through some recovery steps to resume the split.
2. Once all the split calls are done, we may unpin the child shards (clear
the split bit). The split is now complete: subsequent steps are just migrations,
not strictly part of the split.
3. Try to schedule new pageserver locations for the child shards, using
a soft anti-affinity constraint to place shards from the same tenant onto different
pageservers.
Updating computes about the new shard count is not necessary until we migrate
any of the child shards away from the parent's location.
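A minimal sketch of this flow; every helper here is a stub standing in for storage
controller internals (database writes, pageserver HTTP calls, the scheduler):
```
struct TenantShardId; // placeholder for the real type

fn set_split_bit(_tenant: &str, _on: bool) -> Result<(), String> { Ok(()) }
fn pageserver_shard_split(_parent: &TenantShardId, _new_count: u8) -> Result<(), String> { Ok(()) }
fn schedule_child_migrations(_tenant: &str) {}

fn split_tenant(tenant: &str, parents: &[TenantShardId], new_count: u8) -> Result<(), String> {
    // Step 1: persist the split bit before touching pageservers, so a
    // controller restart knows to roll back (or resume) the split.
    set_split_bit(tenant, true)?;
    for parent in parents {
        // Transient failures are retried; a fatal failure rolls the split back.
        pageserver_shard_split(parent, new_count)?;
    }
    // Step 2: all pageserver calls succeeded; the split itself is complete.
    set_split_bit(tenant, false)?;
    // Step 3: follow-up migrations, not strictly part of the split.
    schedule_child_migrations(tenant);
    Ok(())
}
```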
### Recovering from failures
#### Rolling back an incomplete split
An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
and detaching child shards. This will lose any WAL ingested into the children after the parents
were detached earlier, but the parents will catch up.
No special pageserver API is needed for this. From the storage controller's point of view, the
procedure is:
1. For all parent shards in the tenant, ensure they are attached
2. For all child shards, ensure they are not attached
3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
Any remote storage content for child shards is left behind. This is similar to other cases where
we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
index that references it). Future online scrub/cleanup functionality can remove these objects, or
they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
which would include any child shards that were rolled back.
If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
this, we will **block timeline creation during splitting**, so that we can safely roll back at any point until
the split is complete, without risking losing timelines.
Rolling back an incomplete split will happen automatically if a split fails due to some fatal
reason, and will not be accessible via an API:
- A pageserver fails to complete its split API request after too many retries
- A pageserver returns a fatal unexpected error such as 400 or 500
- The storage controller database returns a non-retryable error
- Some internal invariant is violated in the storage controller split code
#### Rolling back a complete split
A complete shard split may be rolled back similarly to an incomplete split, with the following
modifications:
- The parent shards will no longer exist in the storage controller database, so these must
be re-synthesized somehow: the hard part of this is figuring out the parent shards' generations. This
may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
shards in the storage controller database.
- Any timelines that were created after the split complete will disappear when rolling back
to the tenant shards. For this reason, rolling back after a complete split should only
be done due to serious issues where loss of recently created timelines is acceptable, or
in cases where we have confirmed that no timelines were created in the intervening period.
- Parent shards' layers must not have been deleted: this property will come "for free" when
we first roll out sharding, by simply not implementing deletion of parent layers after
a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
Optimizations section), it should apply a TTL to layers such that we have a
defined walltime window in which rollback will be possible.
The storage controller will expose an API for rolling back a complete split, for use
in the field if we encounter some critical bug with a post-split tenant.
#### Retrying API calls during Pageserver Restart
When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
child shards from an ongoing split. This does not intrinsically break anything, and the
pageserver may include all these shards in its `/re-attach` request to the storage controller.
In order to support such restarts, it is important that the storage controller stores
persistent records of each child shard before it calls into a pageserver, as these child shards
may require generation increments via a `/re-attach` request.
The pageserver restart will also result in a failed API call from the storage controller's point
of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
complete, and all shards must remain pinned to their current pageserver locations until the
split is done.
The pageserver API calls during splitting will retry on transient errors, so that
short availability gaps do not result in a failure of the overall operation. The
split in progress will be automatically rolled back if the threshold for API
retries is reached (e.g. if a pageserver stays offline for longer than a typical
restart).
#### Rollback on Storage Controller Restart
On startup, the storage controller will inspect the split bit for tenant shards that
it loads from the database. If any splits are in progress:
- Database content will be reverted to the parent shards
- Child shards will be dropped from memory
- The parent and child shards will be included in the general startup reconciliation that
the storage controller does: any child shards will be detached from pageservers because
they don't exist in the storage controller's expected set of shards, and parent shards
will be attached if they aren't already.
#### Storage controller API request failures/retries
The split request handler will implement idempotency: if the `Tenant` requested to split
doesn't exist, we will check for the would-be child shards, and if they already exist,
we consider the request complete.
If a request is retried while the original request is still underway, then the split
request handler will notice an InProgress marker in TenantManager, and return 503
to encourage the client to backoff/retry. This is the same as the general pageserver
API handling for calls that try to act on an InProgress shard.
#### Compute start/restart during a split
If a compute starts up during split, it will be configured with the old sharding
configuration. This will work for reads irrespective of the progress of the split
as long as no child shards have been migrated away from their original location, and
this is guaranteed in the split procedure (see earlier section).
#### Pageserver fails permanently during a split
If a pageserver permanently fails (i.e. the storage controller availability state for it
goes to Offline) while a split is in progress, the splitting operation will roll back, and
during the roll back it will skip any API calls to the offline pageserver. If the offline
pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
### Handling secondary locations
For correctness, it is not necessary to split secondary locations. We can simply detach
the secondary locations for parent shards, and then attach new secondary locations
for child shards.
Clearly this is not optimal, as it will result in re-downloads of layer files that
were already present on disk. See "Splitting secondary locations"
### Conditions to trigger a split
The pageserver will expose a new API for reporting on shards that are candidates
for split: this will return a top-N report of the largest tenant shards by
physical size (remote size). This should exclude any tenants that are already
at the maximum configured shard count.
The API would look something like:
`/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
A split operation will be started when the tenant exceeds some threshold. This threshold
should be _less than_ how large we actually want shards to be, perhaps much less. That's to
minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
tenant size distribution may be useful here: if we can make a statement like "usually, if
a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
make our policy to split a tenant at 20GiB.
The finest split we can do is by factors of two, but we can do higher-cardinality splits
too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
as it grows. An example of a very simple heuristic for early deployment of the splitting
feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
split a tenant, it will not need re-splitting soon after.
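As an illustration, that simple heuristic could be encoded as follows (the thresholds are
the example values above, not settled policy):
```
// Return the target shard count if a tenant should be split, else None.
fn split_target(shard_count: u8, physical_size_bytes: u64) -> Option<u8> {
    const THRESHOLD: u64 = 64 << 30; // 64 GiB
    const TARGET_SHARDS: u8 = 8;
    (shard_count == 1 && physical_size_bytes > THRESHOLD).then_some(TARGET_SHARDS)
}
```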
## Optimizations
### Flush parent shard to remote storage during split
Any data that is in WAL but not remote storage at time of split will need
to be replayed by child shards when they start for the first time. To minimize
this work, we may flush the parent shard to remote storage before writing the
remote indices for child shards.
It is important that this flush is subject to some time bounds: we may be splitting
in response to a surge of write ingest, so it may be time-critical to split. A
few seconds to flush latest data should be sufficient to optimize common cases without
running the risk of holding up a split for a harmful length of time when a parent
shard is being written heavily. If the flush doesn't complete in time, we may proceed
to shut down the parent shard and carry on with the split.
### Hard linking parent layers into child shard directories
Before we start the Tenant objects for child shards, we may pre-populate their
local storage directories with hard links to the layer files already present
in the parent shard's local directory. When the child shard starts and downloads
its remote index, it will find all those layer files already present on local disk.
This avoids wasting download capacity and makes splitting faster, but more importantly
it avoids taking up a factor of N more disk space when splitting 1 shard into N.
This mechanism will work well in typical flows where shards are migrated away
promptly after a split, but for the general case including what happens when
layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
section below.
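A sketch of the hard-linking step, assuming a flat per-shard layer directory (the real
pageserver layout and error handling are more involved):
```
use std::{fs, io, path::Path};

fn link_parent_layers(parent_dir: &Path, child_dir: &Path) -> io::Result<()> {
    fs::create_dir_all(child_dir)?;
    for entry in fs::read_dir(parent_dir)? {
        let entry = entry?;
        if entry.file_type()?.is_file() {
            // A hard link shares the inode: no extra disk space is consumed
            // until the child's copy is evicted and re-downloaded.
            fs::hard_link(entry.path(), child_dir.join(entry.file_name()))?;
        }
    }
    Ok(())
}
```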
### Filtering during compaction
Compaction, especially image layer generation, should skip any keys that are
present in a shard's layer files, but do not match the shard's ShardIdentity's
is_key_local() check. This avoids carrying around data for longer than necessary
in post-split compactions.
This was already implemented in https://github.com/neondatabase/neon/pull/6246
### Proactive compaction
In remote storage, there is little reason to rewrite any data on a shard split:
all the children can reference parent layers via the very cheap write of the child
index_part.json.
In local storage, things are more nuanced. During the initial split there is no
capacity cost to duplicating parent layers, if we implement the hard linking
optimization described above. However, as soon as any layers are evicted from
local disk and re-downloaded, the downloaded layers will not be hard-links any more:
they'll have a real capacity footprint. That isn't a problem if we migrate child shards
away from the parent node swiftly, but it risks a significant over-use of local disk
space if we do not.
For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
the shards elsewhere, then churned all the layers in all the shards via eviction,
then we would blow up the storage capacity used on the node by 8x. If we're splitting
a 100GB shard, that could take the pageserver to the point of exhausting disk space.
To avoid this scenario, we could implement a special compaction mode where we just
read historic layers, drop unwanted keys, and write back the layer file. This
is pretty expensive, but useful if we have split a large shard and are not going to
migrate the child shards away.
The heuristic conditions for triggering such a compaction are:
- A) eviction plus time: if a child shard
has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
- B) resident size plus time: we may inspect the resident layers and calculate how
many of them include the overhead of storing pre-split keys. If, after some time
threshold (different from the one in case A), we still have such layers occupying
local disk space, then we should proactively compact them.
### Cleaning up parent-shard layers
It is functionally harmless to leave parent shard layers in remote storage indefinitely.
They would be cleaned up in the event of the tenant's deletion.
As an optimization to avoid leaking remote storage capacity (which costs money), we may
lazily clean up parent shard layers once no child shards reference them.
This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
- list all the key prefixes beginning with the tenant ID, and select those shard prefixes
which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_)
- If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
may drop out now.
- find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
- for all ancestral shards, list objects in the prefix and delete any layer which was not
referenced by a current shard.
If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
The cleanup may be done by the scrubber (external process), or we may choose to have
the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
reading the other shards' indices at runtime, and we do not require visibility of the
latest index writes.
Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
that we retain the option to roll back a split in case of bugs.
### Splitting secondary locations
We may implement a pageserver API similar to the main splitting API, which does a simpler
operation for secondary locations: it would not write anything to S3; instead it would simply
create the child shard directory on local disk, hard link in the layer files from the parent,
and set up the in memory (TenantSlot) state for the children.
Similar to attached locations, a subset of secondary locations will probably need re-locating
after the split is complete, to avoid leaving multiple child shards on the same pageservers,
where they may use excessive space for the tenant.
## FAQ/Alternatives
### What should the thresholds be set to?
Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
Max shard count:
- The safekeeper overhead to sharding is currently O(N) network bandwidth because
the un-filtered WAL is sent to all shards. To avoid this growing out of control,
a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
on the safekeeper.
- there is also little benefit to increasing the shard count beyond the number
of pageservers in a region.
### Is it worth just rewriting all the data during a split to simplify reasoning about space?

View File

@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
Neon storage broker, providing messaging between safekeepers and pageservers.
[storage_broker.md](./storage_broker.md)
`storage_controller`:
Neon storage controller, manages a cluster of pageservers and exposes an API that enables
managing a many-sharded tenant as a single entity.
`/control_plane`:
Local control plane.

View File

@@ -1,150 +0,0 @@
# Storage Controller
## Concepts
The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
the underlying details of how data is spread across multiple nodes.
The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
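A highly simplified sketch of that pattern (the controller's real intent and observed state types are much richer than this):

```
/// Illustrative placement state for one tenant shard.
#[derive(Clone, PartialEq, Debug)]
struct Placement {
    attached_node: Option<u64>,
    secondary_nodes: Vec<u64>,
}

struct TenantShard {
    intent: Placement,
    observed: Placement,
}

impl TenantShard {
    /// One pass of the reconcile task: issue whatever pageserver API calls
    /// are needed so that the observed state converges on the intent.
    fn reconcile(&mut self) {
        if self.observed == self.intent {
            return; // Already converged: nothing to do.
        }
        // The real controller makes location_conf calls to pageservers here;
        // this sketch just pretends they all succeeded.
        self.observed = self.intent.clone();
    }
}
```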
## APIs
The storage controller's HTTP server implements four logically separate APIs:
- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that's where clients expect to find it on a pageserver.
- `/control/v1/...` path is the storage controller's API, which enables operations such as registering and managing pageservers, or executing shard splits.
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
to ensure data safety with generation numbers.
The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as the pageservers' APIs).
See the `http.rs` file in the source for where the HTTP APIs are implemented.
## Database
The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
rebuilt on startup.
The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
The `diesel` crate is used for defining models & migrations.
Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller's database.
### Diesel tip: migrations
If you need to modify the database schema, here's how to create a migration:
- Install the diesel CLI with `cargo install diesel_cli`
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
- This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you've committed a migration no further steps are needed.
## storcon_cli
The `storcon_cli` tool enables interactive management of the storage controller. This is usually
only necessary for debugging, but may also be used to manage nodes (e.g. marking a node as offline).
`storcon_cli --help` includes details on commands.
# Deploying
This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
part of a self-hosted system.
_General note: since the default `neon_local` environment includes a storage controller, this is a useful
reference when figuring out deployment._
## Database
It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
Set the URL to the database using the `--database-url` CLI option.
There is no need to run migrations manually: the storage controller automatically applies migrations
when it starts up.
## Configure pageservers to use the storage controller
1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
with the storage controller when it starts up. See the example below for the format of this file.
### Example `metadata.json`
```
{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
```
- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
postgres runs.
- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where
the storage controller runs.
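Assuming `serde`/`serde_json` are available, the file deserializes into a struct like the following; the struct name is hypothetical, and the field names are taken from the example above:

```
use serde::Deserialize;

/// Hypothetical mirror of `metadata.json`.
#[derive(Deserialize, Debug)]
struct PageserverMetadata {
    host: String,      // postgres host
    port: u16,         // postgres port
    http_host: String, // HTTP API host
    http_port: u16,    // HTTP API port
}

fn parse_metadata(raw: &str) -> Result<PageserverMetadata, serde_json::Error> {
    serde_json::from_str(raw)
}
```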
## Handle compute notifications
The storage controller independently moves tenant attachments between pageservers in response to
changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
location changes.
The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
the compute hook.
When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
```
struct ComputeHookNotifyRequestShard {
node_id: NodeId,
shard_number: ShardNumber,
}
struct ComputeHookNotifyRequest {
tenant_id: TenantId,
stripe_size: Option<ShardStripeSize>,
shards: Vec<ComputeHookNotifyRequestShard>,
}
```
When a notification is received:
1. Modify postgres configuration for this tenant:
- set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers, according to the `shards` list. The
`NodeId` identifying each shard must be converted to the address+port of that node.
- if stripe_size is not None, set `neon.stripe_size` to this value
2. Send SIGHUP to postgres to reload configuration
3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
will retry the notification until it succeeds.
### Example notification body
```
{
"tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
"stripe_size": 32768,
"shards": [
{"node_id": 344, "shard_number": 0},
{"node_id": 722, "shard_number": 1},
],
}
```
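A sketch of the translation a hook implementation performs, assuming a lookup table from `NodeId` to the node's postgres address; the connection-string format shown is illustrative, and the HTTP and SIGHUP plumbing is elided:

```
use std::collections::HashMap;

/// Simplified mirrors of the request structures above.
struct NotifyShard {
    node_id: u64,
    shard_number: u8,
}

struct NotifyRequest {
    stripe_size: Option<u32>,
    shards: Vec<NotifyShard>,
}

/// Build the `neon.pageserver_connstr` value: one connection string per
/// shard, ordered by shard number, comma-separated. Returns None if a node
/// is unknown, in which case the handler should return an error so that
/// the controller retries the notification.
fn pageserver_connstr(
    req: &NotifyRequest,
    node_addrs: &HashMap<u64, String>, // NodeId -> "host:port" (postgres)
) -> Option<String> {
    let mut shards: Vec<&NotifyShard> = req.shards.iter().collect();
    shards.sort_by_key(|s| s.shard_number);
    let mut parts = Vec::with_capacity(shards.len());
    for shard in shards {
        let addr = node_addrs.get(&shard.node_id)?;
        parts.push(format!("postgresql://{addr}"));
    }
    Some(parts.join(","))
}
```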

View File

@@ -10,13 +10,11 @@ libc.workspace = true
once_cell.workspace = true
chrono.workspace = true
twox-hash.workspace = true
measured.workspace = true
workspace_hack.workspace = true
[target.'cfg(target_os = "linux")'.dependencies]
procfs.workspace = true
measured-process.workspace = true
[dev-dependencies]
rand = "0.8"

View File

@@ -7,19 +7,14 @@
//! use significantly less memory than this, but can only approximate the cardinality.
use std::{
hash::{BuildHasher, BuildHasherDefault, Hash},
sync::atomic::AtomicU8,
collections::HashMap,
hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
sync::{atomic::AtomicU8, Arc, RwLock},
};
use measured::{
label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
metric::{
group::{Encoding, MetricValue},
name::MetricNameEncoder,
Metric, MetricType, MetricVec,
},
text::TextEncoder,
LabelGroup,
use prometheus::{
core::{self, Describer},
proto, Opts,
};
use twox_hash::xxh3;
@@ -45,7 +40,7 @@ macro_rules! register_hll {
}};
($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
$crate::register_hll!($N, $crate::opts!($NAME, $HELP))
$crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
}};
}
@@ -98,25 +93,203 @@ macro_rules! register_hll {
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
pub struct HyperLogLogState<const N: usize> {
shards: [AtomicU8; N],
#[derive(Clone)]
pub struct HyperLogLogVec<const N: usize> {
core: Arc<HyperLogLogVecCore<N>>,
}
impl<const N: usize> Default for HyperLogLogState<N> {
fn default() -> Self {
#[allow(clippy::declare_interior_mutable_const)]
const ZERO: AtomicU8 = AtomicU8::new(0);
Self { shards: [ZERO; N] }
struct HyperLogLogVecCore<const N: usize> {
pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
pub desc: core::Desc,
pub opts: Opts,
}
impl<const N: usize> core::Collector for HyperLogLogVec<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
for child in self.core.children.read().unwrap().values() {
child.core.collect_into(&mut metrics);
}
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> MetricType for HyperLogLogState<N> {
type Metadata = ();
impl<const N: usize> HyperLogLogVec<N> {
/// Create a new [`HyperLogLogVec`] based on the provided
/// [`Opts`] and partitioned by the given label names. At least one label name must be
/// provided.
pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
let opts = opts.variable_labels(variable_names);
let desc = opts.describe()?;
let v = HyperLogLogVecCore {
children: RwLock::new(HashMap::default()),
desc,
opts,
};
Ok(Self { core: Arc::new(v) })
}
/// `get_metric_with_label_values` returns the [`HyperLogLog<P>`] for the given slice
/// of label values (same order as the VariableLabels in Desc). If that combination of
/// label values is accessed for the first time, a new [`HyperLogLog<P>`] is created.
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
self.core.get_metric_with_label_values(vals)
}
/// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
/// occurs.
pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
self.get_metric_with_label_values(vals).unwrap()
}
}
impl<const N: usize> HyperLogLogState<N> {
impl<const N: usize> HyperLogLogVecCore<N> {
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let h = self.hash_label_values(vals)?;
if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
return Ok(metric);
}
self.get_or_create_metric(h, vals)
}
pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
if vals.len() != self.desc.variable_labels.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: self.desc.variable_labels.len(),
got: vals.len(),
});
}
let mut h = xxh3::Hash64::default();
for val in vals {
h.write(val.as_bytes());
}
Ok(h.finish())
}
fn get_or_create_metric(
&self,
hash: u64,
label_values: &[&str],
) -> prometheus::Result<HyperLogLog<N>> {
let mut children = self.children.write().unwrap();
// Check existence first.
if let Some(metric) = children.get(&hash).cloned() {
return Ok(metric);
}
let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
children.insert(hash, metric.clone());
Ok(metric)
}
}
/// HLL is a probabilistic cardinality measure.
///
/// How to use this time-series for a metric name `my_metrics_total_hll`:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// If you want an estimate over time, you can use the following query:
///
/// ```promql
/// # harmonic mean
/// 1 / (
/// sum (
/// 2 ^ -(
/// # HLL merge operation
/// max (
/// max_over_time(my_metrics_total_hll{}[$__rate_interval])
/// ) by (hll_shard, other_labels...)
/// )
/// ) without (hll_shard)
/// )
/// * alpha
/// * shards_count
/// * shards_count
/// ```
///
/// In the case of low cardinality, you might want to use the linear counting approximation:
///
/// ```promql
/// # LinearCounting(m, V) = m log (m / V)
/// shards_count * ln(shards_count /
/// # calculate V = how many shards contain a 0
/// count(max (proxy_connecting_endpoints{}) by (hll_shard, protocol) == 0) without (hll_shard)
/// )
/// ```
///
/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
#[derive(Clone)]
pub struct HyperLogLog<const N: usize> {
core: Arc<HyperLogLogCore<N>>,
}
impl<const N: usize> HyperLogLog<N> {
/// Create a [`HyperLogLog`] with the `name` and `help` arguments.
pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
assert!(N.is_power_of_two());
let opts = Opts::new(name, help);
Self::with_opts(opts)
}
/// Create a [`HyperLogLog`] with the `opts` options.
pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
Self::with_opts_and_label_values(&opts, &[])
}
fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
let desc = opts.describe()?;
let labels = make_label_pairs(&desc, label_values)?;
let v = HyperLogLogCore {
shards: [0; N].map(AtomicU8::new),
desc,
labels,
};
Ok(Self { core: Arc::new(v) })
}
pub fn measure(&self, item: &impl Hash) {
// changing the hasher will break compatibility with previous measurements.
self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
@@ -126,11 +299,42 @@ impl<const N: usize> HyperLogLogState<N> {
let p = N.ilog2() as u8;
let j = hash & (N as u64 - 1);
let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
}
}
struct HyperLogLogCore<const N: usize> {
shards: [AtomicU8; N],
desc: core::Desc,
labels: Vec<proto::LabelPair>,
}
impl<const N: usize> core::Collector for HyperLogLog<N> {
fn desc(&self) -> Vec<&core::Desc> {
vec![&self.core.desc]
}
fn take_sample(&self) -> [u8; N] {
self.shards.each_ref().map(|x| {
fn collect(&self) -> Vec<proto::MetricFamily> {
let mut m = proto::MetricFamily::default();
m.set_name(self.core.desc.fq_name.clone());
m.set_help(self.core.desc.help.clone());
m.set_field_type(proto::MetricType::GAUGE);
let mut metrics = Vec::new();
self.core.collect_into(&mut metrics);
m.set_metric(metrics);
vec![m]
}
}
impl<const N: usize> HyperLogLogCore<N> {
fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
self.shards.iter().enumerate().for_each(|(i, x)| {
let mut shard_label = proto::LabelPair::default();
shard_label.set_name("hll_shard".to_owned());
shard_label.set_value(format!("{i}"));
// We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
// This seems like it would be a race condition,
@@ -140,90 +344,85 @@ impl<const N: usize> HyperLogLogState<N> {
// TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
// this would mean that a dev port-forwarding the metrics url won't break the sampling.
x.swap(0, std::sync::atomic::Ordering::Relaxed)
let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
let mut m = proto::Metric::default();
let mut c = proto::Gauge::default();
c.set_value(v as f64);
m.set_gauge(c);
let mut labels = Vec::with_capacity(self.labels.len() + 1);
labels.extend_from_slice(&self.labels);
labels.push(shard_label);
m.set_label(labels);
metrics.push(m);
})
}
}
impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
for HyperLogLogState<N>
{
fn write_type(
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
enc.write_type(&name, measured::text::MetricType::Gauge)
fn make_label_pairs(
desc: &core::Desc,
label_values: &[&str],
) -> prometheus::Result<Vec<proto::LabelPair>> {
if desc.variable_labels.len() != label_values.len() {
return Err(prometheus::Error::InconsistentCardinality {
expect: desc.variable_labels.len(),
got: label_values.len(),
});
}
fn collect_into(
&self,
_: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut TextEncoder<W>,
) -> Result<(), std::io::Error> {
struct I64(i64);
impl LabelValue for I64 {
fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
v.write_int(self.0)
}
}
struct HllShardLabel {
hll_shard: i64,
}
impl LabelGroup for HllShardLabel {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const LE: &LabelName = LabelName::from_str("hll_shard");
v.write_value(LE, &I64(self.hll_shard));
}
}
self.take_sample()
.into_iter()
.enumerate()
.try_for_each(|(hll_shard, val)| {
enc.write_metric_value(
name.by_ref(),
labels.by_ref().compose_with(HllShardLabel {
hll_shard: hll_shard as i64,
}),
MetricValue::Int(val as i64),
)
})
let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
if total_len == 0 {
return Ok(vec![]);
}
if desc.variable_labels.is_empty() {
return Ok(desc.const_label_pairs.clone());
}
let mut label_pairs = Vec::with_capacity(total_len);
for (i, n) in desc.variable_labels.iter().enumerate() {
let mut label_pair = proto::LabelPair::default();
label_pair.set_name(n.clone());
label_pair.set_value(label_values[i].to_owned());
label_pairs.push(label_pair);
}
for label_pair in &desc.const_label_pairs {
label_pairs.push(label_pair.clone());
}
label_pairs.sort();
Ok(label_pairs)
}
#[cfg(test)]
mod tests {
use std::collections::HashSet;
use measured::{label::StaticLabelSet, FixedCardinalityLabel};
use prometheus::{proto, Opts};
use rand::{rngs::StdRng, Rng, SeedableRng};
use rand_distr::{Distribution, Zipf};
use crate::HyperLogLogVec;
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "x")]
enum Label {
A,
B,
fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
let mut metrics = vec![];
hll.core
.children
.read()
.unwrap()
.values()
.for_each(|c| c.core.collect_into(&mut metrics));
metrics
}
fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
// cannot go through the `hll.collect_family_into` interface yet...
// need to see if I can fix the conflicting impls problem in measured.
(
hll.get_metric(hll.with_labels(Label::A)).take_sample(),
hll.get_metric(hll.with_labels(Label::B)).take_sample(),
)
}
fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
let mut buckets = [0.0; 32];
for &sample in samples {
for (i, m) in sample.into_iter().enumerate() {
buckets[i] = f64::max(buckets[i], m as f64);
for metric in metrics.chunks_exact(32) {
if filter(&metric[0]) {
for (i, m) in metric.iter().enumerate() {
buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
}
}
}
@@ -238,7 +437,7 @@ mod tests {
}
fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();
let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
let mut set_a = HashSet::new();
@@ -246,20 +445,18 @@ mod tests {
for x in iter.by_ref().take(n) {
set_a.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::A))
.measure(&x.to_bits());
hll.with_label_values(&["a"]).measure(&x.to_bits());
}
for x in iter.by_ref().take(n) {
set_b.insert(x.to_bits());
hll.get_metric(hll.with_labels(Label::B))
.measure(&x.to_bits());
hll.with_label_values(&["b"]).measure(&x.to_bits());
}
let merge = &set_a | &set_b;
let (a, b) = collect(&hll);
let len = get_cardinality(&[a, b]);
let len_a = get_cardinality(&[a]);
let len_b = get_cardinality(&[b]);
let metrics = collect(&hll);
let len = get_cardinality(&metrics, |_| true);
let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
}

View File

@@ -4,17 +4,6 @@
//! a default registry.
#![deny(clippy::undocumented_unsafe_blocks)]
use measured::{
label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
metric::{
counter::CounterState,
gauge::GaugeState,
group::{Encoding, MetricValue},
name::{MetricName, MetricNameEncoder},
MetricEncoding, MetricFamilyEncoding,
},
FixedCardinalityLabel, LabelGroup, MetricGroup,
};
use once_cell::sync::Lazy;
use prometheus::core::{
Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -22,7 +11,6 @@ use prometheus::core::{
pub use prometheus::opts;
pub use prometheus::register;
pub use prometheus::Error;
use prometheus::Registry;
pub use prometheus::{core, default_registry, proto};
pub use prometheus::{exponential_buckets, linear_buckets};
pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -35,12 +23,13 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
pub use prometheus::{register_int_gauge, IntGauge};
pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
pub use prometheus::{Encoder, TextEncoder};
use prometheus::{Registry, Result};
pub mod launch_timestamp;
mod wrappers;
pub use wrappers::{CountedReader, CountedWriter};
mod hll;
pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
pub use hll::{HyperLogLog, HyperLogLogVec};
#[cfg(target_os = "linux")]
pub mod more_process_metrics;
@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
/// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
/// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
/// while holding the lock.
pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
INTERNAL_REGISTRY.register(c)
}
@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
];
pub struct BuildInfo {
pub revision: &'static str,
pub build_tag: &'static str,
}
// todo: allow label group without the set
impl LabelGroup for BuildInfo {
fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
const REVISION: &LabelName = LabelName::from_str("revision");
v.write_value(REVISION, &self.revision);
const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
v.write_value(BUILD_TAG, &self.build_tag);
}
}
impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
where
GaugeState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
enc.write_help(&name, "Build/version information")?;
GaugeState::write_type(&name, enc)?;
GaugeState {
count: std::sync::atomic::AtomicI64::new(1),
}
.collect_into(&(), self, name, enc)
}
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct NeonMetrics {
#[cfg(target_os = "linux")]
#[metric(namespace = "process")]
#[metric(init = measured_process::ProcessCollector::for_self())]
process: measured_process::ProcessCollector,
#[metric(namespace = "libmetrics")]
#[metric(init = LibMetrics::new(build_info))]
libmetrics: LibMetrics,
}
#[derive(MetricGroup)]
#[metric(new(build_info: BuildInfo))]
pub struct LibMetrics {
#[metric(init = build_info)]
build_info: BuildInfo,
#[metric(flatten)]
rusage: Rusage,
serve_count: CollectionCounter,
}
fn write_gauge<Enc: Encoding>(
x: i64,
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Enc,
) -> Result<(), Enc::Err> {
enc.write_metric_value(name, labels, MetricValue::Int(x))
}
#[derive(Default)]
struct Rusage;
#[derive(FixedCardinalityLabel, Clone, Copy)]
#[label(singleton = "io_operation")]
enum IoOp {
Read,
Write,
}
impl<T: Encoding> MetricGroup<T> for Rusage
where
GaugeState: MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
let ru = get_rusage_stats();
enc.write_help(
DISK_IO,
"Bytes written and read from disk, grouped by the operation (read|write)",
)?;
GaugeState::write_type(DISK_IO, enc)?;
write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
GaugeState::write_type(MAXRSS, enc)?;
write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
Ok(())
}
}
#[derive(Default)]
struct CollectionCounter(CounterState);
impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
where
CounterState: MetricEncoding<T>,
{
fn collect_family_into(
&self,
name: impl measured::metric::name::MetricNameEncoder,
enc: &mut T,
) -> Result<(), T::Err> {
self.0.inc();
enc.write_help(&name, "Number of metric requests made")?;
self.0.collect_into(&(), NoLabels, name, enc)
}
}
pub fn set_build_info_metric(revision: &str, build_tag: &str) {
let metric = register_int_gauge_vec!(
"libmetrics_build_info",
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
.expect("Failed to register build info metric");
metric.with_label_values(&[revision, build_tag]).set(1);
}
const BYTES_IN_BLOCK: i64 = 512;
// Records I/O stats in a "cross-platform" way.
// Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -250,6 +117,7 @@ const BYTES_IN_BLOCK: i64 = 512;
fn update_rusage_metrics() {
let rusage_stats = get_rusage_stats();
const BYTES_IN_BLOCK: i64 = 512;
DISK_IO_BYTES
.with_label_values(&["read"])
.set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
@@ -283,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
}
}};
}
/// Create an [`IntCounterPair`] and registers to default registry.
#[macro_export(local_inner_macros)]
macro_rules! register_int_counter_pair {
@@ -321,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
///
/// An error is returned if the number of label values is not the same as the
/// number of VariableLabels in Desc.
pub fn get_metric_with_label_values(
&self,
vals: &[&str],
) -> prometheus::Result<GenericCounterPair<P>> {
pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
Ok(GenericCounterPair {
inc: self.inc.get_metric_with_label_values(vals)?,
dec: self.dec.get_metric_with_label_values(vals)?,
@@ -337,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
self.get_metric_with_label_values(vals).unwrap()
}
pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
res[0] = self.inc.remove_label_values(vals);
res[1] = self.dec.remove_label_values(vals);
}
@@ -421,171 +285,3 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
/// A guard for [`IntCounterPair`] that will decrement the gauge on drop
pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
pub trait CounterPairAssoc {
const INC_NAME: &'static MetricName;
const DEC_NAME: &'static MetricName;
const INC_HELP: &'static str;
const DEC_HELP: &'static str;
type LabelGroupSet: LabelGroupSet;
}
pub struct CounterPairVec<A: CounterPairAssoc> {
vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Default for CounterPairVec<A>
where
A::LabelGroupSet: Default,
{
fn default() -> Self {
Self {
vec: Default::default(),
}
}
}
impl<A: CounterPairAssoc> CounterPairVec<A> {
pub fn guard(
&self,
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
) -> MeasuredCounterPairGuard<'_, A> {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).inc.inc();
MeasuredCounterPairGuard { vec: &self.vec, id }
}
pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).inc.inc();
}
pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
let id = self.vec.with_labels(labels);
self.vec.get_metric(id).dec.inc();
}
pub fn remove_metric(
&self,
labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
) -> Option<MeasuredCounterPairState> {
let id = self.vec.with_labels(labels);
self.vec.remove_metric(id)
}
}
impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
where
T: ::measured::metric::group::Encoding,
A: CounterPairAssoc,
::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
{
fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
// write decrement first to avoid a race condition where inc - dec < 0
T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
self.vec
.collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
self.vec
.collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
Ok(())
}
}
#[derive(MetricGroup, Default)]
pub struct MeasuredCounterPairState {
pub inc: CounterState,
pub dec: CounterState,
}
impl measured::metric::MetricType for MeasuredCounterPairState {
type Metadata = ();
}
pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
id: measured::metric::LabelId<A::LabelGroupSet>,
}
impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
fn drop(&mut self) {
self.vec.get_metric(self.id).dec.inc();
}
}
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
struct Inc<T>(T);
/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
struct Dec<T>(T);
impl<T: Encoding> Encoding for Inc<T> {
type Err = T::Err;
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Inc<T>,
) -> Result<(), T::Err> {
self.inc.collect_into(metadata, labels, name, &mut enc.0)
}
}
impl<T: Encoding> Encoding for Dec<T> {
type Err = T::Err;
fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
self.0.write_help(name, help)
}
fn write_metric_value(
&mut self,
name: impl MetricNameEncoder,
labels: impl LabelGroup,
value: MetricValue,
) -> Result<(), Self::Err> {
self.0.write_metric_value(name, labels, value)
}
}
/// Write the dec counter to the encoder
impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
where
CounterState: MetricEncoding<T>,
{
fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
CounterState::write_type(name, &mut enc.0)
}
fn collect_into(
&self,
metadata: &(),
labels: impl LabelGroup,
name: impl MetricNameEncoder,
enc: &mut Dec<T>,
) -> Result<(), T::Err> {
self.dec.collect_into(metadata, labels, name, &mut enc.0)
}
}

View File

@@ -2,14 +2,11 @@ use std::str::FromStr;
/// Request/response types for the storage controller
/// API (`/control/v1` prefix). Implemented by the server
/// in [`storage_controller::http`]
/// in [`attachment_service::http`]
use serde::{Deserialize, Serialize};
use utils::id::{NodeId, TenantId};
use utils::id::NodeId;
use crate::{
models::{ShardParameters, TenantConfig},
shard::{ShardStripeSize, TenantShardId},
};
use crate::{models::ShardParameters, shard::TenantShardId};
#[derive(Serialize, Deserialize)]
pub struct TenantCreateResponseShard {
@@ -38,16 +35,10 @@ pub struct NodeRegisterRequest {
pub struct NodeConfigureRequest {
pub node_id: NodeId,
pub availability: Option<NodeAvailabilityWrapper>,
pub availability: Option<NodeAvailability>,
pub scheduling: Option<NodeSchedulingPolicy>,
}
#[derive(Serialize, Deserialize)]
pub struct TenantPolicyRequest {
pub placement: Option<PlacementPolicy>,
pub scheduling: Option<ShardSchedulingPolicy>,
}
#[derive(Serialize, Deserialize, Debug)]
pub struct TenantLocateResponseShard {
pub shard_id: TenantShardId,
@@ -66,48 +57,6 @@ pub struct TenantLocateResponse {
pub shard_params: ShardParameters,
}
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponse {
pub tenant_id: TenantId,
pub shards: Vec<TenantDescribeResponseShard>,
pub stripe_size: ShardStripeSize,
pub policy: PlacementPolicy,
pub config: TenantConfig,
}
#[derive(Serialize, Deserialize)]
pub struct NodeDescribeResponse {
pub id: NodeId,
pub availability: NodeAvailabilityWrapper,
pub scheduling: NodeSchedulingPolicy,
pub listen_http_addr: String,
pub listen_http_port: u16,
pub listen_pg_addr: String,
pub listen_pg_port: u16,
}
#[derive(Serialize, Deserialize)]
pub struct TenantDescribeResponseShard {
pub tenant_shard_id: TenantShardId,
pub node_attached: Option<NodeId>,
pub node_secondary: Vec<NodeId>,
pub last_error: String,
/// A task is currently running to reconcile this tenant's intent state with the state on pageservers
pub is_reconciling: bool,
/// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
pub is_pending_compute_notification: bool,
/// A shard split is currently underway
pub is_splitting: bool,
pub scheduling_policy: ShardSchedulingPolicy,
}
/// Explicitly migrating a particular shard is a low level operation
/// TODO: higher level "Reschedule tenant" operation where the request
/// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -117,94 +66,29 @@ pub struct TenantShardMigrateRequest {
pub node_id: NodeId,
}
/// Utilisation score indicating how good a candidate a pageserver
/// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
/// Lower values are better.
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
pub struct UtilizationScore(pub u64);
impl UtilizationScore {
pub fn worst() -> Self {
UtilizationScore(u64::MAX)
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
#[serde(into = "NodeAvailabilityWrapper")]
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeAvailability {
// Normal, happy state
Active(UtilizationScore),
Active,
// Offline: Tenants shouldn't try to attach here, but they may assume that their
// secondary locations on this node still exist. Newly added nodes are in this
// state until we successfully contact them.
Offline,
}
impl PartialEq for NodeAvailability {
fn eq(&self, other: &Self) -> bool {
use NodeAvailability::*;
matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
}
}
impl FromStr for NodeAvailability {
type Err = anyhow::Error;
impl Eq for NodeAvailability {}
// This wrapper provides serde functionality and it should only be used to
// communicate with external callers which don't know or care about the
// utilisation score of the pageserver it is targeting.
#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
pub enum NodeAvailabilityWrapper {
Active,
Offline,
}
impl From<NodeAvailabilityWrapper> for NodeAvailability {
fn from(val: NodeAvailabilityWrapper) -> Self {
match val {
// Assume the worst utilisation score to begin with. It will later be updated by
// the heartbeats.
NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
fn from_str(s: &str) -> Result<Self, Self::Err> {
match s {
"active" => Ok(Self::Active),
"offline" => Ok(Self::Offline),
_ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
}
}
}
impl From<NodeAvailability> for NodeAvailabilityWrapper {
fn from(val: NodeAvailability) -> Self {
match val {
NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
}
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
pub enum ShardSchedulingPolicy {
// Normal mode: the tenant's scheduled locations may be updated at will, including
// for non-essential optimization.
Active,
// Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
// For example, this still permits a node's attachment location to change to a secondary in
// response to a node failure, or to assign a new secondary if a node was removed.
Essential,
// No scheduling: leave the shard running wherever it currently is. Even if the shard is
// unavailable, it will not be rescheduled to another node.
Pause,
// No reconciling: we will make no location_conf API calls to pageservers at all. If the
// shard is unavailable, it stays that way. If a node fails, this shard doesn't get failed over.
Stop,
}
impl Default for ShardSchedulingPolicy {
fn default() -> Self {
Self::Active
}
}
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
pub enum NodeSchedulingPolicy {
Active,
Filling,
@@ -243,8 +127,11 @@ impl From<NodeSchedulingPolicy> for String {
/// to create secondary locations.
#[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
pub enum PlacementPolicy {
/// Normal live state: one attached pageserver and zero or more secondaries.
Attached(usize),
/// Cheapest way to attach a tenant: just one pageserver, no secondary
Single,
/// Production-ready way to attach a tenant: one attached pageserver and
/// some number of secondaries.
Double(usize),
/// Create one secondary mode location. This is useful when onboarding
/// a tenant, or for an idle tenant that we might want to bring online quickly.
Secondary,
@@ -266,14 +153,14 @@ mod test {
/// Check stability of PlacementPolicy's serialization
#[test]
fn placement_policy_encoding() -> anyhow::Result<()> {
let v = PlacementPolicy::Attached(1);
let v = PlacementPolicy::Double(1);
let encoded = serde_json::to_string(&v)?;
assert_eq!(encoded, "{\"Attached\":1}");
assert_eq!(encoded, "{\"Double\":1}");
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
let v = PlacementPolicy::Detached;
let v = PlacementPolicy::Single;
let encoded = serde_json::to_string(&v)?;
assert_eq!(encoded, "\"Detached\"");
assert_eq!(encoded, "\"Single\"");
assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
Ok(())
}

View File

@@ -4,7 +4,6 @@ pub mod utilization;
pub use utilization::PageserverUtilization;
use std::{
borrow::Cow,
collections::HashMap,
io::{BufRead, Read},
num::{NonZeroU64, NonZeroUsize},
@@ -20,7 +19,6 @@ use utils::{
history_buffer::HistoryBufferWithDropCounter,
id::{NodeId, TenantId, TimelineId},
lsn::Lsn,
serde_system_time,
};
use crate::controller_api::PlacementPolicy;
@@ -296,13 +294,13 @@ pub struct TenantConfig {
pub lagging_wal_timeout: Option<String>,
pub max_lsn_wal_lag: Option<NonZeroU64>,
pub trace_read_requests: Option<bool>,
pub image_layer_compression: Option<CompressionAlgorithm>,
pub eviction_policy: Option<EvictionPolicy>,
pub min_resident_size_override: Option<u64>,
pub evictions_low_residence_duration_metric_threshold: Option<String>,
pub heatmap_period: Option<String>,
pub lazy_slru_download: Option<bool>,
pub timeline_get_throttle: Option<ThrottleConfig>,
pub image_layer_creation_check_threshold: Option<u8>,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -330,6 +328,23 @@ pub enum CompactionAlgorithm {
Tiered,
}
#[derive(
Debug,
Clone,
Copy,
PartialEq,
Eq,
Serialize,
Deserialize,
strum_macros::FromRepr,
enum_map::Enum,
)]
#[repr(u8)]
pub enum CompressionAlgorithm {
NoCompression,
LZ4,
}
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub struct EvictionPolicyLayerAccessThreshold {
#[serde(with = "humantime_serde")]
@@ -580,7 +595,7 @@ pub struct TimelineInfo {
pub walreceiver_status: String,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize)]
pub struct LayerMapInfo {
pub in_memory_layers: Vec<InMemoryLayerInfo>,
pub historic_layers: Vec<HistoricLayerInfo>,
@@ -598,7 +613,7 @@ pub enum LayerAccessKind {
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct LayerAccessStatFullDetails {
pub when_millis_since_epoch: u64,
pub task_kind: Cow<'static, str>,
pub task_kind: &'static str,
pub access_kind: LayerAccessKind,
}
@@ -657,23 +672,23 @@ impl LayerResidenceEvent {
}
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize)]
pub struct LayerAccessStats {
pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
pub task_kind_access_flag: Vec<Cow<'static, str>>,
pub task_kind_access_flag: Vec<&'static str>,
pub first: Option<LayerAccessStatFullDetails>,
pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")]
pub enum InMemoryLayerInfo {
Open { lsn_start: Lsn },
Frozen { lsn_start: Lsn, lsn_end: Lsn },
}
#[derive(Debug, Clone, Serialize, Deserialize)]
#[derive(Debug, Clone, Serialize)]
#[serde(tag = "kind")]
pub enum HistoricLayerInfo {
Delta {
@@ -695,32 +710,6 @@ pub enum HistoricLayerInfo {
},
}
impl HistoricLayerInfo {
pub fn layer_file_name(&self) -> &str {
match self {
HistoricLayerInfo::Delta {
layer_file_name, ..
} => layer_file_name,
HistoricLayerInfo::Image {
layer_file_name, ..
} => layer_file_name,
}
}
pub fn is_remote(&self) -> bool {
match self {
HistoricLayerInfo::Delta { remote, .. } => *remote,
HistoricLayerInfo::Image { remote, .. } => *remote,
}
}
pub fn set_remote(&mut self, value: bool) {
let field = match self {
HistoricLayerInfo::Delta { remote, .. } => remote,
HistoricLayerInfo::Image { remote, .. } => remote,
};
*field = value;
}
}
#[derive(Debug, Serialize, Deserialize)]
pub struct DownloadRemoteLayersTaskSpawnRequest {
pub max_concurrent_downloads: NonZeroUsize,
@@ -747,37 +736,10 @@ pub struct TimelineGcRequest {
pub gc_horizon: Option<u64>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerProcessStatus {
pub pid: u32,
/// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
/// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
pub kind: Cow<'static, str>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct WalRedoManagerStatus {
pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
pub process: Option<WalRedoManagerProcessStatus>,
}
/// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
/// a download job, timing out while waiting for it to run, and then inspecting this status to understand
/// what's happening.
#[derive(Default, Debug, Serialize, Deserialize, Clone)]
pub struct SecondaryProgress {
/// The remote storage LastModified time of the heatmap object we last downloaded.
pub heatmap_mtime: Option<serde_system_time::SystemTime>,
/// The number of layers currently on-disk
pub layers_downloaded: usize,
/// The number of layers in the most recently seen heatmap
pub layers_total: usize,
/// The number of layer bytes currently on-disk
pub bytes_downloaded: u64,
/// The number of layer bytes in the most recently seen heatmap
pub bytes_total: u64,
pub pid: Option<u32>,
}
pub mod virtual_file {

View File

@@ -1,4 +1,4 @@
use utils::serde_system_time::SystemTime;
use std::time::SystemTime;
/// Pageserver current utilization and scoring for how good candidate the pageserver would be for
/// the next tenant.
@@ -7,7 +7,7 @@ use utils::serde_system_time::SystemTime;
///
/// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
/// not handle full u64 values properly.
#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
#[derive(serde::Serialize, Debug)]
pub struct PageserverUtilization {
/// Used disk space
#[serde(serialize_with = "ser_saturating_u63")]
@@ -21,9 +21,17 @@ pub struct PageserverUtilization {
/// When was this snapshot captured, pageserver local time.
///
/// Use millis to give confidence that the value is regenerated often enough.
#[serde(serialize_with = "ser_rfc3339_millis")]
pub captured_at: SystemTime,
}
fn ser_rfc3339_millis<S: serde::Serializer>(
ts: &SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
/// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
///
/// Instead of newtype, use this because a newtype would get require handling deserializing values
@@ -50,9 +58,7 @@ mod tests {
disk_usage_bytes: u64::MAX,
free_space_bytes: 0,
utilization_score: u64::MAX,
captured_at: SystemTime(
std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
),
captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
};
let s = serde_json::to_string(&doc).unwrap();

View File

@@ -8,89 +8,12 @@ use hex::FromHex;
use serde::{Deserialize, Serialize};
use utils::id::TenantId;
/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
///
/// This module contains a variety of types used to represent the concept of sharding
/// a Neon tenant across multiple physical shards. Since there are quite a few of these,
/// we provide a summary here.
///
/// Types used to describe shards:
/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
/// which identifies a tenant which is not shard-aware. This means its storage paths do not include
/// a shard suffix.
/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
/// without the tenant ID. This is useful for things that are implicitly scoped to a particular
/// tenant, such as layer files.
/// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
/// detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
/// four hex digits. An unsharded tenant is `0000`.
/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
///
/// Types used to describe the parameters for data distribution in a sharded tenant:
/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
/// multiple shards. Its value is given in 8kiB pages.
/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
/// always zero: this is provided for future upgrades that might introduce different
/// data distribution schemes.
///
/// Examples:
/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
/// and their slugs are 0004, 0104, 0204, and 0304.
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardNumber(pub u8);
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
pub struct ShardCount(u8);
/// Combination of ShardNumber and ShardCount. For use within the context of a particular tenant,
/// when we need to know which shard we're dealing with, but do not need to know the full
/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
/// the fully qualified TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
/// and to check whether that [`ShardNumber`] is the same as the current shard.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
/// Formatting helper, for generating the `shard_id` label in traces.
struct ShardSlug<'a>(&'a TenantShardId);
/// TenantShardId globally identifies a particular shard in a particular tenant.
///
/// These are written as `<TenantId>-<ShardSlug>`, for example:
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
///
/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
/// is both forward and backward compatible with TenantId: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardCount {
pub const MAX: Self = Self(u8::MAX);
@@ -115,7 +38,6 @@ impl ShardCount {
self.0
}
///
pub fn is_unsharded(&self) -> bool {
self.0 == 0
}
@@ -131,6 +53,33 @@ impl ShardNumber {
pub const MAX: Self = Self(u8::MAX);
}
/// TenantShardId identifies the units of work for the Pageserver.
///
/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
///
/// # The second shard in a two-shard tenant
/// 072f1291a5310026820b2fe4b2968934-0102
///
/// Historically, tenants could not have multiple shards, and were identified
/// by TenantId. To support this, TenantShardId has a special legacy
/// mode where `shard_count` is equal to zero: this represents a single-sharded
/// tenant which should be written as a TenantId with no suffix.
///
/// The human-readable encoding of TenantShardId, such as used in API URLs,
/// is both forward and backward compatible: a legacy TenantId can be
/// decoded as a TenantShardId, and when re-encoded it will be parseable
/// as a TenantId.
///
/// Note that the binary encoding is _not_ backward compatible, because
/// at the time sharding is introduced, there are no existing binary structures
/// containing TenantId that we need to handle.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct TenantShardId {
pub tenant_id: TenantId,
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl TenantShardId {
pub fn unsharded(tenant_id: TenantId) -> Self {
Self {
@@ -162,13 +111,10 @@ impl TenantShardId {
}
/// Convenience for code that has special behavior on the 0th shard.
pub fn is_shard_zero(&self) -> bool {
pub fn is_zero(&self) -> bool {
self.shard_number == ShardNumber(0)
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
}
@@ -204,6 +150,9 @@ impl TenantShardId {
}
}
/// Formatting helper
struct ShardSlug<'a>(&'a TenantShardId);
impl<'a> std::fmt::Display for ShardSlug<'a> {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
write!(
@@ -273,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
}
}
/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
pub struct ShardIndex {
pub shard_number: ShardNumber,
pub shard_count: ShardCount,
}
impl ShardIndex {
pub fn new(number: ShardNumber, count: ShardCount) -> Self {
Self {
@@ -287,9 +246,6 @@ impl ShardIndex {
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
}
@@ -357,8 +313,6 @@ impl Serialize for TenantShardId {
if serializer.is_human_readable() {
serializer.collect_str(self)
} else {
// Note: while human encoding of [`TenantShardId`] is backward and forward
// compatible, this binary encoding is not.
let mut packed: [u8; 18] = [0; 18];
packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
packed[16] = self.shard_number.0;
@@ -436,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
/// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
/// The ShardIdentity contains the information needed for one member of map
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
pub number: ShardNumber,
pub count: ShardCount,
pub stripe_size: ShardStripeSize,
layout: ShardLayout,
}
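The key-to-shard mapping itself is elided from this hunk; for intuition only, a plausible striped layout (an assumption, not the code's actual placement logic) deals fixed-size stripes of pages out across shards:

// Hypothetical striped placement: pages are grouped into stripes of
// `stripe_size` pages, and stripes are assigned round-robin across `count` shards.
fn page_to_shard(page_no: u32, stripe_size: u32, count: u8) -> u8 {
    ((page_no / stripe_size) % count as u32) as u8
}

fn main() {
    // DEFAULT_STRIPE_SIZE above: 256 MiB / 8 KiB pages = 32768 pages per stripe.
    let stripe: u32 = 256 * 1024 / 8;
    assert_eq!(stripe, 32768);
    assert_eq!(page_to_shard(0, stripe, 4), 0); // first stripe -> shard 0
    assert_eq!(page_to_shard(32768, stripe, 4), 1); // second stripe -> shard 1
    assert_eq!(page_to_shard(4 * 32768, stripe, 4), 0); // wraps around
}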
#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError {
#[error("Invalid shard count")]
@@ -475,9 +439,6 @@ impl ShardIdentity {
}
}
/// The "unsharded" value is distinct from simply having a single shard: it represents
/// a tenant which is not shard-aware at all, and whose storage paths will not include
/// a shard suffix.
pub fn is_unsharded(&self) -> bool {
self.number == ShardNumber(0) && self.count == ShardCount(0)
}
@@ -526,8 +487,6 @@ impl ShardIdentity {
}
/// Return true if the key should be ingested by this shard
///
/// Shards must ingest _at least_ keys which return true from this check.
pub fn is_key_local(&self, key: &Key) -> bool {
assert!(!self.is_broken());
if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -538,9 +497,7 @@ impl ShardIdentity {
}
/// Return true if the key should be discarded if found in this shard's
/// data store, e.g. during compaction after a split.
///
/// Shards _may_ drop keys which return false here, but are not obliged to.
/// data store, e.g. during compaction after a split
pub fn is_key_disposable(&self, key: &Key) -> bool {
if key_is_shard0(key) {
// Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -566,7 +523,7 @@ impl ShardIdentity {
/// Convenience for checking if this identity is the 0th shard in a tenant,
/// for special cases on shard 0 such as ingesting relation sizes.
pub fn is_shard_zero(&self) -> bool {
pub fn is_zero(&self) -> bool {
self.number == ShardNumber(0)
}
}

View File

@@ -6,9 +6,7 @@
use serde::{Deserialize, Serialize};
use utils::id::NodeId;
use crate::{
controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
};
use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
/// Upcall message sent by the pageserver to the configured `control_plane_api` on
/// startup.
@@ -22,20 +20,12 @@ pub struct ReAttachRequest {
pub register: Option<NodeRegisterRequest>,
}
fn default_mode() -> LocationConfigMode {
LocationConfigMode::AttachedSingle
}
#[derive(Serialize, Deserialize, Debug)]
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponseTenant {
pub id: TenantShardId,
/// Mandatory if LocationConfigMode is None or set to an Attached* mode
pub gen: Option<u32>,
/// Default value only for backward compat: this field should be set
#[serde(default = "default_mode")]
pub mode: LocationConfigMode,
pub gen: u32,
}
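To see how the #[serde(default = "default_mode")] fallback keeps old control-plane payloads parseable, here is a self-contained sketch (stand-in enum and field set; serde and serde_json assumed as dependencies):

use serde::Deserialize;

// Stand-in for LocationConfigMode, for illustration only.
#[derive(Deserialize, Debug, PartialEq)]
enum Mode {
    AttachedSingle,
    Secondary,
}

fn default_mode() -> Mode {
    Mode::AttachedSingle
}

#[derive(Deserialize, Debug)]
struct Tenant {
    id: String,
    gen: Option<u32>,
    // Missing in legacy payloads: falls back to AttachedSingle.
    #[serde(default = "default_mode")]
    mode: Mode,
}

fn main() {
    let t: Tenant = serde_json::from_str(r#"{"id":"t1","gen":3}"#).unwrap();
    assert_eq!(t.gen, Some(3));
    assert_eq!(t.mode, Mode::AttachedSingle);
    println!("{t:?}");
}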
#[derive(Serialize, Deserialize)]
pub struct ReAttachResponse {
pub tenants: Vec<ReAttachResponseTenant>,

View File

@@ -1,6 +1,5 @@
use anyhow::*;
use clap::{value_parser, Arg, ArgMatches, Command};
use postgres::Client;
use std::{path::PathBuf, str::FromStr};
use wal_craft::*;
@@ -9,8 +8,8 @@ fn main() -> Result<()> {
.init();
let arg_matches = cli().get_matches();
let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
let intermediate_lsns = match arg_matches
let wal_craft = |arg_matches: &ArgMatches, client| {
let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
.get_one::<String>("type")
.map(|s| s.as_str())
.context("'type' is required")?
@@ -26,7 +25,6 @@ fn main() -> Result<()> {
LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
a => panic!("Unknown --type argument: {a}"),
};
let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
for lsn in intermediate_lsns {
println!("intermediate_lsn = {lsn}");
}

View File

@@ -5,6 +5,7 @@ use postgres::types::PgLsn;
use postgres::Client;
use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
use std::cmp::Ordering;
use std::path::{Path, PathBuf};
use std::process::Command;
use std::time::{Duration, Instant};
@@ -231,52 +232,59 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
pub trait Crafter {
const NAME: &'static str;
/// Generates WAL using the client `client`. Returns a vector of some valid
/// "interesting" intermediate LSNs which one may start reading from.
/// test_end_of_wal uses this to check various starting points.
///
/// Note that postgres is generally keen to write some WAL. While we
/// try to disable it (autovacuum, big wal_writer_delay, etc.), it is always
/// possible, e.g. xl_running_xacts records are dumped every 15s. So checks about
/// stable WAL end would be flaky unless postgres is shut down. For this
/// reason returning potential end of WAL here is pointless. Most of the
/// time this doesn't happen though, so it is reasonable to create needed
/// WAL structure and immediately kill postgres like test_end_of_wal does.
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
/// Generates WAL using the client `client`. Returns a pair of:
/// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
/// May include or exclude Lsn(0) and the end-of-wal.
/// * The expected end-of-wal LSN.
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
}
/// Wraps a WAL craft function, providing the current LSN to it before the
/// insertion and flushing the WAL afterwards. Also pushes the initial LSN onto the
/// result.
fn craft_internal<C: postgres::GenericClient>(
client: &mut C,
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
) -> anyhow::Result<Vec<PgLsn>> {
f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
ensure_server_config(client)?;
let initial_lsn = client.pg_current_wal_insert_lsn()?;
info!("LSN initial = {}", initial_lsn);
let mut intermediate_lsns = f(client, initial_lsn)?;
let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
let last_lsn = match last_lsn {
None => client.pg_current_wal_insert_lsn()?,
Some(last_lsn) => {
let insert_lsn = client.pg_current_wal_insert_lsn()?;
match last_lsn.cmp(&insert_lsn) {
Ordering::Less => bail!(
"Some records were inserted after the crafted WAL: {} vs {}",
last_lsn,
insert_lsn
),
Ordering::Equal => last_lsn,
Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
}
}
};
if !intermediate_lsns.starts_with(&[initial_lsn]) {
intermediate_lsns.insert(0, initial_lsn);
}
// Some records may be not flushed, e.g. non-transactional logical messages.
//
// Note: this is broken if pg_current_wal_insert_lsn is at page boundary
// because pg_current_wal_insert_lsn skips page headers.
client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
Ok(intermediate_lsns)
match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
Ordering::Equal => {}
Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
}
Ok((intermediate_lsns, last_lsn))
}
pub struct Simple;
impl Crafter for Simple {
const NAME: &'static str = "simple";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
craft_internal(client, |client, _| {
client.execute("CREATE table t(x int)", &[])?;
Ok(Vec::new())
Ok((Vec::new(), None))
})
}
}
@@ -284,36 +292,29 @@ impl Crafter for Simple {
pub struct LastWalRecordXlogSwitch;
impl Crafter for LastWalRecordXlogSwitch {
const NAME: &'static str = "last_wal_record_xlog_switch";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
// Do not use craft_internal because here we end up with flush_lsn exactly on
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
// Do not use generate_internal because here we end up with flush_lsn exactly on
// the segment boundary and insert_lsn after the initial page header, which is unusual.
ensure_server_config(client)?;
client.execute("CREATE table t(x int)", &[])?;
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
// pg_switch_wal returns the end of the last record of the switched segment,
// i.e. the end of the SWITCH record itself.
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let before_xlog_switch_u64 = u64::from(before_xlog_switch);
let next_segment = PgLsn::from(
before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
+ WAL_SEGMENT_SIZE as u64,
);
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let next_segment = PgLsn::from(0x0200_0000);
ensure!(
xlog_switch_record_end <= next_segment,
"XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
xlog_switch_record_end,
after_xlog_switch <= next_segment,
"XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
after_xlog_switch,
next_segment
);
Ok(vec![before_xlog_switch, xlog_switch_record_end])
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
}
}
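The rounding above computes the first LSN of the next WAL segment from an arbitrary position; a worked example with the standard 16 MiB segment size:

fn main() {
    const WAL_SEGMENT_SIZE: u64 = 16 * 1024 * 1024; // 0x0100_0000
    let before_xlog_switch: u64 = 0x0123_4567;
    let next_segment =
        before_xlog_switch - (before_xlog_switch % WAL_SEGMENT_SIZE) + WAL_SEGMENT_SIZE;
    // 0x0123_4567 rounds up to the 0x0200_0000 boundary, matching the
    // hard-coded expectation in the older test variant above.
    assert_eq!(next_segment, 0x0200_0000);
}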
pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
/// Craft an xlog SWITCH record ending at a page boundary.
impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
// Do not use generate_internal because here we end up with flush_lsn exactly on
// the segment boundary and insert_lsn after the initial page header, which is unusual.
ensure_server_config(client)?;
@@ -360,29 +361,28 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
// Emit the XLOG_SWITCH
let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
let next_segment = PgLsn::from(0x0200_0000);
ensure!(
xlog_switch_record_end < next_segment,
"XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
xlog_switch_record_end,
after_xlog_switch < next_segment,
"XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
after_xlog_switch,
next_segment
);
ensure!(
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
"XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
xlog_switch_record_end,
u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
after_xlog_switch,
u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
);
Ok(vec![before_xlog_switch, xlog_switch_record_end])
Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
}
}
/// Write a ~16MB logical message; it should cross a WAL segment boundary.
fn craft_seg_size_logical_message(
fn craft_single_logical_message(
client: &mut impl postgres::GenericClient,
transactional: bool,
) -> anyhow::Result<Vec<PgLsn>> {
) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
craft_internal(client, |client, initial_lsn| {
ensure!(
initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -405,24 +405,34 @@ fn craft_seg_size_logical_message(
"Logical message crossed two segments"
);
Ok(vec![message_lsn])
if transactional {
// Transactional logical messages are part of a transaction, so the one above is
// followed by a small COMMIT record.
let after_message_lsn = client.pg_current_wal_insert_lsn()?;
ensure!(
message_lsn < after_message_lsn,
"No record found after the emitted message"
);
Ok((vec![message_lsn], Some(after_message_lsn)))
} else {
Ok((Vec::new(), Some(message_lsn)))
}
})
}
pub struct WalRecordCrossingSegmentFollowedBySmallOne;
impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
// Transactional message crossing WAL segment will be followed by small
// commit record.
craft_seg_size_logical_message(client, true)
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
craft_single_logical_message(client, true)
}
}
pub struct LastWalRecordCrossingSegment;
impl Crafter for LastWalRecordCrossingSegment {
const NAME: &'static str = "last_wal_record_crossing_segment";
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
craft_seg_size_logical_message(client, false)
fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
craft_single_logical_message(client, false)
}
}

View File

@@ -11,15 +11,13 @@ use utils::const_assert;
use utils::lsn::Lsn;
fn init_logging() {
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
"crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
)))
let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
))
.is_test(true)
.try_init();
}
/// Test that find_end_of_wal returns the same results as pg_waldump on various
/// WALs created by Crafter.
fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
use crate::*;
@@ -40,13 +38,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
}
cfg.initdb().unwrap();
let srv = cfg.start_server().unwrap();
let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
let (intermediate_lsns, expected_end_of_wal_partial) =
C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
let intermediate_lsns: Vec<Lsn> = intermediate_lsns
.iter()
.map(|&lsn| u64::from(lsn).into())
.collect();
// Kill postgres. Note that it might have inserted something into the WAL after
// 'craft' did its job.
let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
srv.kill();
// Check find_end_of_wal on the initial WAL
@@ -58,7 +56,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
.filter(|fname| IsXLogFileName(fname))
.max()
.unwrap();
let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
for start_lsn in intermediate_lsns
.iter()
.chain(std::iter::once(&expected_end_of_wal))
@@ -93,7 +91,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
}
}
fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
fn check_pg_waldump_end_of_wal(
cfg: &crate::Conf,
last_segment: &str,
expected_end_of_wal: Lsn,
) {
// Get the actual end of WAL by pg_waldump
let waldump_output = cfg
.pg_waldump("000000010000000000000001", last_segment)
@@ -111,8 +113,11 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
}
};
let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
info!("waldump erred on {}", waldump_wal_end);
waldump_wal_end
info!(
"waldump erred on {}, expected wal end at {}",
waldump_wal_end, expected_end_of_wal
);
assert_eq!(waldump_wal_end, expected_end_of_wal);
}
fn check_end_of_wal(
@@ -205,9 +210,9 @@ pub fn test_update_next_xid() {
#[test]
pub fn test_encode_logical_message() {
let expected = [
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
];
let actual = encode_logical_message("prefix", "message");
assert_eq!(expected, actual[..]);

View File

@@ -18,7 +18,6 @@ camino.workspace = true
humantime.workspace = true
hyper = { workspace = true, features = ["stream"] }
futures.workspace = true
rand.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }

View File

@@ -157,8 +157,9 @@ impl AzureBlobStorage {
let mut bufs = Vec::new();
while let Some(part) = response.next().await {
let part = part?;
let etag_str: &str = part.blob.properties.etag.as_ref();
if etag.is_none() {
etag = Some(part.blob.properties.etag);
etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
}
if last_modified.is_none() {
last_modified = Some(part.blob.properties.last_modified.into());
@@ -173,16 +174,6 @@ impl AzureBlobStorage {
.map_err(|e| DownloadError::Other(e.into()))?;
bufs.push(data);
}
if bufs.is_empty() {
return Err(DownloadError::Other(anyhow::anyhow!(
"Azure GET response contained no buffers"
)));
}
// unwrap safety: if these were None, bufs would be empty and we would have returned an error already
let etag = etag.unwrap();
let last_modified = last_modified.unwrap();
Ok(Download {
download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
etag,

View File

@@ -42,9 +42,6 @@ pub use self::{
};
use s3_bucket::RequestKind;
/// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
pub use azure_core::Etag;
pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
/// Currently, sync happens with AWS S3, that has two limits on requests per second:
@@ -294,9 +291,9 @@ pub type DownloadStream =
pub struct Download {
pub download_stream: DownloadStream,
/// The last time the file was modified (`last-modified` HTTP header)
pub last_modified: SystemTime,
pub last_modified: Option<SystemTime>,
/// A way to identify this specific version of the resource (`etag` HTTP header)
pub etag: Etag,
pub etag: Option<String>,
/// Extra key-value data, associated with the current remote file.
pub metadata: Option<StorageMetadata>,
}
@@ -565,16 +562,6 @@ impl GenericRemoteStorage {
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct StorageMetadata(HashMap<String, String>);
impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
fn from(arr: [(&str, &str); N]) -> Self {
let map: HashMap<String, String> = arr
.iter()
.map(|(k, v)| (k.to_string(), v.to_string()))
.collect();
Self(map)
}
}
/// External backup storage configuration, enough for creating a client for that storage.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct RemoteStorageConfig {

View File

@@ -10,7 +10,7 @@ use std::{
io::ErrorKind,
num::NonZeroU32,
pin::Pin,
time::{Duration, SystemTime, UNIX_EPOCH},
time::{Duration, SystemTime},
};
use anyhow::{bail, ensure, Context};
@@ -30,7 +30,6 @@ use crate::{
};
use super::{RemoteStorage, StorageMetadata};
use crate::Etag;
const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
@@ -198,7 +197,6 @@ impl LocalFs {
fs::OpenOptions::new()
.write(true)
.create(true)
.truncate(true)
.open(&temp_file_path)
.await
.with_context(|| {
@@ -408,37 +406,35 @@ impl RemoteStorage for LocalFs {
cancel: &CancellationToken,
) -> Result<Download, DownloadError> {
let target_path = from.with_base(&self.storage_root);
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let source = ReaderStream::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?,
);
let file_metadata = file_metadata(&target_path).await?;
let source = ReaderStream::new(
fs::OpenOptions::new()
.read(true)
.open(&target_path)
let metadata = self
.read_storage_metadata(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?,
);
.map_err(DownloadError::Other)?;
let metadata = self
.read_storage_metadata(&target_path)
.await
.map_err(DownloadError::Other)?;
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
let etag = mock_etag(&file_metadata);
Ok(Download {
metadata,
last_modified: file_metadata
.modified()
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
etag,
download_stream: Box::pin(source),
})
Ok(Download {
metadata,
last_modified: None,
etag: None,
download_stream: Box::pin(source),
})
} else {
Err(DownloadError::NotFound)
}
}
async fn download_byte_range(
@@ -456,51 +452,50 @@ impl RemoteStorage for LocalFs {
return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
}
}
let target_path = from.with_base(&self.storage_root);
let file_metadata = file_metadata(&target_path).await?;
let mut source = tokio::fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
if file_exists(&target_path).map_err(DownloadError::BadInput)? {
let mut source = tokio::fs::OpenOptions::new()
.read(true)
.open(&target_path)
.await
.with_context(|| {
format!("Failed to open source file {target_path:?} to use in the download")
})
.map_err(DownloadError::Other)?;
let len = source
.metadata()
.await
.context("query file length")
.map_err(DownloadError::Other)?
.len();
source
.seek(io::SeekFrom::Start(start_inclusive))
.await
.context("Failed to seek to the range start in a local storage file")
.map_err(DownloadError::Other)?;
let metadata = self
.read_storage_metadata(&target_path)
.await
.map_err(DownloadError::Other)?;
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
let source = ReaderStream::new(source);
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
Ok(Download {
metadata,
last_modified: None,
etag: None,
download_stream: Box::pin(source),
})
.map_err(DownloadError::Other)?;
let len = source
.metadata()
.await
.context("query file length")
.map_err(DownloadError::Other)?
.len();
source
.seek(io::SeekFrom::Start(start_inclusive))
.await
.context("Failed to seek to the range start in a local storage file")
.map_err(DownloadError::Other)?;
let metadata = self
.read_storage_metadata(&target_path)
.await
.map_err(DownloadError::Other)?;
let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
let source = ReaderStream::new(source);
let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
let etag = mock_etag(&file_metadata);
Ok(Download {
metadata,
last_modified: file_metadata
.modified()
.map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
etag,
download_stream: Box::pin(source),
})
} else {
Err(DownloadError::NotFound)
}
}
async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
@@ -615,22 +610,13 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
Ok(())
}
async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
tokio::fs::metadata(&file_path).await.map_err(|e| {
if e.kind() == ErrorKind::NotFound {
DownloadError::NotFound
} else {
DownloadError::BadInput(e.into())
}
})
}
// Use mtime as a stand-in for ETag. We could calculate a meaningful one by md5'ing the contents of files we
// read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
// quickly, with less overhead than using a mock S3 server.
fn mock_etag(meta: &std::fs::Metadata) -> Etag {
let mtime = meta.modified().expect("Filesystem mtime missing");
format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
if file_path.exists() {
ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
Ok(true)
} else {
Ok(false)
}
}
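The mtime-as-ETag derivation above is easy to demonstrate standalone; a minimal sketch (the temp path is illustrative):

use std::time::UNIX_EPOCH;

fn main() -> std::io::Result<()> {
    let path = std::env::temp_dir().join("etag_demo");
    std::fs::write(&path, b"hello")?;
    // Same derivation as mock_etag: mtime in milliseconds since the epoch.
    let mtime = std::fs::metadata(&path)?.modified()?;
    let etag = format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis());
    println!("mock etag: {etag}");
    Ok(())
}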
#[cfg(test)]

View File

@@ -35,8 +35,8 @@ use aws_sdk_s3::{
};
use aws_smithy_async::rt::sleep::TokioSleep;
use aws_smithy_types::byte_stream::ByteStream;
use aws_smithy_types::{body::SdkBody, DateTime};
use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
use bytes::Bytes;
use futures::stream::Stream;
use hyper::Body;
@@ -287,17 +287,8 @@ impl S3Bucket {
let remaining = self.timeout.saturating_sub(started_at.elapsed());
let metadata = object_output.metadata().cloned().map(StorageMetadata);
let etag = object_output
.e_tag
.ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
.into();
let last_modified = object_output
.last_modified
.ok_or(DownloadError::Other(anyhow::anyhow!(
"Missing LastModified header"
)))?
.try_into()
.map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
let etag = object_output.e_tag;
let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
let body = object_output.body;
let body = ByteStreamAsStream::from(body);

View File

@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -146,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();

View File

@@ -118,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
// A little check to ensure that our clock is not too far off from the S3 clock
{
let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
let last_modified = dl.last_modified;
let last_modified = dl.last_modified.unwrap();
let half_wt = WAIT_TIME.mul_f32(0.5);
let t0_hwt = t0 + half_wt;
let t1_hwt = t1 - half_wt;
@@ -219,6 +219,7 @@ enum MaybeEnabledStorage {
Disabled,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorage {
async fn setup() -> Self {
ensure_logging_ready();
@@ -247,6 +248,7 @@ struct S3WithTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();
@@ -308,6 +310,7 @@ struct S3WithSimpleTestBlobs {
remote_blobs: HashSet<RemotePath>,
}
#[async_trait::async_trait]
impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
async fn setup() -> Self {
ensure_logging_ready();

View File

@@ -247,7 +247,7 @@ fn scenario_4() {
//
// This is in total 5000 + 1000 + 5000 + 1000 = 12000
//
// (If we used the method from the previous scenario, and
// (If we used the the method from the previous scenario, and
// kept only snapshot at the branch point, we'd need to keep
// all the WAL between 10000-18000 on the main branch, so
// the total size would be 5000 + 1000 + 8000 = 14000. The

View File

@@ -13,7 +13,6 @@ testing = ["fail/failpoints"]
[dependencies]
arc-swap.workspace = true
sentry.workspace = true
async-compression.workspace = true
async-trait.workspace = true
anyhow.workspace = true
bincode.workspace = true
@@ -22,7 +21,6 @@ camino.workspace = true
chrono.workspace = true
heapless.workspace = true
hex = { workspace = true, features = ["serde"] }
humantime.workspace = true
hyper = { workspace = true, features = ["full"] }
fail.workspace = true
futures = { workspace = true}
@@ -38,7 +36,6 @@ serde_json.workspace = true
signal-hook.workspace = true
thiserror.workspace = true
tokio.workspace = true
tokio-tar.workspace = true
tokio-util.workspace = true
tracing.workspace = true
tracing-error.workspace = true
@@ -49,7 +46,6 @@ strum.workspace = true
strum_macros.workspace = true
url.workspace = true
uuid.workspace = true
walkdir.workspace = true
pq_proto.workspace = true
postgres_connection.workspace = true

View File

@@ -1,21 +0,0 @@
//! Wrapper around `std::env::var` for parsing environment variables.
use std::{fmt::Display, str::FromStr};
pub fn var<V, E>(varname: &str) -> Option<V>
where
V: FromStr<Err = E>,
E: Display,
{
match std::env::var(varname) {
Ok(s) => Some(
s.parse()
.map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
.unwrap(),
),
Err(std::env::VarError::NotPresent) => None,
Err(std::env::VarError::NotUnicode(_)) => {
panic!("env var {varname} is not unicode")
}
}
}
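For illustration, a typical call site for this (now removed) helper; the variable name is made up, and `var` is assumed to be in scope:

fn main() {
    // Parses the env var if present; panics with a descriptive message if it
    // is set but not a valid u64.
    match var::<u64, _>("NEON_DEMO_TIMEOUT_SECS") {
        Some(secs) => println!("timeout override: {secs}s"),
        None => println!("variable unset, using the default"),
    }
}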

View File

@@ -47,10 +47,9 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
}
}
#[derive(serde::Serialize, serde::Deserialize)]
#[derive(serde::Serialize)]
struct SerdeRepr<T> {
buffer: Vec<T>,
buffer_size: usize,
drop_count: u64,
}
@@ -62,7 +61,6 @@ where
let HistoryBufferWithDropCounter { buffer, drop_count } = value;
SerdeRepr {
buffer: buffer.iter().cloned().collect(),
buffer_size: L,
drop_count: *drop_count,
}
}
@@ -80,52 +78,19 @@ where
}
}
impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
where
T: Clone + serde::Deserialize<'de>,
{
fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
where
D: serde::Deserializer<'de>,
{
let SerdeRepr {
buffer: des_buffer,
drop_count,
buffer_size,
} = SerdeRepr::<T>::deserialize(deserializer)?;
if buffer_size != L {
use serde::de::Error;
return Err(D::Error::custom(format!(
"invalid buffer_size, expecting {L} got {buffer_size}"
)));
}
let mut buffer = HistoryBuffer::new();
buffer.extend(des_buffer);
Ok(HistoryBufferWithDropCounter { buffer, drop_count })
}
}
#[cfg(test)]
mod test {
use super::HistoryBufferWithDropCounter;
#[test]
fn test_basics() {
let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
b.write(1);
b.write(2);
b.write(3);
assert!(b.iter().any(|e| *e == 2));
assert!(b.iter().any(|e| *e == 3));
assert!(!b.iter().any(|e| *e == 1));
// round-trip serde
let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
assert_eq!(
round_tripped.iter().cloned().collect::<Vec<_>>(),
b.iter().cloned().collect::<Vec<_>>()
);
}
#[test]

View File

@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
}
}
pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
SERVE_METRICS_COUNT.inc();
let started_at = std::time::Instant::now();
@@ -367,6 +367,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
.middleware(Middleware::post_with_info(
add_request_id_header_to_response,
))
.get("/metrics", |r| request_span(r, prometheus_metrics_handler))
.err_handler(route_error_handler)
}

View File

@@ -63,7 +63,6 @@ pub mod measured_stream;
pub mod serde_percent;
pub mod serde_regex;
pub mod serde_system_time;
pub mod pageserver_feedback;
@@ -88,12 +87,6 @@ pub mod failpoint_support;
pub mod yielding_loop;
pub mod zstd;
pub mod env;
pub mod poison;
/// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
///
/// we have several cases:

View File

@@ -63,7 +63,6 @@ impl UnwrittenLockFile {
pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
let lock_file = fs::OpenOptions::new()
.create(true) // O_CREAT
.truncate(true)
.write(true)
.open(lock_file_path)
.context("open lock file")?;

View File

@@ -29,10 +29,12 @@ pub struct PageserverFeedback {
// Serialize with RFC3339 format.
#[serde(with = "serde_systemtime")]
pub replytime: SystemTime,
/// Used to track feedback from different shards. Always zero for unsharded tenants.
pub shard_number: u32,
}
// NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
// Do not remove previously available fields because this might be backwards incompatible.
pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
impl PageserverFeedback {
pub fn empty() -> PageserverFeedback {
PageserverFeedback {
@@ -41,7 +43,6 @@ impl PageserverFeedback {
remote_consistent_lsn: Lsn::INVALID,
disk_consistent_lsn: Lsn::INVALID,
replytime: *PG_EPOCH,
shard_number: 0,
}
}
@@ -58,26 +59,17 @@ impl PageserverFeedback {
//
// TODO: change serialized fields names once all computes migrate to rename.
pub fn serialize(&self, buf: &mut BytesMut) {
let buf_ptr = buf.len();
buf.put_u8(0); // # of keys, will be filled later
let mut nkeys = 0;
nkeys += 1;
buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
buf.put_slice(b"current_timeline_size\0");
buf.put_i32(8);
buf.put_u64(self.current_timeline_size);
nkeys += 1;
buf.put_slice(b"ps_writelsn\0");
buf.put_i32(8);
buf.put_u64(self.last_received_lsn.0);
nkeys += 1;
buf.put_slice(b"ps_flushlsn\0");
buf.put_i32(8);
buf.put_u64(self.disk_consistent_lsn.0);
nkeys += 1;
buf.put_slice(b"ps_applylsn\0");
buf.put_i32(8);
buf.put_u64(self.remote_consistent_lsn.0);
@@ -88,19 +80,9 @@ impl PageserverFeedback {
.expect("failed to serialize pg_replytime earlier than PG_EPOCH")
.as_micros() as i64;
nkeys += 1;
buf.put_slice(b"ps_replytime\0");
buf.put_i32(8);
buf.put_i64(timestamp);
if self.shard_number > 0 {
nkeys += 1;
buf.put_slice(b"shard_number\0");
buf.put_i32(4);
buf.put_u32(self.shard_number);
}
buf[buf_ptr] = nkeys;
}
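The serialize() above reserves the nkeys byte up front and patches it once all conditional fields are appended; a standalone sketch of that pattern (bytes crate assumed):

use bytes::{BufMut, BytesMut};

fn main() {
    let mut buf = BytesMut::new();
    let count_at = buf.len();
    buf.put_u8(0); // placeholder for the key count
    let mut nkeys = 0u8;

    for (key, val) in [("ps_writelsn", 0x1234u64), ("ps_flushlsn", 0x1200u64)] {
        nkeys += 1;
        buf.put_slice(key.as_bytes());
        buf.put_u8(0); // NUL-terminated key, as in the wire format above
        buf.put_i32(8); // value length
        buf.put_u64(val);
    }

    buf[count_at] = nkeys; // patch in the real count
    assert_eq!(buf[0], 2);
}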
// Deserialize PageserverFeedback message
@@ -143,8 +125,9 @@ impl PageserverFeedback {
}
b"shard_number" => {
let len = buf.get_i32();
assert_eq!(len, 4);
rf.shard_number = buf.get_u32();
// TODO: this will be implemented in the next update,
// for now, we just skip the value.
buf.advance(len as usize);
}
_ => {
let len = buf.get_i32();
@@ -217,7 +200,10 @@ mod tests {
rf.serialize(&mut data);
// Add an extra field to the buffer and adjust number of keys
data[0] += 1;
if let Some(first) = data.first_mut() {
*first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
}
data.put_slice(b"new_field_one\0");
data.put_i32(8);
data.put_u64(42);

View File

@@ -1,121 +0,0 @@
//! Protect a piece of state from reuse after it is left in an inconsistent state.
//!
//! # Example
//!
//! ```
//! # tokio_test::block_on(async {
//! use utils::poison::Poison;
//! use std::time::Duration;
//!
//! struct State {
//! clean: bool,
//! }
//! let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
//!
//! let mut mutex_guard = state.lock().await;
//! let mut poison_guard = mutex_guard.check_and_arm()?;
//! let state = poison_guard.data_mut();
//! state.clean = false;
//! // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
//! tokio::time::sleep(Duration::from_secs(10)).await;
//! state.clean = true;
//! poison_guard.disarm();
//! # Ok::<(), utils::poison::Error>(())
//! # });
//! ```
use tracing::warn;
pub struct Poison<T> {
what: &'static str,
state: State,
data: T,
}
#[derive(Clone, Copy)]
enum State {
Clean,
Armed,
Poisoned { at: chrono::DateTime<chrono::Utc> },
}
impl<T> Poison<T> {
/// We log `what` at `warn!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
pub fn new(what: &'static str, data: T) -> Self {
Self {
what,
state: State::Clean,
data,
}
}
/// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
match self.state {
State::Clean => {
self.state = State::Armed;
Ok(Guard(self))
}
State::Armed => unreachable!("transient state"),
State::Poisoned { at } => Err(Error::Poisoned {
what: self.what,
at,
}),
}
}
}
/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
/// Once modifications are done, use [`Self::disarm`].
/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
pub struct Guard<'a, T>(&'a mut Poison<T>);
impl<'a, T> Guard<'a, T> {
pub fn data(&self) -> &T {
&self.0.data
}
pub fn data_mut(&mut self) -> &mut T {
&mut self.0.data
}
pub fn disarm(self) {
match self.0.state {
State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
State::Armed => {
self.0.state = State::Clean;
}
State::Poisoned { at } => {
unreachable!("we fail check_and_arm() if it's in that state: {at}")
}
}
}
}
impl<'a, T> Drop for Guard<'a, T> {
fn drop(&mut self) {
match self.0.state {
State::Clean => {
// set by disarm()
}
State::Armed => {
// still armed => poison it
let at = chrono::Utc::now();
self.0.state = State::Poisoned { at };
warn!(at=?at, "poisoning {}", self.0.what);
}
State::Poisoned { at } => {
unreachable!("we fail check_and_arm() if it's in that state: {at}")
}
}
}
}
#[derive(thiserror::Error, Debug)]
pub enum Error {
#[error("poisoned at {at}: {what}")]
Poisoned {
what: &'static str,
at: chrono::DateTime<chrono::Utc>,
},
}

View File

@@ -182,18 +182,6 @@ where
}
}
/// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
pub fn would_wait_for(&self, num: V) -> Result<(), V> {
let internal = self.internal.lock().unwrap();
let cnt = internal.current.cnt_value();
drop(internal);
if cnt >= num {
Ok(())
} else {
Err(cnt)
}
}
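A standalone analogue of that contract (not the actual SeqWait internals): Ok(()) means the number has already arrived, Err carries how far the counter actually got.

fn would_wait_for(current: u64, num: u64) -> Result<(), u64> {
    if current >= num {
        Ok(())
    } else {
        Err(current)
    }
}

fn main() {
    assert_eq!(would_wait_for(7, 5), Ok(())); // already arrived, no wait
    assert_eq!(would_wait_for(3, 5), Err(3)); // a waiter would block at 3
}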
/// Register and return a channel that will be notified when a number arrives,
/// or None, if it has already arrived.
fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {

View File

@@ -1,55 +0,0 @@
//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
#[serde(transparent)]
pub struct SystemTime(
#[serde(
deserialize_with = "deser_rfc3339_millis",
serialize_with = "ser_rfc3339_millis"
)]
pub std::time::SystemTime,
);
fn ser_rfc3339_millis<S: serde::ser::Serializer>(
ts: &std::time::SystemTime,
serializer: S,
) -> Result<S::Ok, S::Error> {
serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
}
fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
where
D: serde::de::Deserializer<'de>,
{
let s: String = serde::de::Deserialize::deserialize(deserializer)?;
humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
}
#[cfg(test)]
mod tests {
use super::*;
/// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
fn to_millisecond_precision(time: SystemTime) -> SystemTime {
match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
Ok(duration) => {
let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
SystemTime(
std::time::SystemTime::UNIX_EPOCH
+ std::time::Duration::from_millis(total_millis),
)
}
Err(_) => time,
}
}
#[test]
fn test_serialize_deserialize() {
let input = SystemTime(std::time::SystemTime::now());
let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
let serialized = serde_json::to_string(&input).unwrap();
assert_eq!(expected_serialized, serialized);
let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
assert_eq!(to_millisecond_precision(input), deserialized);
}
}

View File

@@ -110,49 +110,6 @@ impl<T> OnceCell<T> {
}
}
/// Returns a guard to an existing initialized value, or returns a unique initialization
/// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
// It looks like OnceCell::get_or_init could be implemented using this method instead of
// duplication. However, that makes the future be !Send due to possibly holding on to the
// MutexGuard over an await point.
loop {
let sem = {
let guard = self.inner.lock().unwrap();
if guard.value.is_some() {
return Ok(Guard(guard));
}
guard.init_semaphore.clone()
};
{
let permit = {
// increment the count for the duration of queued
let _guard = CountWaitingInitializers::start(self);
sem.acquire().await
};
let Ok(permit) = permit else {
let guard = self.inner.lock().unwrap();
if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
// there was a take_and_deinit in between
continue;
}
assert!(
guard.value.is_some(),
"semaphore got closed, must be initialized"
);
return Ok(Guard(guard));
};
permit.forget();
}
let permit = InitPermit(sem);
return Err(permit);
}
}
/// Assuming a permit is held after a previous call to [`Guard::take_and_deinit`], it can be used
/// to complete initializing the inner value.
///
@@ -192,14 +149,6 @@ impl<T> OnceCell<T> {
}
}
/// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
/// initialized.
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
let inner = self.inner.get_mut().unwrap();
inner.take_and_deinit()
}
/// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
pub fn initializer_count(&self) -> usize {
self.initializers.load(Ordering::Relaxed)
@@ -253,24 +202,16 @@ impl<'a, T> Guard<'a, T> {
///
/// The permit will be on a semaphore part of the new internal value, and any following
/// [`OnceCell::get_or_init`] will wait on it to complete.
pub fn take_and_deinit(mut self) -> (T, InitPermit) {
self.0
.take_and_deinit()
.expect("guard is not created unless value has been initialized")
}
}
impl<T> Inner<T> {
pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
let value = self.value.take()?;
pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
let mut swapped = Inner::default();
let sem = swapped.init_semaphore.clone();
// acquire and forget right away, moving the control over to InitPermit
sem.try_acquire().expect("we just created this").forget();
let permit = InitPermit(sem);
std::mem::swap(self, &mut swapped);
Some((value, permit))
std::mem::swap(&mut *self.0, &mut swapped);
swapped
.value
.map(|v| (v, InitPermit(sem)))
.expect("guard is not created unless value has been initialized")
}
}
@@ -279,13 +220,6 @@ impl<T> Inner<T> {
/// On drop, this type will return the permit.
pub struct InitPermit(Arc<tokio::sync::Semaphore>);
impl std::fmt::Debug for InitPermit {
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
let ptr = Arc::as_ptr(&self.0) as *const ();
f.debug_tuple("InitPermit").field(&ptr).finish()
}
}
impl Drop for InitPermit {
fn drop(&mut self) {
assert_eq!(
@@ -547,57 +481,4 @@ mod tests {
assert_eq!("t1", *cell.get().unwrap());
}
#[tokio::test(start_paused = true)]
async fn detached_init_smoke() {
let target = OnceCell::default();
let Err(permit) = target.get_or_init_detached().await else {
unreachable!("it is not initialized")
};
tokio::time::timeout(
std::time::Duration::from_secs(3600 * 24 * 7 * 365),
target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
)
.await
.expect_err("should timeout since we are already holding the permit");
target.set(42, permit);
let (_answer, permit) = {
let guard = target
.get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
.await
.unwrap();
assert_eq!(*guard, 42);
guard.take_and_deinit()
};
assert!(target.get().is_none());
target.set(11, permit);
assert_eq!(*target.get().unwrap(), 11);
}
#[tokio::test]
async fn take_and_deinit_on_mut() {
use std::convert::Infallible;
let mut target = OnceCell::<u32>::default();
assert!(target.take_and_deinit().is_none());
target
.get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
.await
.unwrap();
let again = target.take_and_deinit();
assert!(matches!(again, Some((42, _))), "{again:?}");
assert!(target.take_and_deinit().is_none());
}
}

View File

@@ -1,60 +1,27 @@
use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum VecMapOrdering {
Greater,
GreaterOrEqual,
}
/// Ordered map data structure implemented in a Vec.
/// Append only - can only add keys that are larger than the
/// current max key.
/// Ordering can be adjusted using [`VecMapOrdering`]
/// during `VecMap` construction.
#[derive(Clone, Debug)]
pub struct VecMap<K, V> {
data: Vec<(K, V)>,
ordering: VecMapOrdering,
}
pub struct VecMap<K, V>(Vec<(K, V)>);
impl<K, V> Default for VecMap<K, V> {
fn default() -> Self {
VecMap {
data: Default::default(),
ordering: VecMapOrdering::Greater,
}
VecMap(Default::default())
}
}
#[derive(thiserror::Error, Debug)]
pub enum VecMapError {
#[error("Key violates ordering constraint")]
InvalidKey,
#[error("Mismatched ordering constraints")]
ExtendOrderingError,
}
#[derive(Debug)]
pub struct InvalidKey;
impl<K: Ord, V> VecMap<K, V> {
pub fn new(ordering: VecMapOrdering) -> Self {
Self {
data: Vec::new(),
ordering,
}
}
pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
Self {
data: Vec::with_capacity(capacity),
ordering,
}
}
pub fn is_empty(&self) -> bool {
self.data.is_empty()
self.0.is_empty()
}
pub fn as_slice(&self) -> &[(K, V)] {
self.data.as_slice()
self.0.as_slice()
}
/// This function may panic if given a range where the lower bound is
@@ -62,7 +29,7 @@ impl<K: Ord, V> VecMap<K, V> {
pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
use std::ops::Bound::*;
let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
let start_idx = match range.start_bound() {
Unbounded => 0,
@@ -74,7 +41,7 @@ impl<K: Ord, V> VecMap<K, V> {
};
let end_idx = match range.end_bound() {
Unbounded => self.data.len(),
Unbounded => self.0.len(),
Included(k) => match binary_search(k) {
Ok(idx) => idx + 1,
Err(idx) => idx,
@@ -82,30 +49,34 @@ impl<K: Ord, V> VecMap<K, V> {
Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
};
&self.data[start_idx..end_idx]
&self.0[start_idx..end_idx]
}
/// Add a key value pair to the map.
/// If `key` does not respect the ordering of `self`, the
/// pair will not be added and an `InvalidKey` error will be returned.
pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
self.validate_key_order(&key)?;
/// If `key` is less than or equal to the current maximum key,
/// the pair will not be added and an InvalidKey error will be returned.
pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
if let Some((last_key, _last_value)) = self.0.last() {
if &key <= last_key {
return Err(InvalidKey);
}
}
let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
Ok(delta_size)
}
/// Update the maximum key value pair or add a new key value pair to the map.
/// If `key` does not respect the ordering of `self`, no updates or additions
/// will occur and an `InvalidKey` error will be returned.
/// If `key` is less than the current maximum key, no updates or additions
/// will occur and an InvalidKey error will be returned.
pub fn append_or_update_last(
&mut self,
key: K,
mut value: V,
) -> Result<(Option<V>, usize), VecMapError> {
if let Some((last_key, last_value)) = self.data.last_mut() {
) -> Result<(Option<V>, usize), InvalidKey> {
if let Some((last_key, last_value)) = self.0.last_mut() {
match key.cmp(last_key) {
Ordering::Less => return Err(VecMapError::InvalidKey),
Ordering::Less => return Err(InvalidKey),
Ordering::Equal => {
std::mem::swap(last_value, &mut value);
const DELTA_SIZE: usize = 0;
@@ -129,67 +100,40 @@ impl<K: Ord, V> VecMap<K, V> {
V: Clone,
{
let split_idx = self
.data
.0
.binary_search_by_key(&cutoff, extract_key)
.unwrap_or_else(std::convert::identity);
(
VecMap {
data: self.data[..split_idx].to_vec(),
ordering: self.ordering,
},
VecMap {
data: self.data[split_idx..].to_vec(),
ordering: self.ordering,
},
VecMap(self.0[..split_idx].to_vec()),
VecMap(self.0[split_idx..].to_vec()),
)
}
/// Move items from `other` to the end of `self`, leaving `other` empty.
/// If the ordering of `other` differs from the ordering of `self`,
/// an `ExtendOrderingError` will be returned.
/// If any key in `other` does not respect the ordering defined in
/// `self`, an `InvalidKey` error will be returned and no mutation will occur.
pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
if self.ordering != other.ordering {
return Err(VecMapError::ExtendOrderingError);
}
/// If any key in `other` is less than or equal to any key in `self`,
/// an `InvalidKey` error will be returned and no mutation will occur.
pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
let self_last_opt = self.0.last().map(extract_key);
let other_first_opt = other.0.last().map(extract_key);
let other_first_opt = other.data.last().map(extract_key);
if let Some(other_first) = other_first_opt {
self.validate_key_order(other_first)?;
}
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
Ok(delta_size)
}
/// Validate the current last key in `self` and key being
/// inserted against the order defined in `self`.
fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
if let Some(last_key) = self.data.last().map(extract_key) {
match (&self.ordering, &key.cmp(last_key)) {
(VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
return Err(VecMapError::InvalidKey);
}
(VecMapOrdering::Greater, Ordering::Greater) => {}
(VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
return Err(VecMapError::InvalidKey);
}
(VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
if self_last >= other_first {
return Err(InvalidKey);
}
}
Ok(())
let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
Ok(delta_size)
}
/// Instrument an operation on the underlying [`Vec`].
/// Will panic if the operation decreases capacity.
/// Returns the increase in memory usage caused by the op.
fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
let old_cap = self.data.capacity();
op(&mut self.data);
let new_cap = self.data.capacity();
let old_cap = self.0.capacity();
op(&mut self.0);
let new_cap = self.0.capacity();
match old_cap.cmp(&new_cap) {
Ordering::Less => {
@@ -201,36 +145,6 @@ impl<K: Ord, V> VecMap<K, V> {
Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
}
}
/// Similar to `from_iter` defined in the `FromIterator` trait, except
/// that it accepts a [`VecMapOrdering`]
pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
let iter = iter.into_iter();
let initial_capacity = {
match iter.size_hint() {
(lower_bound, None) => lower_bound,
(_, Some(upper_bound)) => upper_bound,
}
};
let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
for (key, value) in iter {
vec_map
.append(key, value)
.expect("The passed collection needs to be sorted!");
}
vec_map
}
}
impl<K: Ord, V> IntoIterator for VecMap<K, V> {
type Item = (K, V);
type IntoIter = std::vec::IntoIter<(K, V)>;
fn into_iter(self) -> Self::IntoIter {
self.data.into_iter()
}
}
fn extract_key<K, V>(entry: &(K, V)) -> &K {
@@ -241,7 +155,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
mod tests {
use std::{collections::BTreeMap, ops::Bound};
use super::{VecMap, VecMapOrdering};
use super::VecMap;
#[test]
fn unbounded_range() {
@@ -396,59 +310,5 @@ mod tests {
left.extend(&mut one_map).unwrap_err();
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
assert_eq!(one_map.as_slice(), &[(1, ())]);
let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
map_greater_or_equal.append(2, ()).unwrap();
map_greater_or_equal.append(2, ()).unwrap();
left.extend(&mut map_greater_or_equal).unwrap_err();
assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
}
#[test]
fn extend_with_ordering() {
let mut left = VecMap::new(VecMapOrdering::GreaterOrEqual);
left.append(0, ()).unwrap();
assert_eq!(left.as_slice(), &[(0, ())]);
let mut greater_right = VecMap::new(VecMapOrdering::Greater);
greater_right.append(0, ()).unwrap();
left.extend(&mut greater_right).unwrap_err();
assert_eq!(left.as_slice(), &[(0, ())]);
let mut greater_or_equal_right = VecMap::new(VecMapOrdering::GreaterOrEqual);
greater_or_equal_right.append(2, ()).unwrap();
greater_or_equal_right.append(2, ()).unwrap();
left.extend(&mut greater_or_equal_right).unwrap();
assert_eq!(left.as_slice(), &[(0, ()), (2, ()), (2, ())]);
}
#[test]
fn vec_map_from_sorted() {
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ())];
let vec_map = VecMap::from_iter(vec, VecMapOrdering::Greater);
assert_eq!(vec_map.as_slice(), &[(1, ()), (2, ()), (3, ()), (6, ())]);
let vec = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
let vec_map = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
assert_eq!(
vec_map.as_slice(),
&[(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())]
);
}
#[test]
#[should_panic]
fn vec_map_from_unsorted_greater() {
let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
}
#[test]
#[should_panic]
fn vec_map_from_unsorted_greater_or_equal() {
let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
}
}

View File

@@ -1,78 +0,0 @@
use std::io::SeekFrom;
use anyhow::{Context, Result};
use async_compression::{
tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
zstd::CParameter,
Level,
};
use camino::Utf8Path;
use nix::NixPath;
use tokio::{
fs::{File, OpenOptions},
io::AsyncBufRead,
io::AsyncSeekExt,
io::AsyncWriteExt,
};
use tokio_tar::{Archive, Builder, HeaderMode};
use walkdir::WalkDir;
/// Creates a Zstandard tarball.
pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
let file = OpenOptions::new()
.create(true)
.truncate(true)
.read(true)
.write(true)
.open(&tarball)
.await
.with_context(|| format!("tempfile creation {tarball}"))?;
let mut paths = Vec::new();
for entry in WalkDir::new(path) {
let entry = entry?;
let metadata = entry.metadata().expect("error getting dir entry metadata");
// Also allow directories so that we also get empty directories
if !(metadata.is_file() || metadata.is_dir()) {
continue;
}
let path = entry.into_path();
paths.push(path);
}
// Do a sort to get a more consistent listing
paths.sort_unstable();
let zstd = ZstdEncoder::with_quality_and_params(
file,
Level::Default,
&[CParameter::enable_long_distance_matching(true)],
);
let mut builder = Builder::new(zstd);
// Use reproducible header mode
builder.mode(HeaderMode::Deterministic);
for p in paths {
let rel_path = p.strip_prefix(path)?;
if rel_path.is_empty() {
// The top directory should not be compressed,
// the tar crate doesn't like that
continue;
}
builder.append_path_with_name(&p, rel_path).await?;
}
let mut zstd = builder.into_inner().await?;
zstd.shutdown().await?;
let mut compressed = zstd.into_inner();
let compressed_len = compressed.metadata().await?.len();
compressed.seek(SeekFrom::Start(0)).await?;
Ok((compressed, compressed_len))
}
/// Extracts a Zstandard tarball.
pub async fn extract_zst_tarball(
path: &Utf8Path,
tarball: impl AsyncBufRead + Unpin,
) -> Result<()> {
let decoder = Box::pin(ZstdDecoder::new(tarball));
let mut archive = Archive::new(decoder);
archive.unpack(path).await?;
Ok(())
}
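A hedged usage sketch of the two helpers above, compressing a directory and unpacking it elsewhere; the paths are illustrative and tokio's macros/rt features are assumed:

use camino::Utf8Path;
use tokio::io::BufReader;

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    let src = Utf8Path::new("/tmp/demo_src");
    let tarball = Utf8Path::new("/tmp/demo.tar.zst");
    let dst = Utf8Path::new("/tmp/demo_dst");

    let (file, compressed_len) = create_zst_tarball(src, tarball).await?;
    println!("wrote {compressed_len} compressed bytes");

    // create_zst_tarball rewinds the file, so it can be streamed straight back;
    // BufReader provides the AsyncBufRead that extract_zst_tarball expects.
    extract_zst_tarball(dst, BufReader::new(file)).await?;
    Ok(())
}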

View File

@@ -69,7 +69,7 @@ pub struct Config {
/// should be removed once we have a better solution there.
sys_buffer_bytes: u64,
/// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
/// Minimum fraction of total system memory reserved *before* the the cgroup threshold; in
/// other words, providing a ceiling for the highest value of the threshold by enforcing that
/// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
/// threshold.

View File

@@ -324,11 +324,11 @@ extern "C" fn finish_sync_safekeepers(wp: *mut WalProposer, lsn: XLogRecPtr) {
}
}
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer, sk: *mut Safekeeper) {
extern "C" fn process_safekeeper_feedback(wp: *mut WalProposer) {
unsafe {
let callback_data = (*(*wp).config).callback_data;
let api = callback_data as *mut Box<dyn ApiImpl>;
(*api).process_safekeeper_feedback(&mut (*wp), &mut (*sk));
(*api).process_safekeeper_feedback(&mut (*wp))
}
}

View File

@@ -142,7 +142,7 @@ pub trait ApiImpl {
todo!()
}
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer, _sk: &mut Safekeeper) {
fn process_safekeeper_feedback(&mut self, _wp: &mut WalProposer) {
todo!()
}

Some files were not shown because too many files have changed in this diff.