Prepare for first stage of deployment: do not bump format version and do not write data in new format but recognize new format

Make ruff happy
Add test for compression
2026-06-30 02:30:38 +00:00 · 2024-03-15 10:02:51 +02:00 · 2024-03-14 18:05:30 +02:00 · 2024-03-14 16:45:45 +02:00 · 2024-03-14 14:21:35 +02:00 · 2024-03-14 08:33:37 +02:00
351 changed files with 9624 additions and 25131 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -22,7 +22,6 @@
 !s3_scrubber/
 !safekeeper/
 !storage_broker/
 !storage_controller/
 !trace/
 !vendor/postgres-*/
 !workspace_hack/
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -150,7 +150,7 @@ runs:
        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
        # and to keep files on the host to upload them to the database
-        time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"
+        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -10,7 +10,7 @@ inputs:
    required: true
  api_host:
    desctiption: 'Neon API host'
-    default: console-stage.neon.build
+    default: console.stage.neon.tech
 outputs:
  dsn:
    description: 'Created Branch DSN (for main database)'
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -13,7 +13,7 @@ inputs:
    required: true
  api_host:
    desctiption: 'Neon API host'
-    default: console-stage.neon.build
+    default: console.stage.neon.tech
 runs:
  using: "composite"
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -13,7 +13,7 @@ inputs:
    default: 15
  api_host:
    desctiption: 'Neon API host'
-    default: console-stage.neon.build
+    default: console.stage.neon.tech
  provisioner:
    desctiption: 'k8s-pod or k8s-neonvm'
    default: 'k8s-pod'
--- a/.github/actions/neon-project-delete/action.yml
+++ b/.github/actions/neon-project-delete/action.yml
@@ -10,7 +10,7 @@ inputs:
    required: true
  api_host:
    desctiption: 'Neon API host'
-    default: console-stage.neon.build
+    default: console.stage.neon.tech
 runs:
  using: "composite"
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -18,7 +18,6 @@ on:
 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
  cancel-in-progress: false
 env:
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -147,16 +147,15 @@ jobs:
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
+          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
-                      { "platform": "neon-captest-new",              "db_size": "50gb" },
+                      { "platform": "neon-captest-new",        "db_size": "50gb" },
-                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
-                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
+                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
        }'
        if [ "$(date +%A)" = "Saturday" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
+                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi
        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -172,7 +171,7 @@ jobs:
        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                     { "platform": "rds-aurora"   }]')
+                                                   { "platform": "rds-aurora"   }]')
        fi
        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -191,7 +190,7 @@ jobs:
        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                     { "platform": "rds-aurora",   "scale": "10" }]')
+                                                    { "platform": "rds-aurora",   "scale": "10" }]')
        fi
        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -254,9 +253,6 @@ jobs:
          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
          neonvm-captest-sharding-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
            ;;
          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
@@ -274,15 +270,11 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
          QUERIES+=("SHOW neon.timeline_id")
        fi
-
+        psql ${CONNSTR} -c "${QUERY}"
        for q in "${QUERIES[@]}"; do
          psql ${CONNSTR} -c "${q}"
        done
    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
@@ -409,15 +401,11 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
          QUERIES+=("SHOW neon.timeline_id")
        fi
-
+        psql ${CONNSTR} -c "${QUERY}"
        for q in "${QUERIES[@]}"; do
          psql ${CONNSTR} -c "${q}"
        done
    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
@@ -519,15 +507,11 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
          QUERIES+=("SHOW neon.timeline_id")
        fi
-
+        psql ${CONNSTR} -c "${QUERY}"
        for q in "${QUERIES[@]}"; do
          psql ${CONNSTR} -c "${q}"
        done
    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
@@ -613,15 +597,11 @@ jobs:
        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT
-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
          QUERIES+=("SHOW neon.timeline_id")
        fi
-
+        psql ${CONNSTR} -c "${QUERY}"
        for q in "${QUERIES[@]}"; do
          psql ${CONNSTR} -c "${q}"
        done
    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -21,7 +21,6 @@ defaults:
 concurrency:
  group: build-build-tools-image-${{ inputs.image-tag }}
  cancel-in-progress: false
 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -461,7 +461,6 @@ jobs:
      - name: Pytest regression tests
        uses: ./.github/actions/run-python-test-set
        timeout-minutes: 60
        with:
          build_type: ${{ matrix.build_type }}
          test_selection: regress
@@ -1121,34 +1120,18 @@ jobs:
        run: |
          if [[ "$GITHUB_REF_NAME" == "main" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=false
            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
              -f deployPgSniRouter=false \
              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true
            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f deployStorageController=true \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}}
          elif [[ "$GITHUB_REF_NAME" == "release-proxy" ]]; then
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
              -f deployStorage=false \
              -f deployStorageBroker=false \
              -f deployStorageController=false \
              -f branch=main \
              -f dockerTag=${{needs.tag.outputs.build-tag}} \
              -f deployPreprodRegion=true
            gh workflow --repo neondatabase/aws run deploy-proxy-prod.yml --ref main \
              -f deployPgSniRouter=true \
              -f deployProxy=true \
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -28,9 +28,7 @@ jobs:
      - name: Get build-tools image tag for the current commit
        id: get-build-tools-tag
        env:
-          # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
          # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
          COMMIT_SHA: ${{ github.sha }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          LAST_BUILD_TOOLS_SHA=$(
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -20,7 +20,6 @@ defaults:
 concurrency:
  group: pin-build-tools-image-${{ inputs.from-tag }}
  cancel-in-progress: false
 permissions: {}
--- a/.github/workflows/trigger-e2e-tests.yml
+++ b/.github/workflows/trigger-e2e-tests.yml
@@ -62,14 +62,14 @@ jobs:
  trigger-e2e-tests:
    needs: [ tag ]
-    runs-on: ubuntu-latest
+    runs-on: [ self-hosted, gen3, small ]
    env:
      TAG: ${{ needs.tag.outputs.build-tag }}
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/base:pinned
      options: --init
    steps:
      - name: check if ecr image are present
        env:
          AWS_ACCESS_KEY_ID: ${{ secrets.AWS_ACCESS_KEY_DEV }}
          AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_SECRET_KEY_DEV }}
        run: |
          for REPO in neon compute-tools compute-node-v14 vm-compute-node-v14 compute-node-v15 vm-compute-node-v15 compute-node-v16 vm-compute-node-v16; do
            OUTPUT=$(aws ecr describe-images --repository-name ${REPO} --region eu-central-1 --query "imageDetails[?imageTags[?contains(@, '${TAG}')]]" --output text)
@@ -79,55 +79,41 @@ jobs:
            fi
          done
      - name: Set e2e-platforms
        id: e2e-platforms
        env:
          PR_NUMBER: ${{ github.event.pull_request.number }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Default set of platforms to run e2e tests on
          platforms='["docker", "k8s"]'
          # If the PR changes vendor/, pgxn/ or libs/vm_monitor/ directories, or Dockerfile.compute-node, add k8s-neonvm to the list of platforms.
          # If the workflow run is not a pull request, add k8s-neonvm to the list.
          if [ "$GITHUB_EVENT_NAME" == "pull_request" ]; then
            for f in $(gh api "/repos/${GITHUB_REPOSITORY}/pulls/${PR_NUMBER}/files" --paginate --jq '.[].filename'); do
              case "$f" in
                vendor/*|pgxn/*|libs/vm_monitor/*|Dockerfile.compute-node)
                  platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
                  ;;
                *)
                  # no-op
                  ;;
              esac
            done
          else
            platforms=$(echo "${platforms}" | jq --compact-output '. += ["k8s-neonvm"] | unique')
          fi
          echo "e2e-platforms=${platforms}" | tee -a $GITHUB_OUTPUT
      - name: Set PR's status to pending and request a remote CI test
        env:
          E2E_PLATFORMS: ${{ steps.e2e-platforms.outputs.e2e-platforms }}
          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
          GH_TOKEN: ${{ secrets.CI_ACCESS_TOKEN }}
        run: |
-          REMOTE_REPO="${GITHUB_REPOSITORY_OWNER}/cloud"
+          # For pull requests, GH Actions set "github.sha" variable to point at a fake merge commit
          # but we need to use a real sha of a latest commit in the PR's branch for the e2e job,
          # to place a job run status update later.
          COMMIT_SHA=${{ github.event.pull_request.head.sha }}
          # For non-PR kinds of runs, the above will produce an empty variable, pick the original sha value for those
          COMMIT_SHA=${COMMIT_SHA:-${{ github.sha }}}
-          gh api "/repos/${GITHUB_REPOSITORY}/statuses/${COMMIT_SHA}" \
+          REMOTE_REPO="${{ github.repository_owner }}/cloud"
            --method POST \
            --raw-field "state=pending" \
            --raw-field "description=[$REMOTE_REPO] Remote CI job is about to start" \
            --raw-field "context=neon-cloud-e2e"
-          gh workflow --repo ${REMOTE_REPO} \
+          curl -f -X POST \
-            run testing.yml \
+          https://api.github.com/repos/${{ github.repository }}/statuses/$COMMIT_SHA \
-              --ref "main" \
+          -H "Accept: application/vnd.github.v3+json" \
-              --raw-field "ci_job_name=neon-cloud-e2e" \
+          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
-              --raw-field "commit_hash=$COMMIT_SHA" \
+          --data \
-              --raw-field "remote_repo=${GITHUB_REPOSITORY}" \
+            "{
-              --raw-field "storage_image_tag=${TAG}" \
+              \"state\": \"pending\",
-              --raw-field "compute_image_tag=${TAG}" \
+              \"context\": \"neon-cloud-e2e\",
-              --raw-field "concurrency_group=${E2E_CONCURRENCY_GROUP}" \
+              \"description\": \"[$REMOTE_REPO] Remote CI job is about to start\"
-              --raw-field "e2e-platforms=${E2E_PLATFORMS}"
+            }"
          curl -f -X POST \
          https://api.github.com/repos/$REMOTE_REPO/actions/workflows/testing.yml/dispatches \
          -H "Accept: application/vnd.github.v3+json" \
          --user "${{ secrets.CI_ACCESS_TOKEN }}" \
          --data \
            "{
              \"ref\": \"main\",
              \"inputs\": {
                \"ci_job_name\": \"neon-cloud-e2e\",
                \"commit_hash\": \"$COMMIT_SHA\",
                \"remote_repo\": \"${{ github.repository }}\",
                \"storage_image_tag\": \"${TAG}\",
                \"compute_image_tag\": \"${TAG}\",
                \"concurrency_group\": \"${{ env.E2E_CONCURRENCY_GROUP }}\"
              }
            }"
--- a/5
+++ b/5
@@ -1,13 +1,12 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/storage_controller @neondatabase/storage
+/control_plane/attachment_service @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
-/libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
+/libs/postgres_ffi/ @neondatabase/compute
 /libs/remote_storage/ @neondatabase/storage
 /libs/safekeeper_api/ @neondatabase/safekeepers
 /libs/vm_monitor/ @neondatabase/autoscaling
 /pageserver/ @neondatabase/storage
 /pgxn/ @neondatabase/compute
 /pgxn/neon/ @neondatabase/compute @neondatabase/safekeepers
 /proxy/ @neondatabase/proxy
 /safekeeper/ @neondatabase/safekeepers
 /vendor/ @neondatabase/compute
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = [
    "compute_tools",
    "control_plane",
-    "control_plane/storcon_cli",
+    "control_plane/attachment_service",
    "pageserver",
    "pageserver/compaction",
    "pageserver/ctl",
@@ -12,7 +12,6 @@ members = [
    "proxy",
    "safekeeper",
    "storage_broker",
    "storage_controller",
    "s3_scrubber",
    "workspace_hack",
    "trace",
@@ -44,7 +43,6 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 atomic-take = "1.1.0"
 azure_core = "0.18"
 azure_identity = "0.18"
 azure_storage = "0.18"
@@ -54,12 +52,10 @@ async-stream = "0.3"
 async-trait = "0.1"
 aws-config = { version = "1.1.4", default-features = false, features=["rustls"] }
 aws-sdk-s3 = "1.14"
-aws-sdk-iam = "1.15.0"
+aws-sdk-secretsmanager = { version = "1.14.0" }
 aws-smithy-async = { version = "1.1.4", default-features = false, features=["rt-tokio"] }
 aws-smithy-types = "1.1.4"
 aws-credential-types = "1.1.4"
 aws-sigv4 = { version = "1.2.0", features = ["sign-http"] }
 aws-types = "1.1.7"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -80,7 +76,6 @@ either = "1.8"
 enum-map = "2.4.2"
 enumset = "1.0.12"
 fail = "0.5.0"
 fallible-iterator = "0.2"
 fs2 = "0.4.3"
 futures = "0.3"
 futures-core = "0.3"
@@ -93,12 +88,11 @@ hex = "0.4"
 hex-literal = "0.4"
 hmac = "0.12.1"
 hostname = "0.3.1"
 http = {version = "1.1.0", features = ["std"]}
 http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.13.0"
+hyper-tungstenite = "0.11"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
@@ -106,9 +100,8 @@ jsonwebtoken = "9"
 lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 lz4_flex = "0.11.1"
 md5 = "0.7.0"
 measured = { version = "0.0.21", features=["lasso"] }
 measured-process = { version = "0.0.21" }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -128,7 +121,7 @@ procfs = "0.14"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
+redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
@@ -156,12 +149,11 @@ smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
 svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
-test-context = "0.3"
+test-context = "0.1"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -58,12 +58,6 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
    && mv protoc/include/google /usr/local/include/google \
    && rm -rf protoc.zip protoc
 # s5cmd
 ENV S5CMD_VERSION=2.2.2
 RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
    && chmod +x s5cmd \
    && mv s5cmd /usr/local/bin/s5cmd
 # LLVM
 ENV LLVM_VERSION=17
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
@@ -141,7 +135,7 @@ WORKDIR /home/nonroot
 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.77.0
+ENV RUSTC_VERSION=1.76.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
@@ -155,7 +149,7 @@ RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux
    cargo install --git https://github.com/paritytech/cachepot && \
    cargo install rustfilt && \
    cargo install cargo-hakari && \
-    cargo install cargo-deny --locked && \
+    cargo install cargo-deny && \
    cargo install cargo-hack && \
    cargo install cargo-nextest && \
    rm -rf /home/nonroot/.cargo/registry && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -944,9 +944,6 @@ RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
 COPY --from=postgres-cleanup-layer --chown=postgres /usr/local/pgsql /usr/local
 COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-debug-size-lto/compute_ctl /usr/local/bin/compute_ctl
 # Create remote extension download directory
 RUN mkdir /usr/local/download_extensions && chown -R postgres:postgres /usr/local/download_extensions
 # Install:
 # libreadline8 for psql
 # libicu67, locales for collations (including ICU and plpgsql_check)
--- a/README.md
+++ b/README.md
@@ -238,14 +238,6 @@ If you encounter errors during setting up the initial tenant, it's best to stop
 ## Running tests
 ### Rust unit tests
 We are using [`cargo-nextest`](https://nexte.st/) to run the tests in Github Workflows.
 Some crates do not support running plain `cargo test` anymore, prefer `cargo nextest run` instead.
 You can install `cargo-nextest` with `cargo install cargo-nextest`.
 ### Integration tests
 Ensure your dependencies are installed as described [here](https://github.com/neondatabase/neon#dependency-installation-notes).
 ```sh
--- a/clippy.toml
+++ b/clippy.toml
@@ -2,8 +2,6 @@ disallowed-methods = [
    "tokio::task::block_in_place",
    # Allow this for now, to deny it later once we stop using Handle::block_on completely
    # "tokio::runtime::Handle::block_on",
    # use tokio_epoll_uring_ext instead
    "tokio_epoll_uring::thread_local_system",
 ]
 disallowed-macros = [
--- a/compute_tools/README.md
+++ b/compute_tools/README.md
@@ -32,29 +32,6 @@ compute_ctl -D /var/db/postgres/compute \
            -b /usr/local/bin/postgres
 ```
 ## State Diagram
 Computes can be in various states. Below is a diagram that details how a
 compute moves between states.
 ```mermaid
 %% https://mermaid.js.org/syntax/stateDiagram.html
 stateDiagram-v2
  [*] --> Empty : Compute spawned
  Empty --> ConfigurationPending : Waiting for compute spec
  ConfigurationPending --> Configuration : Received compute spec
  Configuration --> Failed : Failed to configure the compute
  Configuration --> Running : Compute has been configured
  Empty --> Init : Compute spec is immediately available
  Empty --> TerminationPending : Requested termination
  Init --> Failed : Failed to start Postgres
  Init --> Running : Started Postgres
  Running --> TerminationPending : Requested termination
  TerminationPending --> Terminated : Terminated compute
  Failed --> [*] : Compute exited
  Terminated --> [*] : Compute exited
 ```
 ## Tests
 Cargo formatter:
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -818,15 +818,9 @@ impl ComputeNode {
                        Client::connect(zenith_admin_connstr.as_str(), NoTls)
                            .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
                    // Disable forwarding so that users don't get a cloud_admin role
-
+                    client.simple_query("SET neon.forward_ddl = false")?;
-                    let mut func = || {
+                    client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
-                        client.simple_query("SET neon.forward_ddl = false")?;
+                    client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                        client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
                        client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                        Ok::<_, anyhow::Error>(())
                    };
                    func().context("apply_config setup cloud_admin")?;
                    drop(client);
                    // reconnect with connstring with expected name
@@ -838,29 +832,24 @@ impl ComputeNode {
        };
        // Disable DDL forwarding because control plane already knows about these roles/databases.
-        client
+        client.simple_query("SET neon.forward_ddl = false")?;
            .simple_query("SET neon.forward_ddl = false")
            .context("apply_config SET neon.forward_ddl = false")?;
        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
-        create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
+        create_neon_superuser(spec, &mut client)?;
-        cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
+        cleanup_instance(&mut client)?;
-        handle_roles(spec, &mut client).context("apply_config handle_roles")?;
+        handle_roles(spec, &mut client)?;
-        handle_databases(spec, &mut client).context("apply_config handle_databases")?;
+        handle_databases(spec, &mut client)?;
-        handle_role_deletions(spec, connstr.as_str(), &mut client)
+        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
            .context("apply_config handle_role_deletions")?;
        handle_grants(
            spec,
            &mut client,
            connstr.as_str(),
            self.has_feature(ComputeFeature::AnonExtension),
-        )
+        )?;
-        .context("apply_config handle_grants")?;
+        handle_extensions(spec, &mut client)?;
-        handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
+        handle_extension_neon(&mut client)?;
-        handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
+        create_availability_check_data(&mut client)?;
        create_availability_check_data(&mut client)
            .context("apply_config create_availability_check_data")?;
        // 'Close' connection
        drop(client);
@@ -868,7 +857,7 @@ impl ComputeNode {
        // Run migrations separately to not hold up cold starts
        thread::spawn(move || {
            let mut client = Client::connect(connstr.as_str(), NoTls)?;
-            handle_migrations(&mut client).context("apply_config handle_migrations")
+            handle_migrations(&mut client)
        });
        Ok(())
    }
@@ -1273,12 +1262,10 @@ LIMIT 100",
        .await
        .map_err(DownloadError::Other);
-        if download_size.is_ok() {
+        self.ext_download_progress
-            self.ext_download_progress
+            .write()
-                .write()
+            .expect("bad lock")
-                .expect("bad lock")
+            .insert(ext_archive_name.to_string(), (download_start, true));
                .insert(ext_archive_name.to_string(), (download_start, true));
        }
        download_size
    }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -6,8 +6,8 @@ use std::path::Path;
 use anyhow::Result;
 use crate::pg_helpers::escape_conf_value;
-use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
+use crate::pg_helpers::PgOptionsSerialize;
-use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
+use compute_api::spec::{ComputeMode, ComputeSpec};
 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
@@ -17,7 +17,6 @@ pub fn line_in_file(path: &Path, line: &str) -> Result<bool> {
        .write(true)
        .create(true)
        .append(false)
        .truncate(false)
        .open(path)?;
    let buf = io::BufReader::new(&file);
    let mut count: usize = 0;
@@ -92,27 +91,6 @@ pub fn write_postgres_conf(
        }
    }
    if cfg!(target_os = "linux") {
        // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
        // disabled), then the control plane has enabled swap and we should set
        // dynamic_shared_memory_type = 'mmap'.
        //
        // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
        let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
            // ignore any errors - they may be expected to occur under certain situations (e.g. when
            // not running in Linux).
            .unwrap_or_else(|_| String::new());
        if overcommit_memory_contents.trim() == "2" {
            let opt = GenericOption {
                name: "dynamic_shared_memory_type".to_owned(),
                value: Some("mmap".to_owned()),
                vartype: "enum".to_owned(),
            };
            write!(file, "{}", opt.to_pg_setting())?;
        }
    }
    // If there are any extra options in the 'settings' field, append those
    if spec.cluster.settings.is_some() {
        writeln!(file, "# Managed by compute_ctl: begin")?;
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
    format!("'{}'", res)
 }
-pub trait GenericOptionExt {
+trait GenericOptionExt {
    fn to_pg_option(&self) -> String;
    fn to_pg_setting(&self) -> String;
 }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -2,7 +2,7 @@ use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;
-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{anyhow, bail, Result};
 use postgres::config::Config;
 use postgres::{Client, NoTls};
 use reqwest::StatusCode;
@@ -302,9 +302,9 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            RoleAction::Create => {
                // This branch only runs when roles are created through the console, so it is
                // safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
-                // from neon_superuser.
+                // from neon_superuser. (NOTE: REPLICATION has been removed from here for now).
                let mut query: String = format!(
-                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("running role create query: '{}'", &query);
@@ -698,8 +698,7 @@ pub fn handle_grants(
        // it is important to run this after all grants
        if enable_anon_extension {
-            handle_extension_anon(spec, &db.owner, &mut db_client, false)
+            handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
                .context("handle_grants handle_extension_anon")?;
        }
    }
@@ -746,12 +745,7 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
    // - extension was already installed and is up to date
    let query = "ALTER EXTENSION neon UPDATE";
    info!("update neon extension version with query: {}", query);
-    if let Err(e) = client.simple_query(query) {
+    client.simple_query(query)?;
        error!(
            "failed to upgrade neon extension during `handle_extension_neon`: {}",
            e
        );
    }
    Ok(())
 }
@@ -810,40 +804,43 @@ $$;"#,
        "",
        "",
        "",
        "",
        // Add new migrations below.
        r#"
 DO $$
 DECLARE
    role_name TEXT;
 BEGIN
    FOR role_name IN SELECT rolname FROM pg_roles WHERE rolreplication IS TRUE
    LOOP
        RAISE NOTICE 'EXECUTING ALTER ROLE % NOREPLICATION', quote_ident(role_name);
        EXECUTE 'ALTER ROLE ' || quote_ident(role_name) || ' NOREPLICATION';
    END LOOP;
 END
 $$;"#,
    ];
-    let mut func = || {
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+    client.simple_query(query)?;
        client.simple_query(query)?;
-        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-        client.simple_query(query)?;
+    client.simple_query(query)?;
-        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-        client.simple_query(query)?;
+    client.simple_query(query)?;
-        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-        client.simple_query(query)?;
+    client.simple_query(query)?;
-        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-        client.simple_query(query)?;
+    client.simple_query(query)?;
        Ok::<_, anyhow::Error>(())
    };
    func().context("handle_migrations prepare")?;
-    let query = "SELECT id FROM neon_migration.migration_id";
+    query = "SELECT id FROM neon_migration.migration_id";
-    let row = client
+    let row = client.query_one(query, &[])?;
        .query_one(query, &[])
        .context("handle_migrations get migration_id")?;
    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
    let starting_migration_id = current_migration;
-    let query = "BEGIN";
+    query = "BEGIN";
-    client
+    client.simple_query(query)?;
        .simple_query(query)
        .context("handle_migrations begin")?;
    while current_migration < migrations.len() {
        let migration = &migrations[current_migration];
@@ -851,9 +848,7 @@ $$;"#,
            info!("Skip migration id={}", current_migration);
        } else {
            info!("Running migration:\n{}\n", migration);
-            client.simple_query(migration).with_context(|| {
+            client.simple_query(migration)?;
                format!("handle_migrations current_migration={}", current_migration)
            })?;
        }
        current_migration += 1;
    }
@@ -861,14 +856,10 @@ $$;"#,
        "UPDATE neon_migration.migration_id SET id={}",
        migrations.len()
    );
-    client
+    client.simple_query(&setval)?;
        .simple_query(&setval)
        .context("handle_migrations update id")?;
-    let query = "COMMIT";
+    query = "COMMIT";
-    client
+    client.simple_query(query)?;
        .simple_query(query)
        .context("handle_migrations commit")?;
    info!(
        "Ran {} migrations",
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -12,7 +12,6 @@ clap.workspace = true
 comfy-table.workspace = true
 futures.workspace = true
 git-version.workspace = true
 humantime.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "storage_controller"
+name = "attachment_service"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
@@ -16,37 +16,31 @@ testing = []
 [dependencies]
 anyhow.workspace = true
 aws-config.workspace = true
-bytes.workspace = true
+aws-sdk-secretsmanager.workspace = true
 camino.workspace = true
 clap.workspace = true
 fail.workspace = true
 futures.workspace = true
 git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
 itertools.workspace = true
 lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
 reqwest.workspace = true
 routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 measured.workspace = true
 diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }
-utils = { path = "../libs/utils/" }
+utils = { path = "../../libs/utils/" }
-metrics = { path = "../libs/metrics/" }
+metrics = { path = "../../libs/metrics/" }
-control_plane = { path = "../control_plane" }
+control_plane = { path = ".." }
-workspace_hack = { version = "0.1", path = "../workspace_hack" }
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
--- a/control_plane/attachment_service/migrations/.keep
+++ b/control_plane/attachment_service/migrations/.keep
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
--- a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
--- a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
--- a/control_plane/attachment_service/src/auth.rs
+++ b/control_plane/attachment_service/src/auth.rs
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -1,4 +1,3 @@
 use std::sync::Arc;
 use std::{collections::HashMap, time::Duration};
 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
@@ -15,32 +14,19 @@ use utils::{
 use crate::service::Config;
 const BUSY_DELAY: Duration = Duration::from_secs(1);
 const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
 const NOTIFY_REQUEST_TIMEOUT: Duration = Duration::from_secs(10);
 pub(crate) const API_CONCURRENCY: usize = 32;
 struct UnshardedComputeHookTenant {
    // Which node is this tenant attached to
    node_id: NodeId,
    // Must hold this lock to send a notification.
    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
 }
 struct ShardedComputeHookTenant {
    stripe_size: ShardStripeSize,
    shard_count: ShardCount,
    shards: Vec<(ShardNumber, NodeId)>,
    // Must hold this lock to send a notification.  The contents represent
    // the last successfully sent notification, and are used to coalesce multiple
    // updates by only sending when there is a change since our last successful send.
    send_lock: Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>,
 }
 enum ComputeHookTenant {
-    Unsharded(UnshardedComputeHookTenant),
+    Unsharded(NodeId),
    Sharded(ShardedComputeHookTenant),
 }
@@ -52,20 +38,9 @@ impl ComputeHookTenant {
                shards: vec![(tenant_shard_id.shard_number, node_id)],
                stripe_size,
                shard_count: tenant_shard_id.shard_count,
                send_lock: Arc::default(),
            })
        } else {
-            Self::Unsharded(UnshardedComputeHookTenant {
+            Self::Unsharded(node_id)
                node_id,
                send_lock: Arc::default(),
            })
        }
    }
    fn get_send_lock(&self) -> &Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>> {
        match self {
            Self::Unsharded(unsharded_tenant) => &unsharded_tenant.send_lock,
            Self::Sharded(sharded_tenant) => &sharded_tenant.send_lock,
        }
    }
@@ -78,8 +53,8 @@ impl ComputeHookTenant {
        node_id: NodeId,
    ) {
        match self {
-            Self::Unsharded(unsharded_tenant) if tenant_shard_id.shard_count.count() == 1 => {
+            Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
-                unsharded_tenant.node_id = node_id
+                *existing_node_id = node_id
            }
            Self::Sharded(sharded_tenant)
                if sharded_tenant.stripe_size == stripe_size
@@ -106,14 +81,14 @@ impl ComputeHookTenant {
    }
 }
-#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug)]
 struct ComputeHookNotifyRequestShard {
    node_id: NodeId,
    shard_number: ShardNumber,
 }
 /// Request body that we send to the control plane to notify it of where a tenant is attached
-#[derive(Serialize, Deserialize, Debug, Eq, PartialEq)]
+#[derive(Serialize, Deserialize, Debug)]
 struct ComputeHookNotifyRequest {
    tenant_id: TenantId,
    stripe_size: Option<ShardStripeSize>,
@@ -146,44 +121,14 @@ pub(crate) enum NotifyError {
    Fatal(StatusCode),
 }
 enum MaybeSendResult {
    // Please send this request while holding the lock, and if you succeed then write
    // the request into the lock.
    Transmit(
        (
            ComputeHookNotifyRequest,
            tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>,
        ),
    ),
    // Something requires sending, but you must wait for a current sender then call again
    AwaitLock(Arc<tokio::sync::Mutex<Option<ComputeHookNotifyRequest>>>),
    // Nothing requires sending
    Noop,
 }
 impl ComputeHookTenant {
-    fn maybe_send(
+    fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
-        &self,
+        match self {
-        tenant_id: TenantId,
+            Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
        lock: Option<tokio::sync::OwnedMutexGuard<Option<ComputeHookNotifyRequest>>>,
    ) -> MaybeSendResult {
        let locked = match lock {
            Some(already_locked) => already_locked,
            None => {
                // Lock order: this _must_ be only a try_lock, because we are called inside of the [`ComputeHook::state`] lock.
                let Ok(locked) = self.get_send_lock().clone().try_lock_owned() else {
                    return MaybeSendResult::AwaitLock(self.get_send_lock().clone());
                };
                locked
            }
        };
        let request = match self {
            Self::Unsharded(unsharded_tenant) => Some(ComputeHookNotifyRequest {
                tenant_id,
                shards: vec![ComputeHookNotifyRequestShard {
                    shard_number: ShardNumber(0),
-                    node_id: unsharded_tenant.node_id,
+                    node_id: *node_id,
                }],
                stripe_size: None,
            }),
@@ -207,25 +152,12 @@ impl ComputeHookTenant {
                // Sharded tenant doesn't yet have information for all its shards
                tracing::info!(
-                    "ComputeHookTenant::maybe_send: not enough shards ({}/{})",
+                    "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
                    sharded_tenant.shards.len(),
                    sharded_tenant.shard_count.count()
                );
                None
            }
        };
        match request {
            None => {
                // Not yet ready to emit a notification
                tracing::info!("Tenant isn't yet ready to emit a notification");
                MaybeSendResult::Noop
            }
            Some(request) if Some(&request) == locked.as_ref() => {
                // No change from the last value successfully sent
                MaybeSendResult::Noop
            }
            Some(request) => MaybeSendResult::Transmit((request, locked)),
        }
    }
 }
@@ -235,19 +167,8 @@ impl ComputeHookTenant {
 /// the compute connection string.
 pub(super) struct ComputeHook {
    config: Config,
-    state: std::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
+    state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
    authorization_header: Option<String>,
    // Concurrency limiter, so that we do not overload the cloud control plane when updating
    // large numbers of tenants (e.g. when failing over after a node failure)
    api_concurrency: tokio::sync::Semaphore,
    // This lock is only used in testing environments, to serialize calls into neon_local
    neon_local_lock: tokio::sync::Mutex<()>,
    // We share a client across all notifications to enable connection re-use etc when
    // sending large numbers of notifications
    client: reqwest::Client,
 }
 impl ComputeHook {
@@ -257,30 +178,18 @@ impl ComputeHook {
            .clone()
            .map(|jwt| format!("Bearer {}", jwt));
        let client = reqwest::ClientBuilder::new()
            .timeout(NOTIFY_REQUEST_TIMEOUT)
            .build()
            .expect("Failed to construct HTTP client");
        Self {
            state: Default::default(),
            config,
            authorization_header,
            neon_local_lock: Default::default(),
            api_concurrency: tokio::sync::Semaphore::new(API_CONCURRENCY),
            client,
        }
    }
    /// For test environments: use neon_local's LocalEnv to update compute
    async fn do_notify_local(
        &self,
-        reconfigure_request: &ComputeHookNotifyRequest,
+        reconfigure_request: ComputeHookNotifyRequest,
    ) -> anyhow::Result<()> {
        // neon_local updates are not safe to call concurrently, use a lock to serialize
        // all calls to this function
        let _locked = self.neon_local_lock.lock().await;
        let env = match LocalEnv::load_config() {
            Ok(e) => e,
            Err(e) => {
@@ -297,7 +206,7 @@ impl ComputeHook {
        } = reconfigure_request;
        let compute_pageservers = shards
-            .iter()
+            .into_iter()
            .map(|shard| {
                let ps_conf = env
                    .get_pageserver_conf(shard.node_id)
@@ -309,10 +218,10 @@ impl ComputeHook {
            .collect::<Vec<_>>();
        for (endpoint_name, endpoint) in &cplane.endpoints {
-            if endpoint.tenant_id == *tenant_id && endpoint.status() == EndpointStatus::Running {
+            if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
                endpoint
-                    .reconfigure(compute_pageservers.clone(), *stripe_size)
+                    .reconfigure(compute_pageservers.clone(), stripe_size)
                    .await?;
            }
        }
@@ -322,11 +231,12 @@ impl ComputeHook {
    async fn do_notify_iteration(
        &self,
        client: &reqwest::Client,
        url: &String,
        reconfigure_request: &ComputeHookNotifyRequest,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
-        let req = self.client.request(Method::PUT, url);
+        let req = client.request(Method::PUT, url);
        let req = if let Some(value) = &self.authorization_header {
            req.header(reqwest::header::AUTHORIZATION, value)
        } else {
@@ -370,10 +280,11 @@ impl ComputeHook {
                Err(NotifyError::SlowDown)
            }
            StatusCode::LOCKED => {
-                // We consider this fatal, because it's possible that the operation blocking the control one is
+                // Delay our retry if busy: the usual fast exponential backoff in backoff::retry
-                // also the one that is waiting for this reconcile.  We should let the reconciler calling
+                // is not appropriate
-                // this hook fail, to give control plane a chance to un-lock.
+                tokio::time::timeout(BUSY_DELAY, cancel.cancelled())
-                tracing::info!("Control plane reports tenant is locked, dropping out of notify");
+                    .await
                    .ok();
                Err(NotifyError::Busy)
            }
            StatusCode::SERVICE_UNAVAILABLE
@@ -389,27 +300,13 @@ impl ComputeHook {
    async fn do_notify(
        &self,
        url: &String,
-        reconfigure_request: &ComputeHookNotifyRequest,
+        reconfigure_request: ComputeHookNotifyRequest,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
-        // We hold these semaphore units across all retries, rather than only across each
+        let client = reqwest::Client::new();
        // HTTP request: this is to preserve fairness and avoid a situation where a retry might
        // time out waiting for a semaphore.
        let _units = self
            .api_concurrency
            .acquire()
            .await
            // Interpret closed semaphore as shutdown
            .map_err(|_| NotifyError::ShuttingDown)?;
        backoff::retry(
-            || self.do_notify_iteration(url, reconfigure_request, cancel),
+            || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
-            |e| {
+            |e| matches!(e, NotifyError::Fatal(_) | NotifyError::Unexpected(_)),
                matches!(
                    e,
                    NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
                )
            },
            3,
            10,
            "Send compute notification",
@@ -443,70 +340,42 @@ impl ComputeHook {
        stripe_size: ShardStripeSize,
        cancel: &CancellationToken,
    ) -> Result<(), NotifyError> {
-        let maybe_send_result = {
+        let mut locked = self.state.lock().await;
            let mut state_locked = self.state.lock().unwrap();
-            use std::collections::hash_map::Entry;
+        use std::collections::hash_map::Entry;
-            let tenant = match state_locked.entry(tenant_shard_id.tenant_id) {
+        let tenant = match locked.entry(tenant_shard_id.tenant_id) {
-                Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
+            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
-                    tenant_shard_id,
+                tenant_shard_id,
-                    stripe_size,
+                stripe_size,
-                    node_id,
+                node_id,
-                )),
+            )),
-                Entry::Occupied(e) => {
+            Entry::Occupied(e) => {
-                    let tenant = e.into_mut();
+                let tenant = e.into_mut();
-                    tenant.update(tenant_shard_id, stripe_size, node_id);
+                tenant.update(tenant_shard_id, stripe_size, node_id);
-                    tenant
+                tenant
-                }
+            }
            };
            tenant.maybe_send(tenant_shard_id.tenant_id, None)
        };
-        // Process result: we may get an update to send, or we may have to wait for a lock
+        let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
-        // before trying again.
+        let Some(reconfigure_request) = reconfigure_request else {
-        let (request, mut send_lock_guard) = match maybe_send_result {
+            // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
-            MaybeSendResult::Noop => {
+            // until it does.
-                return Ok(());
+            tracing::info!("Tenant isn't yet ready to emit a notification");
-            }
+            return Ok(());
            MaybeSendResult::AwaitLock(send_lock) => {
                let send_locked = send_lock.lock_owned().await;
                // Lock order: maybe_send is called within the `[Self::state]` lock, and takes the send lock, but here
                // we have acquired the send lock and take `[Self::state]` lock.  This is safe because maybe_send only uses
                // try_lock.
                let state_locked = self.state.lock().unwrap();
                let Some(tenant) = state_locked.get(&tenant_shard_id.tenant_id) else {
                    return Ok(());
                };
                match tenant.maybe_send(tenant_shard_id.tenant_id, Some(send_locked)) {
                    MaybeSendResult::AwaitLock(_) => {
                        unreachable!("We supplied lock guard")
                    }
                    MaybeSendResult::Noop => {
                        return Ok(());
                    }
                    MaybeSendResult::Transmit((request, lock)) => (request, lock),
                }
            }
            MaybeSendResult::Transmit((request, lock)) => (request, lock),
        };
-        let result = if let Some(notify_url) = &self.config.compute_hook_url {
+        if let Some(notify_url) = &self.config.compute_hook_url {
-            self.do_notify(notify_url, &request, cancel).await
+            self.do_notify(notify_url, reconfigure_request, cancel)
                .await
        } else {
-            self.do_notify_local(&request).await.map_err(|e| {
+            self.do_notify_local(reconfigure_request)
-                // This path is for testing only, so munge the error into our prod-style error type.
+                .await
-                tracing::error!("Local notification hook failed: {e}");
+                .map_err(|e| {
-                NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
+                    // This path is for testing only, so munge the error into our prod-style error type.
-            })
+                    tracing::error!("Local notification hook failed: {e}");
-        };
+                    NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
-
+                })
        if result.is_ok() {
            // Before dropping the send lock, stash the request we just sent so that
            // subsequent callers can avoid redundantly re-sending the same thing.
            *send_lock_guard = Some(request);
        }
        result
    }
 }
@@ -530,22 +399,21 @@ pub(crate) mod tests {
            NodeId(1),
        );
-        // An unsharded tenant is always ready to emit a notification, but won't
+        // An unsharded tenant is always ready to emit a notification
-        // send the same one twice
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
-        let send_result = tenant_state.maybe_send(tenant_id, None);
+        assert_eq!(
-        let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
+            tenant_state
-            anyhow::bail!("Wrong send result");
+                .maybe_reconfigure(tenant_id)
-        };
+                .unwrap()
-        assert_eq!(request.shards.len(), 1);
+                .shards
-        assert!(request.stripe_size.is_none());
+                .len(),
-
+            1
-        // Simulate successful send
+        );
-        *guard = Some(request);
+        assert!(tenant_state
-        drop(guard);
+            .maybe_reconfigure(tenant_id)
-
+            .unwrap()
-        // Try asking again: this should be a no-op
+            .stripe_size
-        let send_result = tenant_state.maybe_send(tenant_id, None);
+            .is_none());
        assert!(matches!(send_result, MaybeSendResult::Noop));
        // Writing the first shard of a multi-sharded situation (i.e. in a split)
        // resets the tenant state and puts it in an non-notifying state (need to
@@ -559,10 +427,7 @@ pub(crate) mod tests {
            ShardStripeSize(32768),
            NodeId(1),
        );
-        assert!(matches!(
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
            tenant_state.maybe_send(tenant_id, None),
            MaybeSendResult::Noop
        ));
        // Writing the second shard makes it ready to notify
        tenant_state.update(
@@ -575,16 +440,22 @@ pub(crate) mod tests {
            NodeId(1),
        );
-        let send_result = tenant_state.maybe_send(tenant_id, None);
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
-        let MaybeSendResult::Transmit((request, mut guard)) = send_result else {
+        assert_eq!(
-            anyhow::bail!("Wrong send result");
+            tenant_state
-        };
+                .maybe_reconfigure(tenant_id)
-        assert_eq!(request.shards.len(), 2);
+                .unwrap()
-        assert_eq!(request.stripe_size, Some(ShardStripeSize(32768)));
+                .shards
-
+                .len(),
-        // Simulate successful send
+            2
-        *guard = Some(request);
+        );
-        drop(guard);
+        assert_eq!(
            tenant_state
                .maybe_reconfigure(tenant_id)
                .unwrap()
                .stripe_size,
            Some(ShardStripeSize(32768))
        );
        Ok(())
    }
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -1,14 +1,7 @@
 use crate::metrics::{
    HttpRequestLatencyLabelGroup, HttpRequestStatusLabelGroup, PageserverRequestLabelGroup,
    METRICS_REGISTRY,
 };
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
 use futures::Future;
 use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
 use metrics::{BuildInfo, NeonMetrics};
 use pageserver_api::models::{
    TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
    TenantTimeTravelRequest, TimelineCreateRequest,
@@ -17,11 +10,9 @@ use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
 use tokio_util::sync::CancellationToken;
 use utils::auth::{Scope, SwappableJwtAuth};
 use utils::failpoint_support::failpoints_handler;
 use utils::http::endpoint::{auth_middleware, check_permission_with, request_span};
-use utils::http::request::{must_get_query_param, parse_query_param, parse_request_param};
+use utils::http::request::{must_get_query_param, parse_request_param};
 use utils::id::{TenantId, TimelineId};
 use utils::{
@@ -35,29 +26,22 @@ use utils::{
 };
 use pageserver_api::controller_api::{
-    NodeAvailability, NodeConfigureRequest, NodeRegisterRequest, TenantPolicyRequest,
+    NodeConfigureRequest, NodeRegisterRequest, TenantShardMigrateRequest,
    TenantShardMigrateRequest,
 };
 use pageserver_api::upcall_api::{ReAttachRequest, ValidateRequest};
 use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
 use routerify::Middleware;
 /// State available to HTTP request handlers
 #[derive(Clone)]
 pub struct HttpState {
    service: Arc<crate::service::Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
    neon_metrics: NeonMetrics,
    allowlist_routes: Vec<Uri>,
 }
 impl HttpState {
-    pub fn new(
+    pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
        service: Arc<crate::service::Service>,
        auth: Option<Arc<SwappableJwtAuth>>,
        build_info: BuildInfo,
    ) -> Self {
        let allowlist_routes = ["/status", "/ready", "/metrics"]
            .iter()
            .map(|v| v.parse().unwrap())
@@ -65,7 +49,6 @@ impl HttpState {
        Self {
            service,
            auth,
            neon_metrics: NeonMetrics::new(build_info),
            allowlist_routes,
        }
    }
@@ -263,10 +246,8 @@ async fn handle_tenant_secondary_download(
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);
+    service.tenant_secondary_download(tenant_id).await?;
-
+    json_response(StatusCode::OK, ())
    let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
    json_response(status, progress)
 }
 async fn handle_tenant_delete(
@@ -328,7 +309,7 @@ async fn handle_tenant_timeline_passthrough(
    tracing::info!("Proxying request for tenant {} ({})", tenant_id, path);
    // Find the node that holds shard zero
-    let (node, tenant_shard_id) = service.tenant_shard0_node(tenant_id)?;
+    let (base_url, tenant_shard_id) = service.tenant_shard0_baseurl(tenant_id)?;
    // Callers will always pass an unsharded tenant ID.  Before proxying, we must
    // rewrite this to a shard-aware shard zero ID.
@@ -337,39 +318,12 @@ async fn handle_tenant_timeline_passthrough(
    let tenant_shard_str = format!("{}", tenant_shard_id);
    let path = path.replace(&tenant_str, &tenant_shard_str);
-    let latency = &METRICS_REGISTRY
+    let client = mgmt_api::Client::new(base_url, service.get_config().jwt_token.as_deref());
        .metrics_group
        .storage_controller_passthrough_request_latency;
    // This is a bit awkward. We remove the param from the request
    // and join the words by '_' to get a label for the request.
    let just_path = path.replace(&tenant_shard_str, "");
    let path_label = just_path
        .split('/')
        .filter(|token| !token.is_empty())
        .collect::<Vec<_>>()
        .join("_");
    let labels = PageserverRequestLabelGroup {
        pageserver_id: &node.get_id().to_string(),
        path: &path_label,
        method: crate::metrics::Method::Get,
    };
    let _timer = latency.start_timer(labels.clone());
    let client = mgmt_api::Client::new(node.base_url(), service.get_config().jwt_token.as_deref());
    let resp = client.get_raw(path).await.map_err(|_e|
        // FIXME: give ApiError a proper Unavailable variant.  We return 503 here because
        // if we can't successfully send a request to the pageserver, we aren't available.
        ApiError::ShuttingDown)?;
    if !resp.status().is_success() {
        let error_counter = &METRICS_REGISTRY
            .metrics_group
            .storage_controller_passthrough_request_error;
        error_counter.inc(labels);
    }
    // We have a reqwest::Response, would like a http::Response
    let mut builder = hyper::Response::builder()
        .status(resp.status())
@@ -395,25 +349,6 @@ async fn handle_tenant_locate(
    json_response(StatusCode::OK, service.tenant_locate(tenant_id)?)
 }
 /// HTTP handler: describe a single tenant.
 ///
 /// Checks the request against `Scope::Admin`, parses the `tenant_id` request parameter and
 /// responds `200 OK` with the JSON-serialized result of `Service::tenant_describe`.
 /// Failures of the permission check, parameter parsing or the service call are propagated
 /// to the caller as `ApiError`.
 async fn handle_tenant_describe(
    service: Arc<Service>,
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }
 /// HTTP handler: list tenants.
 ///
 /// Checks the request against `Scope::Admin`, then responds `200 OK` with the
 /// JSON-serialized result of `Service::tenant_list`.
 async fn handle_tenant_list(
    service: Arc<Service>,
    req: Request<Body>,
 ) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
    json_response(StatusCode::OK, service.tenant_list())
 }
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
@@ -427,10 +362,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
    check_permissions(&req, Scope::Admin)?;
    let state = get_state(&req);
-    let nodes = state.service.node_list().await?;
+    json_response(StatusCode::OK, state.service.node_list().await?)
    let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
    json_response(StatusCode::OK, api_nodes)
 }
 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -455,14 +387,7 @@ async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>,
    json_response(
        StatusCode::OK,
-        state
+        state.service.node_configure(config_req).await?,
            .service
            .node_configure(
                config_req.node_id,
                config_req.availability.map(NodeAvailability::from),
                config_req.scheduling,
            )
            .await?,
    )
 }
@@ -497,22 +422,6 @@ async fn handle_tenant_shard_migrate(
    )
 }
 /// HTTP handler: update the policy of a tenant.
 ///
 /// Checks the request against `Scope::Admin`, parses the `tenant_id` request parameter and
 /// deserializes the JSON body as a `TenantPolicyRequest`, then forwards both to
 /// `Service::tenant_update_policy` and responds `200 OK` with its JSON-serialized result.
 async fn handle_tenant_update_policy(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    // The body is consumed here, so the request must be borrowed mutably.
    let update_req = json_request::<TenantPolicyRequest>(&mut req).await?;
    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state
            .service
            .tenant_update_policy(tenant_id, update_req)
            .await?,
    )
 }
 async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;
@@ -522,18 +431,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
    json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }
 /// HTTP handler: import a tenant.
 ///
 /// Parses the `tenant_id` request parameter, checks the request against
 /// `Scope::PageServerApi`, then calls `Service::tenant_import` and responds `200 OK` with
 /// its JSON-serialized result.
 async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    // NOTE(review): the parameter is parsed before the permission check, so an unauthorized
    // caller with a malformed ID sees the parse error rather than the auth error.
    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
    check_permissions(&req, Scope::PageServerApi)?;
    let state = get_state(&req);
    json_response(
        StatusCode::OK,
        state.service.tenant_import(tenant_id).await?,
    )
 }
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
@@ -556,14 +453,6 @@ async fn handle_consistency_check(req: Request<Body>) -> Result<Response<Body>,
    json_response(StatusCode::OK, state.service.consistency_check().await?)
 }
 /// HTTP handler: trigger `Service::reconcile_all_now`.
 ///
 /// Checks the request against `Scope::Admin`, awaits the service call and responds
 /// `200 OK` with its JSON-serialized result.
 async fn handle_reconcile_all(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;
    let state = get_state(&req);
    json_response(StatusCode::OK, state.service.reconcile_all_now().await?)
 }
 /// Status endpoint is just used for checking that our HTTP listener is up
 async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(StatusCode::OK, ())
@@ -588,11 +477,7 @@ impl From<ReconcileError> for ApiError {
 /// Common wrapper for request handlers that call into Service and will operate on tenants: they must only
 /// be allowed to run if Service has finished its initial reconciliation.
-async fn tenant_service_handler<R, H>(
+async fn tenant_service_handler<R, H>(request: Request<Body>, handler: H) -> R::Output
    request: Request<Body>,
    handler: H,
    request_name: RequestName,
 ) -> R::Output
 where
    R: std::future::Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
    H: FnOnce(Arc<Service>, Request<Body>) -> R + Send + Sync + 'static,
@@ -612,122 +497,24 @@ where
        ));
    }
-    named_request_span(
+    request_span(
        request,
        |request| async move { handler(service, request).await },
        request_name,
    )
    .await
 }
 /// Check if the required scope is held in the request's token, or if the request has
 /// a token with 'admin' scope then always permit it.
 fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
    check_permission_with(request, |claims| {
-        match crate::auth::check_permission(claims, required_scope) {
+        crate::auth::check_permission(claims, required_scope)
            Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
                Ok(()) => Ok(()),
                Err(_) => Err(e),
            },
            Ok(()) => Ok(()),
        }
    })
 }
 /// Per-request bookkeeping stored in the request context by `prologue_metrics_middleware`
 /// and read back by `epilogue_metrics_middleware`.
 #[derive(Clone, Debug)]
 struct RequestMeta {
    /// HTTP method of the incoming request.
    method: hyper::http::Method,
    /// Instant captured when the prologue middleware ran; used to compute request latency.
    at: Instant,
 }
 /// Pre-request middleware that records the request's method and the current time.
 ///
 /// The values are stored in the request context as a `RequestMeta`; the request itself is
 /// passed through unchanged.
 fn prologue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
 ) -> Middleware<B, ApiError> {
    Middleware::pre(move |req| async move {
        let meta = RequestMeta {
            method: req.method().clone(),
            at: Instant::now(),
        };
        req.set_context(meta);
        Ok(req)
    })
 }
 /// Post-request middleware that records HTTP status and latency metrics.
 ///
 /// Metrics are only recorded when the request context holds both a `RequestName` and a
 /// `RequestMeta`. The response is always returned unchanged.
 fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>(
 ) -> Middleware<B, ApiError> {
    Middleware::post_with_info(move |resp, req_info| async move {
        // Requests that were never given a name are not measured.
        let request_name = match req_info.context::<RequestName>() {
            Some(name) => name,
            None => {
                return Ok(resp);
            }
        };
        if let Some(meta) = req_info.context::<RequestMeta>() {
            let status = &crate::metrics::METRICS_REGISTRY
                .metrics_group
                .storage_controller_http_request_status;
            let latency = &crate::metrics::METRICS_REGISTRY
                .metrics_group
                .storage_controller_http_request_latency;
            // Count the response, labelled by request name, method and status code.
            status.inc(HttpRequestStatusLabelGroup {
                path: request_name.0,
                method: meta.method.clone().into(),
                status: crate::metrics::StatusCode(resp.status()),
            });
            // Observe the seconds elapsed since the prologue middleware captured `meta.at`.
            latency.observe(
                HttpRequestLatencyLabelGroup {
                    path: request_name.0,
                    method: meta.method.into(),
                },
                meta.at.elapsed().as_secs_f64(),
            );
        }
        Ok(resp)
    })
 }
 /// HTTP handler that serves the encoded contents of `METRICS_REGISTRY`.
 ///
 /// The payload is produced by `METRICS_REGISTRY.encode`, which is passed the
 /// `neon_metrics` from the HTTP state, and is returned with status 200 and the
 /// `text/plain; version=0.0.4` content type.
 pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";
    let state = get_state(&req);
    let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
    // NOTE(review): `unwrap` panics if the builder rejects the status/header; both are
    // constants here, so this is presumably infallible; confirm.
    let response = Response::builder()
        .status(200)
        .header(CONTENT_TYPE, TEXT_FORMAT)
        .body(payload.into())
        .unwrap();
    Ok(response)
 }
 /// Static name attached to a request via its context; `epilogue_metrics_middleware` uses it
 /// as the `path` label of the HTTP request metrics.
 #[derive(Clone)]
 struct RequestName(&'static str);
 /// Wrapper around `request_span` that first tags the request with a `RequestName`.
 ///
 /// The name is stored in the request context before the request and handler are handed to
 /// `request_span`; the output of `request_span` is returned as-is.
 async fn named_request_span<R, H>(
    request: Request<Body>,
    handler: H,
    name: RequestName,
 ) -> R::Output
 where
    R: Future<Output = Result<Response<Body>, ApiError>> + Send + 'static,
    H: FnOnce(Request<Body>) -> R + Send + Sync + 'static,
 {
    request.set_context(name);
    request_span(request, handler).await
 }
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
    build_info: BuildInfo,
 ) -> RouterBuilder<hyper::Body, ApiError> {
-    let mut router = endpoint::make_router()
+    let mut router = endpoint::make_router();
        .middleware(prologue_metrics_middleware())
        .middleware(epilogue_metrics_middleware());
    if auth.is_some() {
        router = router.middleware(auth_middleware(|request| {
            let state = get_state(request);
@@ -736,186 +523,93 @@ pub fn make_router(
            } else {
                state.auth.as_deref()
            }
-        }));
+        }))
    }
    router
-        .data(Arc::new(HttpState::new(service, auth, build_info)))
+        .data(Arc::new(HttpState::new(service, auth)))
        .get("/metrics", |r| {
            named_request_span(r, measured_metrics_handler, RequestName("metrics"))
        })
        // Non-prefixed generic endpoints (status, metrics)
-        .get("/status", |r| {
+        .get("/status", |r| request_span(r, handle_status))
-            named_request_span(r, handle_status, RequestName("status"))
+        .get("/ready", |r| request_span(r, handle_ready))
        })
        .get("/ready", |r| {
            named_request_span(r, handle_ready, RequestName("ready"))
        })
        // Upcalls for the pageserver: point the pageserver's `control_plane_api` config to this prefix
        .post("/upcall/v1/re-attach", |r| {
-            named_request_span(r, handle_re_attach, RequestName("upcall_v1_reattach"))
+            request_span(r, handle_re_attach)
        })
        .post("/upcall/v1/validate", |r| {
            named_request_span(r, handle_validate, RequestName("upcall_v1_validate"))
        })
        .post("/upcall/v1/validate", |r| request_span(r, handle_validate))
        // Test/dev/debug endpoints
        .post("/debug/v1/attach-hook", |r| {
-            named_request_span(r, handle_attach_hook, RequestName("debug_v1_attach_hook"))
+            request_span(r, handle_attach_hook)
        })
        .post("/debug/v1/inspect", |r| {
            named_request_span(r, handle_inspect, RequestName("debug_v1_inspect"))
        })
        .post("/debug/v1/inspect", |r| request_span(r, handle_inspect))
        .post("/debug/v1/tenant/:tenant_id/drop", |r| {
-            named_request_span(r, handle_tenant_drop, RequestName("debug_v1_tenant_drop"))
+            request_span(r, handle_tenant_drop)
        })
        .post("/debug/v1/node/:node_id/drop", |r| {
-            named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
+            request_span(r, handle_node_drop)
        })
        .post("/debug/v1/tenant/:tenant_id/import", |r| {
            named_request_span(
                r,
                handle_tenant_import,
                RequestName("debug_v1_tenant_import"),
            )
        })
        .get("/debug/v1/tenant", |r| {
            named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
        })
        .get("/debug/v1/tenant/:tenant_id/locate", |r| {
            tenant_service_handler(
                r,
                handle_tenant_locate,
                RequestName("debug_v1_tenant_locate"),
            )
        })
        .get("/debug/v1/tenant", |r| request_span(r, handle_tenants_dump))
        .get("/debug/v1/scheduler", |r| {
-            named_request_span(r, handle_scheduler_dump, RequestName("debug_v1_scheduler"))
+            request_span(r, handle_scheduler_dump)
        })
        .post("/debug/v1/consistency_check", |r| {
-            named_request_span(
+            request_span(r, handle_consistency_check)
                r,
                handle_consistency_check,
                RequestName("debug_v1_consistency_check"),
            )
        })
-        .post("/debug/v1/reconcile_all", |r| {
+        .get("/control/v1/tenant/:tenant_id/locate", |r| {
-            request_span(r, handle_reconcile_all)
+            tenant_service_handler(r, handle_tenant_locate)
        })
        .put("/debug/v1/failpoints", |r| {
            request_span(r, |r| failpoints_handler(r, CancellationToken::new()))
        })
        // Node operations
        .post("/control/v1/node", |r| {
-            named_request_span(r, handle_node_register, RequestName("control_v1_node"))
+            request_span(r, handle_node_register)
        })
        .get("/control/v1/node", |r| {
            named_request_span(r, handle_node_list, RequestName("control_v1_node"))
        })
        .get("/control/v1/node", |r| request_span(r, handle_node_list))
        .put("/control/v1/node/:node_id/config", |r| {
-            named_request_span(
+            request_span(r, handle_node_configure)
                r,
                handle_node_configure,
                RequestName("control_v1_node_config"),
            )
        })
        // Tenant Shard operations
        .put("/control/v1/tenant/:tenant_shard_id/migrate", |r| {
-            tenant_service_handler(
+            tenant_service_handler(r, handle_tenant_shard_migrate)
                r,
                handle_tenant_shard_migrate,
                RequestName("control_v1_tenant_migrate"),
            )
        })
        .put("/control/v1/tenant/:tenant_id/shard_split", |r| {
-            tenant_service_handler(
+            tenant_service_handler(r, handle_tenant_shard_split)
                r,
                handle_tenant_shard_split,
                RequestName("control_v1_tenant_shard_split"),
            )
        })
        .get("/control/v1/tenant/:tenant_id", |r| {
            tenant_service_handler(
                r,
                handle_tenant_describe,
                RequestName("control_v1_tenant_describe"),
            )
        })
        .get("/control/v1/tenant", |r| {
            tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
        })
        .put("/control/v1/tenant/:tenant_id/policy", |r| {
            named_request_span(
                r,
                handle_tenant_update_policy,
                RequestName("control_v1_tenant_policy"),
            )
        })
        // Tenant operations
        // The ^/v1/ endpoints act as a "Virtual Pageserver", enabling shard-naive clients to call into
        // this service to manage tenants that actually consist of many tenant shards, as if they are a single entity.
        .post("/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_create, RequestName("v1_tenant"))
+            tenant_service_handler(r, handle_tenant_create)
        })
        .delete("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(r, handle_tenant_delete, RequestName("v1_tenant"))
+            tenant_service_handler(r, handle_tenant_delete)
        })
        .put("/v1/tenant/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_set, RequestName("v1_tenant_config"))
+            tenant_service_handler(r, handle_tenant_config_set)
        })
        .get("/v1/tenant/:tenant_id/config", |r| {
-            tenant_service_handler(r, handle_tenant_config_get, RequestName("v1_tenant_config"))
+            tenant_service_handler(r, handle_tenant_config_get)
        })
        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
-            tenant_service_handler(
+            tenant_service_handler(r, handle_tenant_location_config)
                r,
                handle_tenant_location_config,
                RequestName("v1_tenant_location_config"),
            )
        })
        .put("/v1/tenant/:tenant_id/time_travel_remote_storage", |r| {
-            tenant_service_handler(
+            tenant_service_handler(r, handle_tenant_time_travel_remote_storage)
                r,
                handle_tenant_time_travel_remote_storage,
                RequestName("v1_tenant_time_travel_remote_storage"),
            )
        })
        .post("/v1/tenant/:tenant_id/secondary/download", |r| {
-            tenant_service_handler(
+            tenant_service_handler(r, handle_tenant_secondary_download)
                r,
                handle_tenant_secondary_download,
                RequestName("v1_tenant_secondary_download"),
            )
        })
        // Timeline operations
        .delete("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
-            tenant_service_handler(
+            tenant_service_handler(r, handle_tenant_timeline_delete)
                r,
                handle_tenant_timeline_delete,
                RequestName("v1_tenant_timeline"),
            )
        })
        .post("/v1/tenant/:tenant_id/timeline", |r| {
-            tenant_service_handler(
+            tenant_service_handler(r, handle_tenant_timeline_create)
                r,
                handle_tenant_timeline_create,
                RequestName("v1_tenant_timeline"),
            )
        })
        // Tenant detail GET passthrough to shard zero
        .get("/v1/tenant/:tenant_id", |r| {
-            tenant_service_handler(
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
                r,
                handle_tenant_timeline_passthrough,
                RequestName("v1_tenant_passthrough"),
            )
        })
        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
        // timeline GET APIs will be implicitly included.
        .get("/v1/tenant/:tenant_id/timeline*", |r| {
-            tenant_service_handler(
+            tenant_service_handler(r, handle_tenant_timeline_passthrough)
                r,
                handle_tenant_timeline_passthrough,
                RequestName("v1_tenant_timeline_passthrough"),
            )
        })
 }
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -3,18 +3,15 @@ use utils::seqwait::MonotonicCounter;
 mod auth;
 mod compute_hook;
 mod heartbeater;
 pub mod http;
 mod id_lock_map;
 pub mod metrics;
 mod node;
 mod pageserver_client;
 pub mod persistence;
 mod reconciler;
 mod scheduler;
 mod schema;
 pub mod service;
-mod tenant_shard;
+mod tenant_state;
 #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
 struct Sequence(u64);
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -1,20 +1,19 @@
 use anyhow::{anyhow, Context};
 use attachment_service::http::make_router;
 use attachment_service::metrics::preinitialize_metrics;
 use attachment_service::persistence::Persistence;
 use attachment_service::service::{Config, Service};
 use aws_config::{BehaviorVersion, Region};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use metrics::BuildInfo;
 use std::sync::Arc;
 use storage_controller::http::make_router;
 use storage_controller::metrics::preinitialize_metrics;
 use storage_controller::persistence::Persistence;
 use storage_controller::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};
 use utils::sentry_init::init_sentry;
 use utils::{project_build_tag, project_git_version, tcp_listener};
 project_git_version!(GIT_VERSION);
@@ -52,33 +51,9 @@ struct Cli {
    #[arg(short, long)]
    path: Option<Utf8PathBuf>,
-    /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
+    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
    #[arg(long)]
    database_url: Option<String>,
    /// Flag to enable dev mode, which permits running without auth
    #[arg(long, default_value = "false")]
    dev: bool,
    /// Grace period before marking unresponsive pageserver offline
    #[arg(long)]
    max_unavailable_interval: Option<humantime::Duration>,
 }
 /// Selects how missing secrets are treated.
 enum StrictMode {
    /// Every secret must be loaded: security features may not be implicitly switched off
    /// by leaving a secret out of the environment.
    Strict,
    /// Secrets are optional: leaving one out implicitly disables the auth tied to it
    /// (e.g. no pageserver jwt key -> send unauthenticated requests, no public key ->
    /// don't authenticate incoming requests).
    Dev,
 }
 /// The default mode is `Strict`.
 impl Default for StrictMode {
    fn default() -> Self {
        StrictMode::Strict
    }
 }
 /// Secrets may either be provided on the command line (for testing), or loaded from AWS SecretManager: this
@@ -91,6 +66,13 @@ struct Secrets {
 }
 impl Secrets {
    const DATABASE_URL_SECRET: &'static str = "rds-neon-storage-controller-url";
    const PAGESERVER_JWT_TOKEN_SECRET: &'static str =
        "neon-storage-controller-pageserver-jwt-token";
    const CONTROL_PLANE_JWT_TOKEN_SECRET: &'static str =
        "neon-storage-controller-control-plane-jwt-token";
    const PUBLIC_KEY_SECRET: &'static str = "neon-storage-controller-public-key";
    const DATABASE_URL_ENV: &'static str = "DATABASE_URL";
    const PAGESERVER_JWT_TOKEN_ENV: &'static str = "PAGESERVER_JWT_TOKEN";
    const CONTROL_PLANE_JWT_TOKEN_ENV: &'static str = "CONTROL_PLANE_JWT_TOKEN";
@@ -101,41 +83,111 @@ impl Secrets {
    /// - Environment variables if DATABASE_URL is set.
    /// - AWS Secrets Manager secrets
    async fn load(args: &Cli) -> anyhow::Result<Self> {
-        let Some(database_url) =
+        match &args.database_url {
-            Self::load_secret(&args.database_url, Self::DATABASE_URL_ENV).await
+            Some(url) => Self::load_cli(url, args),
-        else {
+            None => match std::env::var(Self::DATABASE_URL_ENV) {
-            anyhow::bail!(
+                Ok(database_url) => Self::load_env(database_url),
-                "Database URL is not set (set `--database-url`, or `DATABASE_URL` environment)"
+                Err(_) => Self::load_aws_sm().await,
-            )
+            },
-        };
+        }
        let public_key = match Self::load_secret(&args.public_key, Self::PUBLIC_KEY_ENV).await {
            Some(v) => Some(JwtAuth::from_key(v).context("Loading public key")?),
            None => None,
        };
        let this = Self {
            database_url,
            public_key,
            jwt_token: Self::load_secret(&args.jwt_token, Self::PAGESERVER_JWT_TOKEN_ENV).await,
            control_plane_jwt_token: Self::load_secret(
                &args.control_plane_jwt_token,
                Self::CONTROL_PLANE_JWT_TOKEN_ENV,
            )
            .await,
        };
        Ok(this)
    }
-    async fn load_secret(cli: &Option<String>, env_name: &str) -> Option<String> {
+    fn load_env(database_url: String) -> anyhow::Result<Self> {
-        if let Some(v) = cli {
+        let public_key = match std::env::var(Self::PUBLIC_KEY_ENV) {
-            Some(v.clone())
+            Ok(public_key) => Some(JwtAuth::from_key(public_key).context("Loading public key")?),
-        } else if let Ok(v) = std::env::var(env_name) {
+            Err(_) => None,
-            Some(v)
+        };
-        } else {
+        Ok(Self {
-            None
+            database_url,
            public_key,
            jwt_token: std::env::var(Self::PAGESERVER_JWT_TOKEN_ENV).ok(),
            control_plane_jwt_token: std::env::var(Self::CONTROL_PLANE_JWT_TOKEN_ENV).ok(),
        })
    }
    /// Load every secret from AWS Secrets Manager.
    ///
    /// The region is taken from the `AWS_REGION` environment variable. The database URL
    /// secret is mandatory. The pageserver JWT token, control plane JWT token and public key
    /// secrets may have no string value; each such case logs a warning and leaves the
    /// corresponding field as `None`.
    ///
    /// # Errors
    ///
    /// Fails if `AWS_REGION` is unset, if any `get_secret_value` request fails, if the
    /// database URL secret has no string value, or if `JwtAuth::from_key` rejects the
    /// public key.
    async fn load_aws_sm() -> anyhow::Result<Self> {
        let Ok(region) = std::env::var("AWS_REGION") else {
            anyhow::bail!("AWS_REGION is not set, cannot load secrets automatically: either set this, or use CLI args to supply secrets");
        };
        let config = aws_config::defaults(BehaviorVersion::v2023_11_09())
            .region(Region::new(region.clone()))
            .load()
            .await;
        let asm = aws_sdk_secretsmanager::Client::new(&config);
        let Some(database_url) = asm
            .get_secret_value()
            .secret_id(Self::DATABASE_URL_SECRET)
            .send()
            .await?
            .secret_string()
            .map(str::to_string)
        else {
            anyhow::bail!(
                "Database URL secret not found at {region}/{}",
                Self::DATABASE_URL_SECRET
            )
        };
        let jwt_token = asm
            .get_secret_value()
            .secret_id(Self::PAGESERVER_JWT_TOKEN_SECRET)
            .send()
            .await?
            .secret_string()
            .map(str::to_string);
        if jwt_token.is_none() {
            tracing::warn!("No pageserver JWT token set: this will only work if authentication is disabled on the pageserver");
        }
        let control_plane_jwt_token = asm
            .get_secret_value()
            .secret_id(Self::CONTROL_PLANE_JWT_TOKEN_SECRET)
            .send()
            .await?
            .secret_string()
            .map(str::to_string);
        // Check the control plane token itself: testing `jwt_token` here would warn about
        // the wrong secret and stay silent when the control plane token is the missing one.
        if control_plane_jwt_token.is_none() {
            tracing::warn!("No control plane JWT token set: this will only work if authentication is disabled on the pageserver");
        }
        let public_key = asm
            .get_secret_value()
            .secret_id(Self::PUBLIC_KEY_SECRET)
            .send()
            .await?
            .secret_string()
            .map(str::to_string);
        let public_key = match public_key {
            Some(key) => Some(JwtAuth::from_key(key)?),
            None => {
                tracing::warn!(
                    "No public key set: inccoming HTTP requests will not be authenticated"
                );
                None
            }
        };
        Ok(Self {
            database_url,
            public_key,
            jwt_token,
            control_plane_jwt_token,
        })
    }
    /// Build the secrets purely from command-line arguments.
    ///
    /// `database_url` is copied into the result and both JWT tokens are cloned from `args`
    /// unchanged. The public key, when present, is parsed with `JwtAuth::from_key`; a parse
    /// failure is returned with the context "Loading public key".
    fn load_cli(database_url: &str, args: &Cli) -> anyhow::Result<Self> {
        let public_key = args
            .public_key
            .as_ref()
            .map(|key| JwtAuth::from_key(key.clone()).context("Loading public key"))
            .transpose()?;

        let jwt_token = args.jwt_token.clone();
        let control_plane_jwt_token = args.control_plane_jwt_token.clone();

        Ok(Self {
            database_url: database_url.to_owned(),
            public_key,
            jwt_token,
            control_plane_jwt_token,
        })
    }
 }
@@ -154,14 +206,6 @@ async fn migration_run(database_url: &str) -> anyhow::Result<()> {
 }
 fn main() -> anyhow::Result<()> {
    let default_panic = std::panic::take_hook();
    std::panic::set_hook(Box::new(move |info| {
        default_panic(info);
        std::process::exit(1);
    }));
    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
    tokio::runtime::Builder::new_current_thread()
        // We use spawn_blocking for database operations, so require approximately
        // as many blocking threads as we will open database connections.
@@ -193,55 +237,12 @@ async fn async_main() -> anyhow::Result<()> {
        args.listen
    );
    let build_info = BuildInfo {
        revision: GIT_VERSION,
        build_tag: BUILD_TAG,
    };
    let strict_mode = if args.dev {
        StrictMode::Dev
    } else {
        StrictMode::Strict
    };
    let secrets = Secrets::load(&args).await?;
    // Validate required secrets and arguments are provided in strict mode
    match strict_mode {
        StrictMode::Strict
            if (secrets.public_key.is_none()
                || secrets.jwt_token.is_none()
                || secrets.control_plane_jwt_token.is_none()) =>
        {
            // Production systems should always have secrets configured: if public_key was not set
            // then we would implicitly disable auth.
            anyhow::bail!(
                    "Insecure config!  One or more secrets is not set.  This is only permitted in `--dev` mode"
                );
        }
        StrictMode::Strict if args.compute_hook_url.is_none() => {
            // Production systems should always have a compute hook set, to prevent falling
            // back to trying to use neon_local.
            anyhow::bail!(
                "`--compute-hook-url` is not set: this is only permitted in `--dev` mode"
            );
        }
        StrictMode::Strict => {
            tracing::info!("Starting in strict mode: configuration is OK.")
        }
        StrictMode::Dev => {
            tracing::warn!("Starting in dev mode: this may be an insecure configuration.")
        }
    }
    let config = Config {
        jwt_token: secrets.jwt_token,
        control_plane_jwt_token: secrets.control_plane_jwt_token,
        compute_hook_url: args.compute_hook_url,
        max_unavailable_interval: args
            .max_unavailable_interval
            .map(humantime::Duration::into)
            .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
    };
    // After loading secrets & config, but before starting anything else, apply database migrations
@@ -259,7 +260,7 @@ async fn async_main() -> anyhow::Result<()> {
    let auth = secrets
        .public_key
        .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
-    let router = make_router(service.clone(), auth, build_info)
+    let router = make_router(service.clone(), auth)
        .build()
        .map_err(|err| anyhow!(err))?;
    let router_service = utils::http::RouterService::new(router).unwrap();
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -0,0 +1,32 @@
 use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
 use once_cell::sync::Lazy;
 /// Counters describing reconcile task activity.
 pub(crate) struct ReconcilerMetrics {
    /// Incremented each time a reconcile task is spawned.
    pub(crate) spawned: IntCounter,
    /// Completed reconcile tasks, labelled by `status` (see the label constants on this type).
    pub(crate) complete: IntCounterVec,
 }
 impl ReconcilerMetrics {
    // Values for the `status` label used on [`Self::complete`].  Each outcome needs its own
    // distinct, self-describing value so that failures are not reported as successes.
    /// Label value for a reconcile task that completed successfully.
    pub(crate) const SUCCESS: &'static str = "ok";
    /// Label value for a reconcile task that failed.
    pub(crate) const ERROR: &'static str = "error";
    /// Label value for a reconcile task that was cancelled.
    pub(crate) const CANCEL: &'static str = "cancel";
 }
 /// Global, lazily-initialized reconciler metrics.
 ///
 /// Both counters are registered on first access; a registration failure panics with
 /// "failed to define a metric".
 pub(crate) static RECONCILER: Lazy<ReconcilerMetrics> = Lazy::new(|| ReconcilerMetrics {
    spawned: register_int_counter!(
        "storage_controller_reconcile_spawn",
        "Count of how many times we spawn a reconcile task",
    )
    .expect("failed to define a metric"),
    // Broken down by a single `status` label.
    complete: register_int_counter_vec!(
        "storage_controller_reconcile_complete",
        "Reconciler tasks completed, broken down by success/failure/cancelled",
        &["status"],
    )
    .expect("failed to define a metric"),
 });
 /// Force initialization of [`RECONCILER`] so its metrics are registered right away rather
 /// than on first use.
 pub fn preinitialize_metrics() {
    Lazy::force(&RECONCILER);
 }
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -3,8 +3,7 @@ use std::{str::FromStr, time::Duration};
 use hyper::StatusCode;
 use pageserver_api::{
    controller_api::{
-        NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
        TenantLocateResponseShard,
    },
    shard::TenantShardId,
 };
@@ -13,9 +12,7 @@ use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, id::NodeId};
-use crate::{
+use crate::persistence::NodePersistence;
    pageserver_client::PageserverClient, persistence::NodePersistence, scheduler::MaySchedule,
 };
 /// Represents the in-memory description of a Node.
 ///
@@ -86,38 +83,29 @@ impl Node {
        }
    }
-    pub(crate) fn set_availability(&mut self, availability: NodeAvailability) {
+    pub(crate) fn set_availability(
-        match self.get_availability_transition(availability) {
+        &mut self,
-            AvailabilityTransition::ToActive => {
+        availability: NodeAvailability,
    ) -> AvailabilityTransition {
        use NodeAvailability::*;
        let transition = match (self.availability, availability) {
            (Offline, Active) => {
                // Give the node a new cancellation token, effectively resetting it to un-cancelled.  Any
                // users of previously-cloned copies of the node will still see the old cancellation
                // state.  For example, Reconcilers in flight will have to complete and be spawned
                // again to realize that the node has become available.
                self.cancel = CancellationToken::new();
                AvailabilityTransition::ToActive
            }
-            AvailabilityTransition::ToOffline => {
+            (Active, Offline) => {
                // Fire the node's cancellation token to cancel any in-flight API requests to it
                self.cancel.cancel();
                AvailabilityTransition::ToOffline
            }
-            AvailabilityTransition::Unchanged => {}
+            _ => AvailabilityTransition::Unchanged,
-        }
+        };
        self.availability = availability;
-    }
+        transition
    /// Without modifying the availability of the node, convert the intended availability
    /// into a description of the transition.
    pub(crate) fn get_availability_transition(
        &self,
        availability: NodeAvailability,
    ) -> AvailabilityTransition {
        use AvailabilityTransition::*;
        use NodeAvailability::*;
        match (self.availability, availability) {
            (Offline, Active(_)) => ToActive,
            (Active(_), Offline) => ToOffline,
            _ => Unchanged,
        }
    }
    /// Whether we may send API requests to this node.
@@ -126,21 +114,21 @@ impl Node {
        // a reference to the original Node's cancellation status.  Checking both of these results
        // in a "pessimistic" check where we will consider a Node instance unavailable if it was unavailable
        // when we cloned it, or if the original Node instance's cancellation token was fired.
-        matches!(self.availability, NodeAvailability::Active(_)) && !self.cancel.is_cancelled()
+        matches!(self.availability, NodeAvailability::Active) && !self.cancel.is_cancelled()
    }
    /// Is this node eligible to have work scheduled onto it?
-    pub(crate) fn may_schedule(&self) -> MaySchedule {
+    pub(crate) fn may_schedule(&self) -> bool {
-        let score = match self.availability {
+        match self.availability {
-            NodeAvailability::Active(score) => score,
+            NodeAvailability::Active => {}
-            NodeAvailability::Offline => return MaySchedule::No,
+            NodeAvailability::Offline => return false,
-        };
+        }
        match self.scheduling {
-            NodeSchedulingPolicy::Active => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Active => true,
-            NodeSchedulingPolicy::Draining => MaySchedule::No,
+            NodeSchedulingPolicy::Draining => false,
-            NodeSchedulingPolicy::Filling => MaySchedule::Yes(score),
+            NodeSchedulingPolicy::Filling => true,
-            NodeSchedulingPolicy::Pause => MaySchedule::No,
+            NodeSchedulingPolicy::Pause => false,
        }
    }
@@ -158,7 +146,8 @@ impl Node {
            listen_pg_addr,
            listen_pg_port,
            scheduling: NodeSchedulingPolicy::Filling,
-            availability: NodeAvailability::Offline,
+            // TODO: we shouldn't really call this Active until we've heartbeated it.
            availability: NodeAvailability::Active,
            cancel: CancellationToken::new(),
        }
    }
@@ -205,7 +194,7 @@ impl Node {
        cancel: &CancellationToken,
    ) -> Option<mgmt_api::Result<T>>
    where
-        O: FnMut(PageserverClient) -> F,
+        O: FnMut(mgmt_api::Client) -> F,
        F: std::future::Future<Output = mgmt_api::Result<T>>,
    {
        fn is_fatal(e: &mgmt_api::Error) -> bool {
@@ -227,12 +216,8 @@ impl Node {
                    .build()
                    .expect("Failed to construct HTTP client");
-                let client = PageserverClient::from_client(
+                let client =
-                    self.get_id(),
+                    mgmt_api::Client::from_client(http_client, self.base_url(), jwt.as_deref());
                    http_client,
                    self.base_url(),
                    jwt.as_deref(),
                );
                let node_cancel_fut = self.cancel.cancelled();
@@ -257,19 +242,6 @@ impl Node {
        )
        .await
    }
    /// Generate the simplified API-friendly description of a node's state
    pub(crate) fn describe(&self) -> NodeDescribeResponse {
        NodeDescribeResponse {
            id: self.id,
            availability: self.availability.into(),
            scheduling: self.scheduling,
            listen_http_addr: self.listen_http_addr.clone(),
            listen_http_port: self.listen_http_port,
            listen_pg_addr: self.listen_pg_addr.clone(),
            listen_pg_port: self.listen_pg_port,
        }
    }
 }
 impl std::fmt::Display for Node {
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -9,20 +9,13 @@ use camino::Utf8PathBuf;
 use diesel::pg::PgConnection;
 use diesel::prelude::*;
 use diesel::Connection;
 use pageserver_api::controller_api::ShardSchedulingPolicy;
 use pageserver_api::controller_api::{NodeSchedulingPolicy, PlacementPolicy};
 use pageserver_api::models::TenantConfig;
 use pageserver_api::shard::ShardConfigError;
 use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::ShardStripeSize;
 use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use serde::{Deserialize, Serialize};
 use utils::generation::Generation;
 use utils::id::{NodeId, TenantId};
 use crate::metrics::{
    DatabaseQueryErrorLabelGroup, DatabaseQueryLatencyLabelGroup, METRICS_REGISTRY,
 };
 use crate::node::Node;
 /// ## What do we store?
@@ -79,41 +72,8 @@ pub(crate) enum DatabaseError {
    Logical(String),
 }
 #[derive(measured::FixedCardinalityLabel, Copy, Clone)]
 pub(crate) enum DatabaseOperation {
    InsertNode,
    UpdateNode,
    DeleteNode,
    ListNodes,
    BeginShardSplit,
    CompleteShardSplit,
    AbortShardSplit,
    Detach,
    ReAttach,
    IncrementGeneration,
    ListTenantShards,
    InsertTenantShards,
    UpdateTenantShard,
    DeleteTenant,
    UpdateTenantConfig,
 }
 #[must_use]
 pub(crate) enum AbortShardSplitStatus {
    /// We aborted the split in the database by reverting to the parent shards
    Aborted,
    /// The split had already been persisted.
    Complete,
 }
 pub(crate) type DatabaseResult<T> = Result<T, DatabaseError>;
 /// Some methods can operate on either a whole tenant or a single shard
 pub(crate) enum TenantFilter {
    Tenant(TenantId),
    Shard(TenantShardId),
 }
 impl Persistence {
    // The default postgres connection limit is 100.  We use up to 99, to leave one free for a human admin under
    // normal circumstances.  This assumes we have exclusive use of the database cluster to which we connect.
@@ -144,36 +104,10 @@ impl Persistence {
        }
    }
    /// Wraps `with_conn` in order to collect latency and error metrics
    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
    where
        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        let latency = &METRICS_REGISTRY
            .metrics_group
            .storage_controller_database_query_latency;
        let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op });
        let res = self.with_conn(func).await;
        if let Err(err) = &res {
            let error_counter = &METRICS_REGISTRY
                .metrics_group
                .storage_controller_database_query_error;
            error_counter.inc(DatabaseQueryErrorLabelGroup {
                error_type: err.error_label(),
                operation: op,
            })
        }
        res
    }
    /// Call the provided function in a tokio blocking thread, with a Diesel database connection.
    async fn with_conn<F, R>(&self, func: F) -> DatabaseResult<R>
    where
-        F: FnOnce(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
+        F: Fn(&mut PgConnection) -> DatabaseResult<R> + Send + 'static,
        R: Send + 'static,
    {
        let mut conn = self.connection_pool.get()?;
@@ -185,27 +119,21 @@ impl Persistence {
    /// When a node is first registered, persist it before using it for anything
    pub(crate) async fn insert_node(&self, node: &Node) -> DatabaseResult<()> {
        let np = node.to_persistent();
-        self.with_measured_conn(
+        self.with_conn(move |conn| -> DatabaseResult<()> {
-            DatabaseOperation::InsertNode,
+            diesel::insert_into(crate::schema::nodes::table)
-            move |conn| -> DatabaseResult<()> {
+                .values(&np)
-                diesel::insert_into(crate::schema::nodes::table)
+                .execute(conn)?;
-                    .values(&np)
+            Ok(())
-                    .execute(conn)?;
+        })
                Ok(())
            },
        )
        .await
    }
    /// At startup, populate the list of nodes which our shards may be placed on
    pub(crate) async fn list_nodes(&self) -> DatabaseResult<Vec<NodePersistence>> {
        let nodes: Vec<NodePersistence> = self
-            .with_measured_conn(
+            .with_conn(move |conn| -> DatabaseResult<_> {
-                DatabaseOperation::ListNodes,
+                Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
-                move |conn| -> DatabaseResult<_> {
+            })
                    Ok(crate::schema::nodes::table.load::<NodePersistence>(conn)?)
                },
            )
            .await?;
        tracing::info!("list_nodes: loaded {} nodes", nodes.len());
@@ -220,7 +148,7 @@ impl Persistence {
    ) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
        let updated = self
-            .with_measured_conn(DatabaseOperation::UpdateNode, move |conn| {
+            .with_conn(move |conn| {
                let updated = diesel::update(nodes)
                    .filter(node_id.eq(input_node_id.0 as i64))
                    .set((scheduling_policy.eq(String::from(input_scheduling)),))
@@ -242,12 +170,9 @@ impl Persistence {
    /// be enriched at runtime with state discovered on pageservers.
    pub(crate) async fn list_tenant_shards(&self) -> DatabaseResult<Vec<TenantShardPersistence>> {
        let loaded = self
-            .with_measured_conn(
+            .with_conn(move |conn| -> DatabaseResult<_> {
-                DatabaseOperation::ListTenantShards,
+                Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
-                move |conn| -> DatabaseResult<_> {
+            })
                    Ok(crate::schema::tenant_shards::table.load::<TenantShardPersistence>(conn)?)
                },
            )
            .await?;
        if loaded.is_empty() {
@@ -275,15 +200,15 @@ impl Persistence {
        let mut decoded = serde_json::from_slice::<JsonPersistence>(&bytes)
            .map_err(|e| DatabaseError::Logical(format!("Deserialization error: {e}")))?;
-        for shard in decoded.tenants.values_mut() {
+        for (tenant_id, tenant) in &mut decoded.tenants {
-            if shard.placement_policy == "\"Single\"" {
+            // Backward compat: an old attachments.json from before PR #6251, replace
-                // Backward compat for test data after PR https://github.com/neondatabase/neon/pull/7165
+            // empty strings with proper defaults.
-                shard.placement_policy = "{\"Attached\":0}".to_string();
+            if tenant.tenant_id.is_empty() {
-            }
+                tenant.tenant_id = tenant_id.to_string();
-
+                tenant.config = serde_json::to_string(&TenantConfig::default())
-            if shard.scheduling_policy.is_empty() {
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
-                shard.scheduling_policy =
+                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::Single)
-                    serde_json::to_string(&ShardSchedulingPolicy::default()).unwrap();
+                    .map_err(|e| DatabaseError::Logical(format!("Serialization error: {e}")))?;
            }
        }
@@ -329,20 +254,17 @@ impl Persistence {
        shards: Vec<TenantShardPersistence>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
+        self.with_conn(move |conn| -> DatabaseResult<()> {
-            DatabaseOperation::InsertTenantShards,
+            conn.transaction(|conn| -> QueryResult<()> {
-            move |conn| -> DatabaseResult<()> {
+                for tenant in &shards {
-                conn.transaction(|conn| -> QueryResult<()> {
+                    diesel::insert_into(tenant_shards)
-                    for tenant in &shards {
+                        .values(tenant)
-                        diesel::insert_into(tenant_shards)
+                        .execute(conn)?;
-                            .values(tenant)
+                }
                            .execute(conn)?;
                    }
                    Ok(())
                })?;
                Ok(())
-            },
+            })?;
-        )
+            Ok(())
        })
        .await
    }
@@ -350,31 +272,25 @@ impl Persistence {
    /// the tenant from memory on this server.
    pub(crate) async fn delete_tenant(&self, del_tenant_id: TenantId) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
+        self.with_conn(move |conn| -> DatabaseResult<()> {
-            DatabaseOperation::DeleteTenant,
+            diesel::delete(tenant_shards)
-            move |conn| -> DatabaseResult<()> {
+                .filter(tenant_id.eq(del_tenant_id.to_string()))
-                diesel::delete(tenant_shards)
+                .execute(conn)?;
                    .filter(tenant_id.eq(del_tenant_id.to_string()))
                    .execute(conn)?;
-                Ok(())
+            Ok(())
-            },
+        })
        )
        .await
    }
    pub(crate) async fn delete_node(&self, del_node_id: NodeId) -> DatabaseResult<()> {
        use crate::schema::nodes::dsl::*;
-        self.with_measured_conn(
+        self.with_conn(move |conn| -> DatabaseResult<()> {
-            DatabaseOperation::DeleteNode,
+            diesel::delete(nodes)
-            move |conn| -> DatabaseResult<()> {
+                .filter(node_id.eq(del_node_id.0 as i64))
-                diesel::delete(nodes)
+                .execute(conn)?;
                    .filter(node_id.eq(del_node_id.0 as i64))
                    .execute(conn)?;
-                Ok(())
+            Ok(())
-            },
+        })
        )
        .await
    }
@@ -388,7 +304,7 @@ impl Persistence {
    ) -> DatabaseResult<HashMap<TenantShardId, Generation>> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_measured_conn(DatabaseOperation::ReAttach, move |conn| {
+            .with_conn(move |conn| {
                let rows_updated = diesel::update(tenant_shards)
                    .filter(generation_pageserver.eq(node_id.0 as i64))
                    .set(generation.eq(generation + 1))
@@ -438,7 +354,7 @@ impl Persistence {
    ) -> anyhow::Result<Generation> {
        use crate::schema::tenant_shards::dsl::*;
        let updated = self
-            .with_measured_conn(DatabaseOperation::IncrementGeneration, move |conn| {
+            .with_conn(move |conn| {
                let updated = diesel::update(tenant_shards)
                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -475,45 +391,59 @@ impl Persistence {
    /// that we only do the first time a tenant is set to an attached policy via /location_config.
    pub(crate) async fn update_tenant_shard(
        &self,
-        tenant: TenantFilter,
+        tenant_shard_id: TenantShardId,
-        input_placement_policy: Option<PlacementPolicy>,
+        input_placement_policy: PlacementPolicy,
-        input_config: Option<TenantConfig>,
+        input_config: TenantConfig,
        input_generation: Option<Generation>,
        input_scheduling_policy: Option<ShardSchedulingPolicy>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(DatabaseOperation::UpdateTenantShard, move |conn| {
+        self.with_conn(move |conn| {
-            let query = match tenant {
+            let query = diesel::update(tenant_shards)
-                TenantFilter::Shard(tenant_shard_id) => diesel::update(tenant_shards)
+                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
-                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
+                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32));
                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
                    .into_boxed(),
                TenantFilter::Tenant(input_tenant_id) => diesel::update(tenant_shards)
                    .filter(tenant_id.eq(input_tenant_id.to_string()))
                    .into_boxed(),
            };
-            #[derive(AsChangeset)]
+            if let Some(input_generation) = input_generation {
-            #[diesel(table_name = crate::schema::tenant_shards)]
+                // Update includes generation column
-            struct ShardUpdate {
+                query
-                generation: Option<i32>,
+                    .set((
-                placement_policy: Option<String>,
+                        generation.eq(Some(input_generation.into().unwrap() as i32)),
-                config: Option<String>,
+                        placement_policy
-                scheduling_policy: Option<String>,
+                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
                        config.eq(serde_json::to_string(&input_config).unwrap()),
                    ))
                    .execute(conn)?;
            } else {
                // Update does not include generation column
                query
                    .set((
                        placement_policy
                            .eq(serde_json::to_string(&input_placement_policy).unwrap()),
                        config.eq(serde_json::to_string(&input_config).unwrap()),
                    ))
                    .execute(conn)?;
            }
-            let update = ShardUpdate {
+            Ok(())
-                generation: input_generation.map(|g| g.into().unwrap() as i32),
+        })
-                placement_policy: input_placement_policy
+        .await?;
                    .map(|p| serde_json::to_string(&p).unwrap()),
                config: input_config.map(|c| serde_json::to_string(&c).unwrap()),
                scheduling_policy: input_scheduling_policy
                    .map(|p| serde_json::to_string(&p).unwrap()),
            };
-            query.set(update).execute(conn)?;
+        Ok(())
    }
    pub(crate) async fn update_tenant_config(
        &self,
        input_tenant_id: TenantId,
        input_config: TenantConfig,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
        self.with_conn(move |conn| {
            diesel::update(tenant_shards)
                .filter(tenant_id.eq(input_tenant_id.to_string()))
                .set((config.eq(serde_json::to_string(&input_config).unwrap()),))
                .execute(conn)?;
            Ok(())
        })
@@ -524,7 +454,7 @@ impl Persistence {
    pub(crate) async fn detach(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(DatabaseOperation::Detach, move |conn| {
+        self.with_conn(move |conn| {
            let updated = diesel::update(tenant_shards)
                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
@@ -554,7 +484,7 @@ impl Persistence {
        parent_to_children: Vec<(TenantShardId, Vec<TenantShardPersistence>)>,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(DatabaseOperation::BeginShardSplit, move |conn| -> DatabaseResult<()> {
+        self.with_conn(move |conn| -> DatabaseResult<()> {
            conn.transaction(|conn| -> DatabaseResult<()> {
                // Mark parent shards as splitting
@@ -618,83 +548,31 @@ impl Persistence {
        old_shard_count: ShardCount,
    ) -> DatabaseResult<()> {
        use crate::schema::tenant_shards::dsl::*;
-        self.with_measured_conn(
+        self.with_conn(move |conn| -> DatabaseResult<()> {
-            DatabaseOperation::CompleteShardSplit,
+            conn.transaction(|conn| -> QueryResult<()> {
-            move |conn| -> DatabaseResult<()> {
+                // Drop parent shards
-                conn.transaction(|conn| -> QueryResult<()> {
+                diesel::delete(tenant_shards)
-                    // Drop parent shards
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    diesel::delete(tenant_shards)
+                    .filter(shard_count.eq(old_shard_count.literal() as i32))
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .execute(conn)?;
                        .filter(shard_count.eq(old_shard_count.literal() as i32))
                        .execute(conn)?;
-                    // Clear sharding flag
+                // Clear sharding flag
-                    let updated = diesel::update(tenant_shards)
+                let updated = diesel::update(tenant_shards)
-                        .filter(tenant_id.eq(split_tenant_id.to_string()))
+                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                        .set((splitting.eq(0),))
+                    .set((splitting.eq(0),))
-                        .execute(conn)?;
+                    .execute(conn)?;
-                    debug_assert!(updated > 0);
+                debug_assert!(updated > 0);
                    Ok(())
                })?;
                Ok(())
-            },
+            })?;
        )
        .await
    }
-    /// Used when the remote part of a shard split failed: we will revert the database state to have only
+            Ok(())
-    /// the parent shards, with SplitState::Idle.
+        })
    pub(crate) async fn abort_shard_split(
        &self,
        split_tenant_id: TenantId,
        new_shard_count: ShardCount,
    ) -> DatabaseResult<AbortShardSplitStatus> {
        use crate::schema::tenant_shards::dsl::*;
        self.with_measured_conn(
            DatabaseOperation::AbortShardSplit,
            move |conn| -> DatabaseResult<AbortShardSplitStatus> {
                let aborted =
                    conn.transaction(|conn| -> DatabaseResult<AbortShardSplitStatus> {
                        // Clear the splitting state on parent shards
                        let updated = diesel::update(tenant_shards)
                            .filter(tenant_id.eq(split_tenant_id.to_string()))
                            .filter(shard_count.ne(new_shard_count.literal() as i32))
                            .set((splitting.eq(0),))
                            .execute(conn)?;
                        // Parent shards are already gone: we cannot abort.
                        if updated == 0 {
                            return Ok(AbortShardSplitStatus::Complete);
                        }
                        // Sanity check: if parent shards were present, their cardinality should
                        // be less than the number of child shards.
                        if updated >= new_shard_count.count() as usize {
                            return Err(DatabaseError::Logical(format!(
                                "Unexpected parent shard count {updated} while aborting split to \
                            count {new_shard_count:?} on tenant {split_tenant_id}"
                            )));
                        }
                        // Erase child shards
                        diesel::delete(tenant_shards)
                            .filter(tenant_id.eq(split_tenant_id.to_string()))
                            .filter(shard_count.eq(new_shard_count.literal() as i32))
                            .execute(conn)?;
                        Ok(AbortShardSplitStatus::Aborted)
                    })?;
                Ok(aborted)
            },
        )
        .await
    }
 }
-/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
+/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
 #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
 #[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
@@ -724,30 +602,6 @@ pub(crate) struct TenantShardPersistence {
    pub(crate) splitting: SplitState,
    #[serde(default)]
    pub(crate) config: String,
    #[serde(default)]
    pub(crate) scheduling_policy: String,
 }
 impl TenantShardPersistence {
    pub(crate) fn get_shard_identity(&self) -> Result<ShardIdentity, ShardConfigError> {
        if self.shard_count == 0 {
            Ok(ShardIdentity::unsharded())
        } else {
            Ok(ShardIdentity::new(
                ShardNumber(self.shard_number as u8),
                ShardCount::new(self.shard_count as u8),
                ShardStripeSize(self.shard_stripe_size as u32),
            )?)
        }
    }
    pub(crate) fn get_tenant_shard_id(&self) -> Result<TenantShardId, hex::FromHexError> {
        Ok(TenantShardId {
            tenant_id: TenantId::from_str(self.tenant_id.as_str())?,
            shard_number: ShardNumber(self.shard_number as u8),
            shard_count: ShardCount::new(self.shard_count as u8),
        })
    }
 }
 /// Parts of [`crate::node::Node`] that are stored durably
--- a/control_plane/attachment_service/src/persistence/split_state.rs
+++ b/control_plane/attachment_service/src/persistence/split_state.rs
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,7 +1,5 @@
 use crate::pageserver_client::PageserverClient;
 use crate::persistence::Persistence;
 use crate::service;
 use hyper::StatusCode;
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
@@ -9,7 +7,7 @@ use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use pageserver_client::mgmt_api;
 use std::collections::HashMap;
 use std::sync::Arc;
-use std::time::{Duration, Instant};
+use std::time::Duration;
 use tokio_util::sync::CancellationToken;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
@@ -18,14 +16,12 @@ use utils::sync::gate::GateGuard;
 use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
-use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
+use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
 const DEFAULT_HEATMAP_PERIOD: &str = "60s";
 /// Object with the lifetime of the background reconcile task that is created
 /// for tenants which have a difference between their intent and observed states.
 pub(super) struct Reconciler {
-    /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
+    /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
    /// of a tenant's state from when we spawned a reconcile task.
    pub(super) tenant_shard_id: TenantShardId,
    pub(crate) shard: ShardIdentity,
@@ -48,11 +44,11 @@ pub(super) struct Reconciler {
    /// To avoid stalling if the cloud control plane is unavailable, we may proceed
    /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
-    /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
+    /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
    pub(crate) compute_notify_failure: bool,
    /// A means to abort background reconciliation: it is essential to
-    /// call this when something changes in the original TenantShard that
+    /// call this when something changes in the original TenantState that
    /// will make this reconciliation impossible or unnecessary, for
    /// example when a pageserver node goes offline, or the PlacementPolicy for
    /// the tenant is changed.
@@ -66,7 +62,7 @@ pub(super) struct Reconciler {
    pub(crate) persistence: Arc<Persistence>,
 }
-/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
+/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
 /// reference counting for Scheduler.  The IntentState is what the scheduler works with,
 /// and the TargetState is just the instruction for a particular Reconciler run.
 #[derive(Debug)]
@@ -118,15 +114,6 @@ impl Reconciler {
        flush_ms: Option<Duration>,
        lazy: bool,
    ) -> Result<(), ReconcileError> {
        if !node.is_available() && config.mode == LocationConfigMode::Detached {
            // Attempts to detach from offline nodes may be imitated without doing I/O: a node which is offline
            // will get fully reconciled wrt the shard's intent state when it is reactivated, irrespective of
            // what we put into `observed`, in [`crate::service::Service::node_activate_reconcile`]
            tracing::info!("Node {node} is unavailable during detach: proceeding anyway, it will be detached on next activation");
            self.observed.locations.remove(&node.get_id());
            return Ok(());
        }
        self.observed
            .locations
            .insert(node.get_id(), ObservedStateLocation { conf: None });
@@ -159,16 +146,9 @@ impl Reconciler {
        };
        tracing::info!("location_config({node}) complete: {:?}", config);
-        match config.mode {
+        self.observed
-            LocationConfigMode::Detached => {
+            .locations
-                self.observed.locations.remove(&node.get_id());
+            .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
            }
            _ => {
                self.observed
                    .locations
                    .insert(node.get_id(), ObservedStateLocation { conf: Some(config) });
            }
        }
        Ok(())
    }
@@ -260,11 +240,8 @@ impl Reconciler {
        tenant_shard_id: TenantShardId,
        node: &Node,
    ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-        let client = PageserverClient::new(
+        let client =
-            node.get_id(),
+            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
            node.base_url(),
            self.service_config.jwt_token.as_deref(),
        );
        let timelines = client.timeline_list(&tenant_shard_id).await?;
        Ok(timelines
@@ -278,81 +255,22 @@ impl Reconciler {
        tenant_shard_id: TenantShardId,
        node: &Node,
    ) -> Result<(), ReconcileError> {
-        // This is not the timeout for a request, but the total amount of time we're willing to wait
+        match node
-        // for a secondary location to get up to date before
+            .with_client_retries(
-        const TOTAL_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(300);
+                |client| async move { client.tenant_secondary_download(tenant_shard_id).await },
-
+                &self.service_config.jwt_token,
-        // This the long-polling interval for the secondary download requests we send to destination pageserver
+                1,
-        // during a migration.
+                1,
-        const REQUEST_DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(20);
+                Duration::from_secs(60),
-
+                &self.cancel,
-        let started_at = Instant::now();
+            )
-
+            .await
-        loop {
+        {
-            let (status, progress) = match node
+            None => Err(ReconcileError::Cancel),
-                .with_client_retries(
+            Some(Ok(_)) => Ok(()),
-                    |client| async move {
+            Some(Err(e)) => {
-                        client
+                tracing::info!("  (skipping destination download: {})", e);
-                            .tenant_secondary_download(
+                Ok(())
                                tenant_shard_id,
                                Some(REQUEST_DOWNLOAD_TIMEOUT),
                            )
                            .await
                    },
                    &self.service_config.jwt_token,
                    1,
                    3,
                    REQUEST_DOWNLOAD_TIMEOUT * 2,
                    &self.cancel,
                )
                .await
            {
                None => Err(ReconcileError::Cancel),
                Some(Ok(v)) => Ok(v),
                Some(Err(e)) => {
                    // Give up, but proceed: it's unfortunate if we couldn't freshen the destination before
                    // attaching, but we should not let an issue with a secondary location stop us proceeding
                    // with a live migration.
                    tracing::warn!("Failed to prepare by downloading layers on node {node}: {e})");
                    return Ok(());
                }
            }?;
            if status == StatusCode::OK {
                tracing::info!(
                    "Downloads to {} complete: {}/{} layers, {}/{} bytes",
                    node,
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
                return Ok(());
            } else if status == StatusCode::ACCEPTED {
                let total_runtime = started_at.elapsed();
                if total_runtime > TOTAL_DOWNLOAD_TIMEOUT {
                    tracing::warn!("Timed out after {}ms downloading layers to {node}.  Progress so far: {}/{} layers, {}/{} bytes",
                        total_runtime.as_millis(),
                        progress.layers_downloaded,
                        progress.layers_total,
                        progress.bytes_downloaded,
                        progress.bytes_total
                    );
                    // Give up, but proceed: an incompletely warmed destination doesn't prevent migration working,
                    // it just makes the I/O performance for users less good.
                    return Ok(());
                }
                // Log and proceed around the loop to retry.  We don't sleep between requests, because our HTTP call
                // to the pageserver is a long-poll.
                tracing::info!(
                    "Downloads to {} not yet complete: {}/{} layers, {}/{} bytes",
                    node,
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
            }
        }
    }
@@ -487,7 +405,6 @@ impl Reconciler {
        while let Err(e) = self.compute_notify().await {
            match e {
                NotifyError::Fatal(_) => return Err(ReconcileError::Notify(e)),
                NotifyError::ShuttingDown => return Err(ReconcileError::Cancel),
                _ => {
                    tracing::warn!(
                        "Live migration blocked by compute notification error, retrying: {e}"
@@ -496,7 +413,7 @@ impl Reconciler {
            }
        }
-        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Attached(0), then
+        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Single, then
        // this location will be deleted in the general case reconciliation that runs after this.
        let origin_secondary_conf = build_location_config(
            &self.shard,
@@ -568,29 +485,17 @@ impl Reconciler {
                )
                .await
            {
-                Some(Ok(observed)) => Some(observed),
+                Some(Ok(observed)) => observed,
                Some(Err(mgmt_api::Error::ApiError(status, _msg)))
                    if status == StatusCode::NOT_FOUND =>
                {
                    None
                }
                Some(Err(e)) => return Err(e.into()),
                None => return Err(ReconcileError::Cancel),
            };
            tracing::info!("Scanned location configuration on {attached_node}: {observed_conf:?}");
-            match observed_conf {
+            self.observed.locations.insert(
-                Some(conf) => {
+                attached_node.get_id(),
-                    // Pageserver returned a state: update it in observed.  This may still be an indeterminate (None) state,
+                ObservedStateLocation {
-                    // if internally the pageserver's TenantSlot was being mutated (e.g. some long running API call is still running)
+                    conf: observed_conf,
-                    self.observed
+                },
-                        .locations
+            );
                        .insert(attached_node.get_id(), ObservedStateLocation { conf });
                }
                None => {
                    // Pageserver returned 404: we have confirmation that there is no state for this shard on that pageserver.
                    self.observed.locations.remove(&attached_node.get_id());
                }
            }
        }
        Ok(())
@@ -620,12 +525,7 @@ impl Reconciler {
                )));
            };
-            let mut wanted_conf = attached_location_conf(
+            let mut wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
                generation,
                &self.shard,
                &self.config,
                !self.intent.secondary.is_empty(),
            );
            match self.observed.locations.get(&node.get_id()) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {
                    // Nothing to do
@@ -762,26 +662,10 @@ impl Reconciler {
    }
 }
 /// We tweak the externally-set TenantConfig while configuring
 /// locations, using our awareness of whether secondary locations
 /// are in use to automatically enable/disable heatmap uploads.
 fn ha_aware_config(config: &TenantConfig, has_secondaries: bool) -> TenantConfig {
    let mut config = config.clone();
    if has_secondaries {
        if config.heatmap_period.is_none() {
            config.heatmap_period = Some(DEFAULT_HEATMAP_PERIOD.to_string());
        }
    } else {
        config.heatmap_period = None;
    }
    config
 }
 pub(crate) fn attached_location_conf(
    generation: Generation,
    shard: &ShardIdentity,
    config: &TenantConfig,
    has_secondaries: bool,
 ) -> LocationConfig {
    LocationConfig {
        mode: LocationConfigMode::AttachedSingle,
@@ -790,7 +674,7 @@ pub(crate) fn attached_location_conf(
        shard_number: shard.number.0,
        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
-        tenant_conf: ha_aware_config(config, has_secondaries),
+        tenant_conf: config.clone(),
    }
 }
@@ -805,6 +689,6 @@ pub(crate) fn secondary_location_conf(
        shard_number: shard.number.0,
        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
-        tenant_conf: ha_aware_config(config, true),
+        tenant_conf: config.clone(),
    }
 }
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -1,5 +1,4 @@
-use crate::{node::Node, tenant_shard::TenantShard};
+use crate::{node::Node, tenant_state::TenantState};
 use pageserver_api::controller_api::UtilizationScore;
 use serde::Serialize;
 use std::collections::HashMap;
 use utils::{http::error::ApiError, id::NodeId};
@@ -20,34 +19,15 @@ impl From<ScheduleError> for ApiError {
 }
 #[derive(Serialize, Eq, PartialEq)]
 pub enum MaySchedule {
    Yes(UtilizationScore),
    No,
 }
 #[derive(Serialize)]
 struct SchedulerNode {
-    /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
+    /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
    shard_count: usize,
    /// Whether this node is currently elegible to have new shards scheduled (this is derived
    /// from a node's availability state and scheduling policy).
-    may_schedule: MaySchedule,
+    may_schedule: bool,
 }
 impl PartialEq for SchedulerNode {
    fn eq(&self, other: &Self) -> bool {
        let may_schedule_matches = matches!(
            (&self.may_schedule, &other.may_schedule),
            (MaySchedule::Yes(_), MaySchedule::Yes(_)) | (MaySchedule::No, MaySchedule::No)
        );
        may_schedule_matches && self.shard_count == other.shard_count
    }
 }
 impl Eq for SchedulerNode {}
 /// This type is responsible for selecting which node is used when a tenant shard needs to choose a pageserver
 /// on which to run.
 ///
@@ -58,86 +38,6 @@ pub(crate) struct Scheduler {
    nodes: HashMap<NodeId, SchedulerNode>,
 }
 /// Score for soft constraint scheduling: lower scores are preferred to higher scores.
 ///
 /// For example, we may set an affinity score based on the number of shards from the same
 /// tenant already on a node, to implicitly prefer to balance out shards.
 #[derive(Copy, Clone, Debug, Eq, PartialEq, PartialOrd, Ord)]
 pub(crate) struct AffinityScore(pub(crate) usize);
 impl AffinityScore {
    /// If we have no anti-affinity at all toward a node, this is its score.  It means
    /// the scheduler has a free choice amongst nodes with this score, and may pick a node
    /// based on other information such as total utilization.
    pub(crate) const FREE: Self = Self(0);
    pub(crate) fn inc(&mut self) {
        self.0 += 1;
    }
 }
 impl std::ops::Add for AffinityScore {
    type Output = Self;
    fn add(self, rhs: Self) -> Self::Output {
        Self(self.0 + rhs.0)
    }
 }
 /// Hint for whether this is a sincere attempt to schedule, or a speculative
 /// check for where we _would_ schedule (done during optimization)
 #[derive(Debug)]
 pub(crate) enum ScheduleMode {
    Normal,
    Speculative,
 }
 impl Default for ScheduleMode {
    fn default() -> Self {
        Self::Normal
    }
 }
 // For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
 // it for many shards in the same tenant.
 #[derive(Debug, Default)]
 pub(crate) struct ScheduleContext {
    /// Sparse map of nodes: omitting a node implicitly makes its affinity [`AffinityScore::FREE`]
    pub(crate) nodes: HashMap<NodeId, AffinityScore>,
    /// Specifically how many _attached_ locations are on each node
    pub(crate) attached_nodes: HashMap<NodeId, usize>,
    pub(crate) mode: ScheduleMode,
 }
 impl ScheduleContext {
    /// Input is a list of nodes we would like to avoid using again within this context.  The more
    /// times a node is passed into this call, the less inclined we are to use it.
    pub(crate) fn avoid(&mut self, nodes: &[NodeId]) {
        for node_id in nodes {
            let entry = self.nodes.entry(*node_id).or_insert(AffinityScore::FREE);
            entry.inc()
        }
    }
    pub(crate) fn push_attached(&mut self, node_id: NodeId) {
        let entry = self.attached_nodes.entry(node_id).or_default();
        *entry += 1;
    }
    pub(crate) fn get_node_affinity(&self, node_id: NodeId) -> AffinityScore {
        self.nodes
            .get(&node_id)
            .copied()
            .unwrap_or(AffinityScore::FREE)
    }
    pub(crate) fn get_node_attachments(&self, node_id: NodeId) -> usize {
        self.attached_nodes.get(&node_id).copied().unwrap_or(0)
    }
 }
 impl Scheduler {
    pub(crate) fn new<'a>(nodes: impl Iterator<Item = &'a Node>) -> Self {
        let mut scheduler_nodes = HashMap::new();
@@ -163,7 +63,7 @@ impl Scheduler {
    pub(crate) fn consistency_check<'a>(
        &self,
        nodes: impl Iterator<Item = &'a Node>,
-        shards: impl Iterator<Item = &'a TenantShard>,
+        shards: impl Iterator<Item = &'a TenantState>,
    ) -> anyhow::Result<()> {
        let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
        for node in nodes {
@@ -286,15 +186,13 @@ impl Scheduler {
            return None;
        }
        // TODO: When the utilization score returned by the pageserver becomes meaningful,
        // schedule based on that instead of the shard count.
        let node = nodes
            .iter()
            .map(|node_id| {
                let may_schedule = self
                    .nodes
                    .get(node_id)
-                    .map(|n| n.may_schedule != MaySchedule::No)
+                    .map(|n| n.may_schedule)
                    .unwrap_or(false);
                (*node_id, may_schedule)
            })
@@ -304,94 +202,59 @@ impl Scheduler {
        node.and_then(|(node_id, may_schedule)| if may_schedule { Some(node_id) } else { None })
    }
-    /// hard_exclude: it is forbidden to use nodes in this list, typically becacuse they
+    pub(crate) fn schedule_shard(&self, hard_exclude: &[NodeId]) -> Result<NodeId, ScheduleError> {
    /// are already in use by this shard -- we use this to avoid picking the same node
    /// as both attached and secondary location.  This is a hard constraint: if we cannot
    /// find any nodes that aren't in this list, then we will return a [`ScheduleError::ImpossibleConstraint`].
    ///
    /// context: we prefer to avoid using nodes identified in the context, according
    /// to their anti-affinity score.  We use this to prefeer to avoid placing shards in
    /// the same tenant on the same node.  This is a soft constraint: the context will never
    /// cause us to fail to schedule a shard.
    pub(crate) fn schedule_shard(
        &self,
        hard_exclude: &[NodeId],
        context: &ScheduleContext,
    ) -> Result<NodeId, ScheduleError> {
        if self.nodes.is_empty() {
            return Err(ScheduleError::NoPageservers);
        }
-        let mut scores: Vec<(NodeId, AffinityScore, usize)> = self
+        let mut tenant_counts: Vec<(NodeId, usize)> = self
            .nodes
            .iter()
            .filter_map(|(k, v)| {
-                if hard_exclude.contains(k) || v.may_schedule == MaySchedule::No {
+                if hard_exclude.contains(k) || !v.may_schedule {
                    None
                } else {
-                    Some((
+                    Some((*k, v.shard_count))
                        *k,
                        context.nodes.get(k).copied().unwrap_or(AffinityScore::FREE),
                        v.shard_count,
                    ))
                }
            })
            .collect();
-        // Sort by, in order of precedence:
+        // Sort by tenant count.  Nodes with the same tenant count are sorted by ID.
-        //  1st: Affinity score.  We should never pick a higher-score node if a lower-score node is available
+        tenant_counts.sort_by_key(|i| (i.1, i.0));
        //  2nd: Utilization.  Within nodes with the same affinity, use the least loaded nodes.
        //  3rd: Node ID.  This is a convenience to make selection deterministic in tests and empty systems.
        scores.sort_by_key(|i| (i.1, i.2, i.0));
-        if scores.is_empty() {
+        if tenant_counts.is_empty() {
-            // After applying constraints, no pageservers were left.
+            // After applying constraints, no pageservers were left.  We log some detail about
-            if !matches!(context.mode, ScheduleMode::Speculative) {
+            // the state of nodes to help understand why this happened.  This is not logged as an error because
-                // If this was not a speculative attempt, log details to understand why we couldn't
+            // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
-                // schedule: this may help an engineer understand if some nodes are marked offline
+            tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
-                // in a way that's preventing progress.
+            for (node_id, node) in &self.nodes {
                tracing::info!(
-                    "Scheduling failure, while excluding {hard_exclude:?}, node states:"
+                    "Node {node_id}: may_schedule={} shards={}",
                    node.may_schedule,
                    node.shard_count
                );
                for (node_id, node) in &self.nodes {
                    tracing::info!(
                        "Node {node_id}: may_schedule={} shards={}",
                        node.may_schedule != MaySchedule::No,
                        node.shard_count
                    );
                }
            }
            return Err(ScheduleError::ImpossibleConstraint);
        }
-        // Lowest score wins
+        let node_id = tenant_counts.first().unwrap().0;
-        let node_id = scores.first().unwrap().0;
+        tracing::info!(
-
+            "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
-        if !matches!(context.mode, ScheduleMode::Speculative) {
+            tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
            tracing::info!(
            "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
            scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
        );
        }
        // Note that we do not update shard count here to reflect the scheduling: that
        // is IntentState's job when the scheduled location is used.
        Ok(node_id)
    }
    /// Unit test access to internal state
    #[cfg(test)]
    pub(crate) fn get_node_shard_count(&self, node_id: NodeId) -> usize {
        self.nodes.get(&node_id).unwrap().shard_count
    }
 }
 #[cfg(test)]
 pub(crate) mod test_utils {
    use crate::node::Node;
    use pageserver_api::controller_api::{NodeAvailability, UtilizationScore};
    use std::collections::HashMap;
    use utils::id::NodeId;
    /// Test helper: synthesize the requested number of nodes, all in active state.
@@ -401,14 +264,13 @@ pub(crate) mod test_utils {
        (1..n + 1)
            .map(|i| {
                (NodeId(i), {
-                    let mut node = Node::new(
+                    let node = Node::new(
                        NodeId(i),
                        format!("httphost-{i}"),
                        80 + i as u16,
                        format!("pghost-{i}"),
                        5432 + i as u16,
                    );
                    node.set_availability(NodeAvailability::Active(UtilizationScore::worst()));
                    assert!(node.is_available());
                    node
                })
@@ -421,7 +283,7 @@ pub(crate) mod test_utils {
 mod tests {
    use super::*;
-    use crate::tenant_shard::IntentState;
+    use crate::tenant_state::IntentState;
    #[test]
    fn scheduler_basic() -> anyhow::Result<()> {
        let nodes = test_utils::make_test_nodes(2);
@@ -430,17 +292,15 @@ mod tests {
        let mut t1_intent = IntentState::new();
        let mut t2_intent = IntentState::new();
-        let context = ScheduleContext::default();
+        let scheduled = scheduler.schedule_shard(&[])?;
        let scheduled = scheduler.schedule_shard(&[], &context)?;
        t1_intent.set_attached(&mut scheduler, Some(scheduled));
-        let scheduled = scheduler.schedule_shard(&[], &context)?;
+        let scheduled = scheduler.schedule_shard(&[])?;
        t2_intent.set_attached(&mut scheduler, Some(scheduled));
        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
        assert_eq!(scheduler.nodes.get(&NodeId(2)).unwrap().shard_count, 1);
-        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers(), &context)?;
+        let scheduled = scheduler.schedule_shard(&t1_intent.all_pageservers())?;
        t1_intent.push_secondary(&mut scheduler, scheduled);
        assert_eq!(scheduler.nodes.get(&NodeId(1)).unwrap().shard_count, 1);
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
@@ -22,7 +22,6 @@ diesel::table! {
        placement_policy -> Varchar,
        splitting -> Int2,
        config -> Text,
        scheduling_policy -> Varchar,
    }
 }
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -4,12 +4,8 @@ use std::{
    time::Duration,
 };
-use crate::{
+use crate::{metrics, persistence::TenantShardPersistence};
-    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
+use pageserver_api::controller_api::PlacementPolicy;
    persistence::TenantShardPersistence,
    scheduler::{AffinityScore, MaySchedule, ScheduleContext},
 };
 use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
 use pageserver_api::{
    models::{LocationConfig, LocationConfigMode, TenantConfig},
    shard::{ShardIdentity, TenantShardId},
@@ -50,7 +46,7 @@ where
 /// This struct implement Serialize for debugging purposes, but is _not_ persisted
 /// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
 #[derive(Serialize)]
-pub(crate) struct TenantShard {
+pub(crate) struct TenantState {
    pub(crate) tenant_shard_id: TenantShardId,
    pub(crate) shard: ShardIdentity,
@@ -117,10 +113,6 @@ pub(crate) struct TenantShard {
    /// sending it.  This is the mechanism by which compute notifications are included in the scope
    /// of state that we publish externally in an eventually consistent way.
    pub(crate) pending_compute_notification: bool,
    // Support/debug tool: if something is going wrong or flapping with scheduling, this may
    // be set to a non-active state to avoid making changes while the issue is fixed.
    scheduling_policy: ShardSchedulingPolicy,
 }
 #[derive(Default, Clone, Debug, Serialize)]
@@ -251,13 +243,8 @@ impl IntentState {
 impl Drop for IntentState {
    fn drop(&mut self) {
-        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler.
+        // Must clear before dropping, to avoid leaving stale refcounts in the Scheduler
-        // We do not check this while panicking, to avoid polluting unit test failures or
+        debug_assert!(self.attached.is_none() && self.secondary.is_empty());
        // other assertions with this assertion's output.  It's still wrong to leak these,
        // but if we already have a panic then we don't need to independently flag this case.
        if !(std::thread::panicking()) {
            debug_assert!(self.attached.is_none() && self.secondary.is_empty());
        }
    }
 }
@@ -302,26 +289,6 @@ pub enum ReconcileWaitError {
    Failed(TenantShardId, String),
 }
 #[derive(Eq, PartialEq, Debug)]
 pub(crate) struct ReplaceSecondary {
    old_node_id: NodeId,
    new_node_id: NodeId,
 }
 #[derive(Eq, PartialEq, Debug)]
 pub(crate) struct MigrateAttachment {
    old_attached_node_id: NodeId,
    new_attached_node_id: NodeId,
 }
 #[derive(Eq, PartialEq, Debug)]
 pub(crate) enum ScheduleOptimization {
    // Replace one of our secondary locations with a different node
    ReplaceSecondary(ReplaceSecondary),
    // Migrate attachment to an existing secondary location
    MigrateAttachment(MigrateAttachment),
 }
 impl ReconcilerWaiter {
    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
        tokio::select! {
@@ -354,7 +321,7 @@ pub(crate) struct ReconcilerHandle {
 }
 /// When a reconcile task completes, it sends this result object
-/// to be applied to the primary TenantShard.
+/// to be applied to the primary TenantState.
 pub(crate) struct ReconcileResult {
    pub(crate) sequence: Sequence,
    /// On errors, `observed` should be treated as an incompleted description
@@ -367,7 +334,7 @@ pub(crate) struct ReconcileResult {
    pub(crate) generation: Option<Generation>,
    pub(crate) observed: ObservedState,
-    /// Set [`TenantShard::pending_compute_notification`] from this flag
+    /// Set [`TenantState::pending_compute_notification`] from this flag
    pub(crate) pending_compute_notification: bool,
 }
@@ -379,7 +346,7 @@ impl ObservedState {
    }
 }
-impl TenantShard {
+impl TenantState {
    pub(crate) fn new(
        tenant_shard_id: TenantShardId,
        shard: ShardIdentity,
@@ -400,7 +367,6 @@ impl TenantShard {
            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
            last_error: Arc::default(),
            pending_compute_notification: false,
            scheduling_policy: ShardSchedulingPolicy::default(),
        }
    }
@@ -456,7 +422,6 @@ impl TenantShard {
    fn schedule_attached(
        &mut self,
        scheduler: &mut Scheduler,
        context: &ScheduleContext,
    ) -> Result<(bool, NodeId), ScheduleError> {
        // No work to do if we already have an attached tenant
        if let Some(node_id) = self.intent.attached {
@@ -470,33 +435,14 @@ impl TenantShard {
            Ok((true, promote_secondary))
        } else {
            // Pick a fresh node: either we had no secondaries or none were schedulable
-            let node_id = scheduler.schedule_shard(&self.intent.secondary, context)?;
+            let node_id = scheduler.schedule_shard(&self.intent.secondary)?;
            tracing::debug!("Selected {} as attached", node_id);
            self.intent.set_attached(scheduler, Some(node_id));
            Ok((true, node_id))
        }
    }
-    pub(crate) fn schedule(
+    pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
        &mut self,
        scheduler: &mut Scheduler,
        context: &mut ScheduleContext,
    ) -> Result<(), ScheduleError> {
        let r = self.do_schedule(scheduler, context);
        context.avoid(&self.intent.all_pageservers());
        if let Some(attached) = self.intent.get_attached() {
            context.push_attached(*attached);
        }
        r
    }
    pub(crate) fn do_schedule(
        &mut self,
        scheduler: &mut Scheduler,
        context: &ScheduleContext,
    ) -> Result<(), ScheduleError> {
        // TODO: before scheduling new nodes, check if any existing content in
        // self.intent refers to pageservers that are offline, and pick other
        // pageservers if so.
@@ -504,16 +450,6 @@ impl TenantShard {
        // TODO: respect the splitting bit on tenants: if they are currently splitting then we may not
        // change their attach location.
        match self.scheduling_policy {
            ShardSchedulingPolicy::Active | ShardSchedulingPolicy::Essential => {}
            ShardSchedulingPolicy::Pause | ShardSchedulingPolicy::Stop => {
                // Warn to make it obvious why other things aren't happening/working, if we skip scheduling
                tracing::warn!(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(),
                    "Scheduling is disabled by policy {:?}", self.scheduling_policy);
                return Ok(());
            }
        }
        // Build the set of pageservers already in use by this tenant, to avoid scheduling
        // more work on the same pageservers we're already using.
        let mut modified = false;
@@ -521,7 +457,22 @@ impl TenantShard {
        // Add/remove nodes to fulfil policy
        use PlacementPolicy::*;
        match self.policy {
-            Attached(secondary_count) => {
+            Single => {
                // Should have exactly one attached, and zero secondaries
                if !self.intent.secondary.is_empty() {
                    self.intent.clear_secondary(scheduler);
                    modified = true;
                }
                let (modified_attached, _attached_node_id) = self.schedule_attached(scheduler)?;
                modified |= modified_attached;
                if !self.intent.secondary.is_empty() {
                    self.intent.clear_secondary(scheduler);
                    modified = true;
                }
            }
            Double(secondary_count) => {
                let retain_secondaries = if self.intent.attached.is_none()
                    && scheduler.node_preferred(&self.intent.secondary).is_some()
                {
@@ -540,13 +491,12 @@ impl TenantShard {
                }
                // Should have exactly one attached, and N secondaries
-                let (modified_attached, attached_node_id) =
+                let (modified_attached, attached_node_id) = self.schedule_attached(scheduler)?;
                    self.schedule_attached(scheduler, context)?;
                modified |= modified_attached;
                let mut used_pageservers = vec![attached_node_id];
                while self.intent.secondary.len() < secondary_count {
-                    let node_id = scheduler.schedule_shard(&used_pageservers, context)?;
+                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
                    self.intent.push_secondary(scheduler, node_id);
                    used_pageservers.push(node_id);
                    modified = true;
@@ -559,7 +509,7 @@ impl TenantShard {
                    modified = true;
                } else if self.intent.secondary.is_empty() {
                    // Populate secondary by scheduling a fresh node
-                    let node_id = scheduler.schedule_shard(&[], context)?;
+                    let node_id = scheduler.schedule_shard(&[])?;
                    self.intent.push_secondary(scheduler, node_id);
                    modified = true;
                }
@@ -586,167 +536,6 @@ impl TenantShard {
        Ok(())
    }
    /// Look for a better place to hold this shard's attachment among the nodes it already
    /// occupies: if one of its locations scores better on the soft constraints recorded in
    /// `schedule_context` than the currently attached node, propose attaching there instead.
    ///
    /// Returns `None` when the shard has no attached location, has no secondary locations,
    /// or when no location is sufficiently better than the current one.
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
    pub(crate) fn optimize_attachment(
        &self,
        nodes: &HashMap<NodeId, Node>,
        schedule_context: &ScheduleContext,
    ) -> Option<ScheduleOptimization> {
        let attached = (*self.intent.get_attached())?;
        // This function never schedules new locations, it only swaps roles between existing
        // ones, so without at least one secondary there is nothing to consider.
        if self.intent.secondary.is_empty() {
            return None;
        }
        let current_affinity_score = schedule_context.get_node_affinity(attached);
        let current_attachment_count = schedule_context.get_node_attachments(attached);
        // Score every location of this shard, leaving out nodes that are unknown to `nodes`
        // or that may not be scheduled onto.
        let all_pageservers = self.intent.all_pageservers();
        let mut scores: Vec<_> = all_pageservers
            .iter()
            .filter_map(|node_id| {
                let may_schedule = nodes
                    .get(node_id)
                    .map(|n| n.may_schedule())
                    .unwrap_or(MaySchedule::No);
                match may_schedule {
                    MaySchedule::No => None,
                    _ => Some((
                        *node_id,
                        schedule_context.get_node_affinity(*node_id),
                        schedule_context.get_node_attachments(*node_id),
                    )),
                }
            })
            .collect();
        // Ordering of candidates, best first:
        //  1. lowest total affinity score
        //  2. lowest number of attachments in this context
        //  3. lowest node ID, purely so that tests are deterministic
        scores.sort_by_key(|&(node_id, affinity, attachments)| (affinity, attachments, node_id));
        // No schedulable candidate at all: nothing to optimize.
        let (preferred_node, preferred_affinity_score, preferred_attachment_count) =
            scores.first()?;
        if attached == *preferred_node {
            tracing::debug!(
                "Node {} is already preferred (score {:?})",
                preferred_node,
                preferred_affinity_score
            );
            return None;
        }
        // Only move when the best alternative is more than 1 better than the current node.
        // Migrating from a score of 1 to a score of 0 would leave the mirror-image situation
        // behind, and the next call would simply migrate back again.
        let worth_migrating = current_affinity_score > *preferred_affinity_score + AffinityScore(1)
            || current_attachment_count > *preferred_attachment_count + 1;
        if !worth_migrating {
            return None;
        }
        tracing::info!(
            "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
            self.intent.get_secondary()
        );
        Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
            old_attached_node_id: attached,
            new_attached_node_id: *preferred_node,
        }))
    }
    /// Look for a secondary location that would be better placed on a different node.
    ///
    /// Each secondary that has an entry in `schedule_context.nodes` is compared against the
    /// node the scheduler would pick for a fresh placement; the first secondary for which that
    /// candidate is more than 1 better yields a `ReplaceSecondary` optimization. Returns `None`
    /// when there are no secondaries or no replacement is worthwhile.
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
    pub(crate) fn optimize_secondary(
        &self,
        scheduler: &Scheduler,
        schedule_context: &ScheduleContext,
    ) -> Option<ScheduleOptimization> {
        // Without any secondary location there is nothing that could be replaced.
        if self.intent.secondary.is_empty() {
            return None;
        }
        self.intent.get_secondary().iter().find_map(|secondary| {
            // A secondary with no entry in the context is not affected by any affinity
            // constraint, so it is left where it is.
            let affinity_score = schedule_context.nodes.get(secondary)?;
            // Ask the scheduler where it would place us if we were scheduling afresh. This
            // implicitly limits the choice to nodes that are available, and prefers nodes
            // with lower utilization. A scheduling error means that there is no possible
            // replacement for this secondary.
            let candidate_node = scheduler
                .schedule_shard(&self.intent.all_pageservers(), schedule_context)
                .ok()?;
            let candidate_affinity_score = schedule_context
                .nodes
                .get(&candidate_node)
                .unwrap_or(&AffinityScore::FREE);
            // The candidate must be more than 1 better than the current node, otherwise the
            // next call could move the secondary straight back (flapping).
            if *candidate_affinity_score + AffinityScore(1) < *affinity_score {
                tracing::info!(
                    "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
                    self.intent.get_secondary()
                );
                Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
                    old_node_id: *secondary,
                    new_node_id: candidate_node,
                }))
            } else {
                None
            }
        })
    }
    /// Apply a [`ScheduleOptimization`] to this shard's intent state, and count it in the
    /// `storage_controller_schedule_optimization` metric.
    ///
    /// Only the intent is updated here, through the `IntentState` helpers.
    pub(crate) fn apply_optimization(
        &mut self,
        scheduler: &mut Scheduler,
        optimization: ScheduleOptimization,
    ) {
        // Every applied optimization is counted, whichever variant it is.
        metrics::METRICS_REGISTRY
            .metrics_group
            .storage_controller_schedule_optimization
            .inc();
        match optimization {
            ScheduleOptimization::MigrateAttachment(migration) => {
                // Order matters: release the attached role on the old node before handing
                // it to the new one.
                self.intent.demote_attached(migration.old_attached_node_id);
                self.intent
                    .promote_attached(scheduler, migration.new_attached_node_id);
            }
            ScheduleOptimization::ReplaceSecondary(replacement) => {
                self.intent
                    .remove_secondary(scheduler, replacement.old_node_id);
                self.intent
                    .push_secondary(scheduler, replacement.new_node_id);
            }
        }
    }
    /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
    /// yield the node ID.  This is appropriate for emitting compute hook notifications: we are checking that
    /// the node in question is not only where we intend to attach, but that the tenant is indeed already attached there.
@@ -788,12 +577,7 @@ impl TenantShard {
                .generation
                .expect("Attempted to enter attached state without a generation");
-            let wanted_conf = attached_location_conf(
+            let wanted_conf = attached_location_conf(generation, &self.shard, &self.config);
                generation,
                &self.shard,
                &self.config,
                !self.intent.secondary.is_empty(),
            );
            match self.observed.locations.get(&node_id) {
                Some(conf) if conf.conf.as_ref() == Some(&wanted_conf) => {}
                Some(_) | None => {
@@ -891,19 +675,6 @@ impl TenantShard {
            }
        }
        // Pre-checks done: finally check whether we may actually do the work
        match self.scheduling_policy {
            ShardSchedulingPolicy::Active
            | ShardSchedulingPolicy::Essential
            | ShardSchedulingPolicy::Pause => {}
            ShardSchedulingPolicy::Stop => {
                // We only reach this point if there is work to do and we're going to skip
                // doing it: warn so that it is obvious why this tenant isn't doing what it ought to.
                tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
                return None;
            }
        }
        // Build list of nodes from which the reconciler should detach
        let mut detach = Vec::new();
        for node_id in self.observed.locations.keys() {
@@ -957,10 +728,7 @@ impl TenantShard {
        let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
                                                        tenant_id=%reconciler.tenant_shard_id.tenant_id,
                                                        shard_id=%reconciler.tenant_shard_id.shard_slug());
-        metrics::METRICS_REGISTRY
+        metrics::RECONCILER.spawned.inc();
            .metrics_group
            .storage_controller_reconcile_spawn
            .inc();
        let result_tx = result_tx.clone();
        let join_handle = tokio::task::spawn(
            async move {
@@ -978,12 +746,10 @@ impl TenantShard {
                // TODO: wrap all remote API operations in cancellation check
                // as well.
                if reconciler.cancel.is_cancelled() {
-                    metrics::METRICS_REGISTRY
+                    metrics::RECONCILER
-                        .metrics_group
+                        .complete
-                        .storage_controller_reconcile_complete
+                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL])
-                        .inc(ReconcileCompleteLabelGroup {
+                        .inc();
                            status: ReconcileOutcome::Cancel,
                        });
                    return;
                }
@@ -998,18 +764,18 @@ impl TenantShard {
                }
                // Update result counter
-                let outcome_label = match &result {
+                match &result {
-                    Ok(_) => ReconcileOutcome::Success,
+                    Ok(_) => metrics::RECONCILER
-                    Err(ReconcileError::Cancel) => ReconcileOutcome::Cancel,
+                        .complete
-                    Err(_) => ReconcileOutcome::Error,
+                        .with_label_values(&[metrics::ReconcilerMetrics::SUCCESS]),
-                };
+                    Err(ReconcileError::Cancel) => metrics::RECONCILER
-
+                        .complete
-                metrics::METRICS_REGISTRY
+                        .with_label_values(&[metrics::ReconcilerMetrics::CANCEL]),
-                    .metrics_group
+                    Err(_) => metrics::RECONCILER
-                    .storage_controller_reconcile_complete
+                        .complete
-                    .inc(ReconcileCompleteLabelGroup {
+                        .with_label_values(&[metrics::ReconcilerMetrics::ERROR]),
-                        status: outcome_label,
+                }
-                    });
+                .inc();
                result_tx
                    .send(ReconcileResult {
@@ -1040,22 +806,6 @@ impl TenantShard {
        })
    }
    /// Return a waiter for the reconciliation currently in flight, if there is one.
    ///
    /// This never starts a reconciliation: when none is running the result is `None`.
    pub(crate) fn get_waiter(&self) -> Option<ReconcilerWaiter> {
        // The reconciler handle itself is not needed, only the fact that one exists.
        self.reconciler.as_ref().map(|_in_flight| ReconcilerWaiter {
            tenant_shard_id: self.tenant_shard_id,
            seq_wait: self.waiter.clone(),
            error_seq_wait: self.error_waiter.clone(),
            error: self.last_error.clone(),
            seq: self.sequence,
        })
    }
    /// Called when a ReconcileResult has been emitted and the service is updating
    /// our state: if the result is from a sequence >= my ReconcileHandle, then drop
    /// the handle to indicate there is no longer a reconciliation in progress.
@@ -1081,40 +831,6 @@ impl TenantShard {
        debug_assert!(!self.intent.all_pageservers().contains(&node_id));
    }
    /// Replace this shard's scheduling policy with `p`.
    pub(crate) fn set_scheduling_policy(&mut self, p: ShardSchedulingPolicy) {
        self.scheduling_policy = p;
    }
    /// Borrow this shard's current scheduling policy.
    pub(crate) fn get_scheduling_policy(&self) -> &ShardSchedulingPolicy {
        &self.scheduling_policy
    }
    /// Rebuild an in-memory shard from its persisted record `tsp` and an already computed
    /// `intent`.
    ///
    /// Runtime-only state starts out fresh: the sequence and both waiters begin at
    /// `Sequence::initial()`, the observed state is empty, no reconciler is attached, there
    /// is no last error and no pending compute notification.
    ///
    /// # Errors
    ///
    /// Returns an error if the tenant shard id or shard identity cannot be derived from
    /// `tsp`, or if the JSON stored in `placement_policy`, `config` or `scheduling_policy`
    /// does not deserialize. Malformed persisted JSON is reported to the caller rather than
    /// panicking, since this function already returns `anyhow::Result`.
    pub(crate) fn from_persistent(
        tsp: TenantShardPersistence,
        intent: IntentState,
    ) -> anyhow::Result<Self> {
        let tenant_shard_id = tsp.get_tenant_shard_id()?;
        let shard_identity = tsp.get_shard_identity()?;
        Ok(Self {
            tenant_shard_id,
            shard: shard_identity,
            sequence: Sequence::initial(),
            // NOTE(review): the persisted generation is narrowed with `as u32`; presumably
            // stored values are never negative -- confirm against the schema.
            generation: tsp.generation.map(|g| Generation::new(g as u32)),
            policy: serde_json::from_str(&tsp.placement_policy)?,
            intent,
            observed: ObservedState::new(),
            config: serde_json::from_str(&tsp.config)?,
            reconciler: None,
            splitting: tsp.splitting,
            waiter: Arc::new(SeqWait::new(Sequence::initial())),
            error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
            last_error: Arc::default(),
            pending_compute_notification: false,
            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy)?,
        })
    }
    pub(crate) fn to_persistent(&self) -> TenantShardPersistence {
        TenantShardPersistence {
            tenant_id: self.tenant_shard_id.tenant_id.to_string(),
@@ -1126,7 +842,6 @@ impl TenantShard {
            placement_policy: serde_json::to_string(&self.policy).unwrap(),
            config: serde_json::to_string(&self.config).unwrap(),
            splitting: SplitState::default(),
            scheduling_policy: serde_json::to_string(&self.scheduling_policy).unwrap(),
        }
    }
 }
@@ -1143,7 +858,7 @@ pub(crate) mod tests {
    use super::*;
-    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
+    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
        let tenant_id = TenantId::generate();
        let shard_number = ShardNumber(0);
        let shard_count = ShardCount::new(1);
@@ -1153,7 +868,7 @@ pub(crate) mod tests {
            shard_number,
            shard_count,
        };
-        TenantShard::new(
+        TenantState::new(
            tenant_shard_id,
            ShardIdentity::new(
                shard_number,
@@ -1165,32 +880,6 @@ pub(crate) mod tests {
        )
    }
    /// Build all shards of one freshly generated tenant: `shard_count` shards that share a
    /// tenant ID, each with a stripe size of 32768 and a clone of `policy`.
    fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
        let tenant_id = TenantId::generate();
        let mut shards = Vec::new();
        for i in 0..shard_count.count() {
            let shard_number = ShardNumber(i);
            let tenant_shard_id = TenantShardId {
                tenant_id,
                shard_number,
                shard_count,
            };
            let identity = ShardIdentity::new(
                shard_number,
                shard_count,
                pageserver_api::shard::ShardStripeSize(32768),
            )
            .unwrap();
            shards.push(TenantShard::new(tenant_shard_id, identity, policy.clone()));
        }
        shards
    }
    /// Test the scheduling behaviors used when a tenant configured for HA is subject
    /// to nodes being marked offline.
    #[test]
@@ -1200,26 +889,25 @@ pub(crate) mod tests {
        let mut nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());
        let mut context = ScheduleContext::default();
-        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
-        tenant_shard
+        tenant_state
-            .schedule(&mut scheduler, &mut context)
+            .schedule(&mut scheduler)
            .expect("we have enough nodes, scheduling should work");
        // Expect to initially be scheduled onto different nodes
-        assert_eq!(tenant_shard.intent.secondary.len(), 1);
+        assert_eq!(tenant_state.intent.secondary.len(), 1);
-        assert!(tenant_shard.intent.attached.is_some());
+        assert!(tenant_state.intent.attached.is_some());
-        let attached_node_id = tenant_shard.intent.attached.unwrap();
+        let attached_node_id = tenant_state.intent.attached.unwrap();
-        let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
+        let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
        assert_ne!(attached_node_id, secondary_node_id);
        // Notifying the attached node is offline should demote it to a secondary
-        let changed = tenant_shard.intent.demote_attached(attached_node_id);
+        let changed = tenant_state.intent.demote_attached(attached_node_id);
        assert!(changed);
-        assert!(tenant_shard.intent.attached.is_none());
+        assert!(tenant_state.intent.attached.is_none());
-        assert_eq!(tenant_shard.intent.secondary.len(), 2);
+        assert_eq!(tenant_state.intent.secondary.len(), 2);
        // Update the scheduler state to indicate the node is offline
        nodes
@@ -1229,18 +917,18 @@ pub(crate) mod tests {
        scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());
        // Scheduling the node should promote the still-available secondary node to attached
-        tenant_shard
+        tenant_state
-            .schedule(&mut scheduler, &mut context)
+            .schedule(&mut scheduler)
            .expect("active nodes are available");
-        assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
+        assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);
        // The original attached node should have been retained as a secondary
        assert_eq!(
-            *tenant_shard.intent.secondary.iter().last().unwrap(),
+            *tenant_state.intent.secondary.iter().last().unwrap(),
            attached_node_id
        );
-        tenant_shard.intent.clear(&mut scheduler);
+        tenant_state.intent.clear(&mut scheduler);
        Ok(())
    }
@@ -1250,263 +938,48 @@ pub(crate) mod tests {
        let nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());
-        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Double(1));
-        tenant_shard.observed.locations.insert(
+        tenant_state.observed.locations.insert(
            NodeId(3),
            ObservedStateLocation {
                conf: Some(LocationConfig {
                    mode: LocationConfigMode::AttachedMulti,
                    generation: Some(2),
                    secondary_conf: None,
-                    shard_number: tenant_shard.shard.number.0,
+                    shard_number: tenant_state.shard.number.0,
-                    shard_count: tenant_shard.shard.count.literal(),
+                    shard_count: tenant_state.shard.count.literal(),
-                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
+                    shard_stripe_size: tenant_state.shard.stripe_size.0,
                    tenant_conf: TenantConfig::default(),
                }),
            },
        );
-        tenant_shard.observed.locations.insert(
+        tenant_state.observed.locations.insert(
            NodeId(2),
            ObservedStateLocation {
                conf: Some(LocationConfig {
                    mode: LocationConfigMode::AttachedStale,
                    generation: Some(1),
                    secondary_conf: None,
-                    shard_number: tenant_shard.shard.number.0,
+                    shard_number: tenant_state.shard.number.0,
-                    shard_count: tenant_shard.shard.count.literal(),
+                    shard_count: tenant_state.shard.count.literal(),
-                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
+                    shard_stripe_size: tenant_state.shard.stripe_size.0,
                    tenant_conf: TenantConfig::default(),
                }),
            },
        );
-        tenant_shard.intent_from_observed(&mut scheduler);
+        tenant_state.intent_from_observed(&mut scheduler);
        // The highest generationed attached location gets used as attached
-        assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
+        assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
        // Other locations get used as secondary
-        assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
+        assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);
-        scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
+        scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;
        tenant_shard.intent.clear(&mut scheduler);
        Ok(())
    }
    /// `schedule()` must succeed without placing anything while the shard's policy is
    /// `Pause`, and must place the shard once the policy is `Active`.
    #[test]
    fn scheduling_mode() -> anyhow::Result<()> {
        let nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());
        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
        // Paused: the call is a successful no-op and the intent stays empty
        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
        let paused_outcome =
            tenant_shard.schedule(&mut scheduler, &mut ScheduleContext::default());
        assert!(paused_outcome.is_ok());
        assert!(tenant_shard.intent.all_pageservers().is_empty());
        // Active: the same call now populates the intent
        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
        let active_outcome =
            tenant_shard.schedule(&mut scheduler, &mut ScheduleContext::default());
        assert!(active_outcome.is_ok());
        assert!(!tenant_shard.intent.all_pageservers().is_empty());
        tenant_shard.intent.clear(&mut scheduler);
        Ok(())
    }
    /// Two shards attached to the same node, with secondaries on different nodes, should each
    /// propose migrating their attachment to their own secondary, and applying those
    /// proposals should swap the attached and secondary roles.
    #[test]
    fn optimize_attachment() -> anyhow::Result<()> {
        let nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());
        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
        // Initially: both shards are attached on node 1, and they have their secondary
        // locations on different nodes (2 and 3).
        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
        shard_a.intent.push_secondary(&mut scheduler, NodeId(2));
        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(1)));
        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
        // Record both shards' locations and attachments in a shared context.
        let mut schedule_context = ScheduleContext::default();
        schedule_context.avoid(&shard_a.intent.all_pageservers());
        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
        schedule_context.avoid(&shard_b.intent.all_pageservers());
        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
        let optimization_a = shard_a.optimize_attachment(&nodes, &schedule_context);
        // Either shard should recognize that it has the option to switch to a secondary location where there
        // would be no other shards from the same tenant, and request to do so.
        assert_eq!(
            optimization_a,
            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
                old_attached_node_id: NodeId(1),
                new_attached_node_id: NodeId(2)
            }))
        );
        // Note that optimizing two shards in the same tenant with the same ScheduleContext is
        // mutually exclusive (the optimization of one invalidates the stats) -- it is the responsibility
        // of [`Service::optimize_all`] to avoid trying
        // to do optimizations for multiple shards in the same tenant at the same time.  Generating
        // both optimizations is just done for test purposes.
        let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
        assert_eq!(
            optimization_b,
            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
                old_attached_node_id: NodeId(1),
                new_attached_node_id: NodeId(3)
            }))
        );
        // Applying these optimizations should result in the end state proposed
        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(2)));
        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(1)]);
        shard_b.apply_optimization(&mut scheduler, optimization_b.unwrap());
        assert_eq!(shard_b.intent.get_attached(), &Some(NodeId(3)));
        assert_eq!(shard_b.intent.get_secondary(), &vec![NodeId(1)]);
        shard_a.intent.clear(&mut scheduler);
        shard_b.intent.clear(&mut scheduler);
        Ok(())
    }
    /// Two shards whose secondaries share one node, with a fourth node holding nothing:
    /// `optimize_secondary` should propose moving one secondary to the empty node, and
    /// applying that proposal should leave the attachment untouched.
    #[test]
    fn optimize_secondary() -> anyhow::Result<()> {
        let nodes = make_test_nodes(4);
        let mut scheduler = Scheduler::new(nodes.values());
        let mut shard_a = make_test_tenant_shard(PlacementPolicy::Attached(1));
        let mut shard_b = make_test_tenant_shard(PlacementPolicy::Attached(1));
        // Initially: the shards are attached on different nodes (1 and 2), and both have
        // their secondary location on the same node (3). Node 4 holds nothing.
        shard_a.intent.set_attached(&mut scheduler, Some(NodeId(1)));
        shard_a.intent.push_secondary(&mut scheduler, NodeId(3));
        shard_b.intent.set_attached(&mut scheduler, Some(NodeId(2)));
        shard_b.intent.push_secondary(&mut scheduler, NodeId(3));
        // Record both shards' locations and attachments in a shared context.
        let mut schedule_context = ScheduleContext::default();
        schedule_context.avoid(&shard_a.intent.all_pageservers());
        schedule_context.push_attached(shard_a.intent.get_attached().unwrap());
        schedule_context.avoid(&shard_b.intent.all_pageservers());
        schedule_context.push_attached(shard_b.intent.get_attached().unwrap());
        let optimization_a = shard_a.optimize_secondary(&scheduler, &schedule_context);
        // Since there is a node with no locations available, the node with two locations for the
        // same tenant should generate an optimization to move one away
        assert_eq!(
            optimization_a,
            Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
                old_node_id: NodeId(3),
                new_node_id: NodeId(4)
            }))
        );
        // Applying the optimization moves only the secondary; the attachment stays on node 1.
        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
        assert_eq!(shard_a.intent.get_attached(), &Some(NodeId(1)));
        assert_eq!(shard_a.intent.get_secondary(), &vec![NodeId(4)]);
        shard_a.intent.clear(&mut scheduler);
        shard_b.intent.clear(&mut scheduler);
        Ok(())
    }
    // Keep optimizing until nothing changes any more. This emulates the effect of
    // Service::optimize_all being called repeatedly in the background: each pass rebuilds
    // the schedule context from the current intents, applies at most one optimization,
    // and the loop ends on the first pass that finds none.
    fn optimize_til_idle(
        nodes: &HashMap<NodeId, Node>,
        scheduler: &mut Scheduler,
        shards: &mut [TenantShard],
    ) {
        let mut passes_with_change = 0;
        loop {
            // Rebuild the context from scratch, since the previous pass may have moved things
            let mut schedule_context = ScheduleContext::default();
            for shard in shards.iter() {
                schedule_context.avoid(&shard.intent.all_pageservers());
                if let Some(attached) = *shard.intent.get_attached() {
                    schedule_context.push_attached(attached);
                }
            }
            // Apply the first optimization any shard offers, trying attachment first
            let mut applied = false;
            for shard in shards.iter_mut() {
                let proposal = match shard.optimize_attachment(nodes, &schedule_context) {
                    Some(found) => Some(found),
                    None => shard.optimize_secondary(scheduler, &schedule_context),
                };
                if let Some(optimization) = proposal {
                    shard.apply_optimization(scheduler, optimization);
                    applied = true;
                    break;
                }
            }
            if !applied {
                break;
            }
            // Guard against an infinite loop if the optimizations never converge
            passes_with_change += 1;
            assert!(passes_with_change < 1000);
        }
    }
    /// Test the balancing behavior of shard scheduling: that it achieves a balance, and
    /// that it converges.
    #[test]
    fn optimize_add_nodes() -> anyhow::Result<()> {
        let nodes = make_test_nodes(4);
        // Only show the scheduler a couple of nodes
        let mut scheduler = Scheduler::new([].iter());
        scheduler.node_upsert(nodes.get(&NodeId(1)).unwrap());
        scheduler.node_upsert(nodes.get(&NodeId(2)).unwrap());
        let mut shards = make_test_tenant(PlacementPolicy::Attached(1), ShardCount::new(4));
        let mut schedule_context = ScheduleContext::default();
        for shard in &mut shards {
            assert!(shard
                .schedule(&mut scheduler, &mut schedule_context)
                .is_ok());
        }
        // We should see equal number of locations on the two nodes.
        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 4);
        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 4);
        // Add another two nodes: we should see the shards spread out when their optimize
        // methods are called
        scheduler.node_upsert(nodes.get(&NodeId(3)).unwrap());
        scheduler.node_upsert(nodes.get(&NodeId(4)).unwrap());
        optimize_til_idle(&nodes, &mut scheduler, &mut shards);
        assert_eq!(scheduler.get_node_shard_count(NodeId(1)), 2);
        assert_eq!(scheduler.get_node_shard_count(NodeId(2)), 2);
        assert_eq!(scheduler.get_node_shard_count(NodeId(3)), 2);
        assert_eq!(scheduler.get_node_shard_count(NodeId(4)), 2);
        for shard in shards.iter_mut() {
            shard.intent.clear(&mut scheduler);
        }
        tenant_state.intent.clear(&mut scheduler);
        Ok(())
    }
 }
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -86,10 +86,7 @@ where
        .stdout(process_log_file)
        .stderr(same_file_for_stderr)
        .args(args);
-
+    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
    let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
        fill_rust_env_vars(background_command),
    ));
    filled_cmd.envs(envs);
    let pid_file_to_check = match &initial_pid_file {
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    cmd
 }
 /// Copy every variable of the current process environment whose name starts with
 /// `NEON_PAGESERVER_` into `cmd`'s environment, and hand `cmd` back for chaining.
 fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
    let forwarded = std::env::vars().filter(|(var, _)| var.starts_with("NEON_PAGESERVER_"));
    for (var, val) in forwarded {
        cmd = cmd.env(var, val);
    }
    cmd
 }
 /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
 /// 1. Claims a pidfile with a fcntl lock on it and
 /// 2. Sets up the pidfile's file descriptor so that it (and the lock)
@@ -306,7 +294,7 @@ where
    //      is in state 'taken' but the thread that would unlock it is
    //      not there.
    //   2. A rust object that represented some external resource in the
-    //      parent now got implicitly copied by the fork, even though
+    //      parent now got implicitly copied by the fork, even though
    //      the object's type is not `Copy`. The parent program may use
    //      non-copyability as way to enforce unique ownership of an
    //      external resource in the typesystem. The fork breaks that
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -14,7 +14,9 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::controller_api::PlacementPolicy;
+use pageserver_api::controller_api::{
    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
 };
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
@@ -417,54 +419,6 @@ async fn handle_tenant(
                println!("{} {:?}", t.id, t.state);
            }
        }
        Some(("import", import_match)) => {
            let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
            let storage_controller = StorageController::from_env(env);
            let create_response = storage_controller.tenant_import(tenant_id).await?;
            let shard_zero = create_response
                .shards
                .first()
                .expect("Import response omitted shards");
            let attached_pageserver_id = shard_zero.node_id;
            let pageserver =
                PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
            println!(
                "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
            );
            let timelines = pageserver
                .http_client
                .list_timelines(shard_zero.shard_id)
                .await?;
            // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
            let main_timeline = timelines
                .iter()
                .find(|t| t.ancestor_timeline_id.is_none())
                .expect("No timelines found")
                .timeline_id;
            let mut branch_i = 0;
            for timeline in timelines.iter() {
                let branch_name = if timeline.timeline_id == main_timeline {
                    "main".to_string()
                } else {
                    branch_i += 1;
                    format!("branch_{branch_i}")
                };
                println!(
                    "Importing timeline {tenant_id}/{} as branch {branch_name}",
                    timeline.timeline_id
                );
                env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
            }
        }
        Some(("create", create_match)) => {
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
@@ -483,7 +437,7 @@ async fn handle_tenant(
            let placement_policy = match create_match.get_one::<String>("placement-policy") {
                Some(s) if !s.is_empty() => serde_json::from_str::<PlacementPolicy>(s)?,
-                _ => PlacementPolicy::Attached(0),
+                _ => PlacementPolicy::Single,
            };
            let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
@@ -569,6 +523,88 @@ async fn handle_tenant(
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
        Some(("migrate", matches)) => {
            let tenant_shard_id = get_tenant_shard_id(matches, env)?;
            let new_pageserver = get_pageserver(env, matches)?;
            let new_pageserver_id = new_pageserver.conf.id;
            let storage_controller = StorageController::from_env(env);
            storage_controller
                .tenant_migrate(tenant_shard_id, new_pageserver_id)
                .await?;
            println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
        }
        Some(("status", matches)) => {
            let tenant_id = get_tenant_id(matches, env)?;
            let mut shard_table = comfy_table::Table::new();
            shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
            let mut tenant_synthetic_size = None;
            let storage_controller = StorageController::from_env(env);
            for shard in storage_controller.tenant_locate(tenant_id).await?.shards {
                let pageserver =
                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
                let size = pageserver
                    .http_client
                    .tenant_details(shard.shard_id)
                    .await?
                    .tenant_info
                    .current_physical_size
                    .unwrap();
                shard_table.add_row([
                    format!("{}", shard.shard_id.shard_slug()),
                    format!("{}", shard.node_id.0),
                    format!("{} MiB", size / (1024 * 1024)),
                ]);
                if shard.shard_id.is_zero() {
                    tenant_synthetic_size =
                        Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
                }
            }
            let Some(synthetic_size) = tenant_synthetic_size else {
                bail!("Shard 0 not found")
            };
            let mut tenant_table = comfy_table::Table::new();
            tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
            tenant_table.add_row([
                "Synthetic size".to_string(),
                format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
            ]);
            println!("{tenant_table}");
            println!("{shard_table}");
        }
        Some(("shard-split", matches)) => {
            let tenant_id = get_tenant_id(matches, env)?;
            let shard_count: u8 = matches.get_one::<u8>("shard-count").cloned().unwrap_or(0);
            let shard_stripe_size: Option<ShardStripeSize> = matches
                .get_one::<Option<ShardStripeSize>>("shard-stripe-size")
                .cloned()
                .unwrap();
            let storage_controller = StorageController::from_env(env);
            let result = storage_controller
                .tenant_split(tenant_id, shard_count, shard_stripe_size)
                .await?;
            println!(
                "Split tenant {} into shards {}",
                tenant_id,
                result
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
@@ -1106,6 +1142,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }
        }
        Some(("set-state", subcommand_args)) => {
            let pageserver = get_pageserver(env, subcommand_args)?;
            let scheduling = subcommand_args.get_one("scheduling");
            let availability = subcommand_args.get_one("availability");
            let storage_controller = StorageController::from_env(env);
            storage_controller
                .node_configure(NodeConfigureRequest {
                    node_id: pageserver.conf.id,
                    scheduling: scheduling.cloned(),
                    availability: availability.cloned(),
                })
                .await?;
        }
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status().await {
                Ok(_) => println!("Page server is up and running"),
@@ -1279,7 +1330,7 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
    match ComputeControlPlane::load(env.clone()) {
        Ok(cplane) => {
            for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
+                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast " }, false) {
                    eprintln!("postgres stop failed: {e:#}");
                }
            }
@@ -1465,7 +1516,6 @@ fn cli() -> Command {
        .subcommand(
            Command::new("timeline")
            .about("Manage timelines")
            .arg_required_else_help(true)
            .subcommand(Command::new("list")
                .about("List all timelines, available to this pageserver")
                .arg(tenant_id_arg.clone()))
@@ -1528,8 +1578,19 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
-            .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
+            .subcommand(Command::new("migrate")
-                .about("Import a tenant that is present in remote storage, and create branches for its timelines"))
+                .about("Migrate a tenant from one pageserver to another")
                .arg(tenant_id_arg.clone())
                .arg(pageserver_id_arg.clone()))
            .subcommand(Command::new("status")
                .about("Human readable summary of the tenant's shards and attachment locations")
                .arg(tenant_id_arg.clone()))
            .subcommand(Command::new("shard-split")
                .about("Increase the number of shards in the tenant")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
                .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
                )
        )
        .subcommand(
            Command::new("pageserver")
@@ -1549,6 +1610,12 @@ fn cli() -> Command {
                    .about("Restart local pageserver")
                    .arg(pageserver_config_args.clone())
                )
                .subcommand(Command::new("set-state")
                    .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
                    .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
                    .about("Set scheduling or availability state of pageserver node")
                    .arg(pageserver_config_args.clone())
                )
        )
        .subcommand(
            Command::new("storage_controller")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -12,7 +12,7 @@
 //!
 //! The endpoint is managed by the `compute_ctl` binary. When an endpoint is
 //! started, we launch `compute_ctl` It synchronizes the safekeepers, downloads
-//! the basebackup from the pageserver to initialize the data directory, and
+//! the basebackup from the pageserver to initialize the data directory, and
 //! finally launches the PostgreSQL process. It watches the PostgreSQL process
 //! until it exits.
 //!
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -114,7 +114,7 @@ impl NeonBroker {
 }
 #[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
-#[serde(default, deny_unknown_fields)]
+#[serde(default)]
 pub struct PageServerConf {
    // node id
    pub id: NodeId,
@@ -126,9 +126,6 @@ pub struct PageServerConf {
    // auth type used for the PG and HTTP ports
    pub pg_auth_type: AuthType,
    pub http_auth_type: AuthType,
    pub(crate) virtual_file_io_engine: Option<String>,
    pub(crate) get_vectored_impl: Option<String>,
 }
 impl Default for PageServerConf {
@@ -139,8 +136,6 @@ impl Default for PageServerConf {
            listen_http_addr: String::new(),
            pg_auth_type: AuthType::Trust,
            http_auth_type: AuthType::Trust,
            virtual_file_io_engine: None,
            get_vectored_impl: None,
        }
    }
 }
@@ -156,7 +151,6 @@ pub struct SafekeeperConf {
    pub remote_storage: Option<String>,
    pub backup_threads: Option<u32>,
    pub auth_enabled: bool,
    pub listen_addr: Option<String>,
 }
 impl Default for SafekeeperConf {
@@ -170,7 +164,6 @@ impl Default for SafekeeperConf {
            remote_storage: None,
            backup_threads: None,
            auth_enabled: false,
            listen_addr: None,
        }
    }
 }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -78,39 +78,18 @@ impl PageServerNode {
    ///
    /// These all end up on the command line of the `pageserver` binary.
    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
        let id = format!("id={}", self.conf.id);
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotes etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
            self.env.pg_distrib_dir_raw().display()
        );
-        let PageServerConf {
+        let http_auth_type_param = format!("http_auth_type='{}'", self.conf.http_auth_type);
-            id,
+        let listen_http_addr_param = format!("listen_http_addr='{}'", self.conf.listen_http_addr);
            listen_pg_addr,
            listen_http_addr,
            pg_auth_type,
            http_auth_type,
            virtual_file_io_engine,
            get_vectored_impl,
        } = &self.conf;
-        let id = format!("id={}", id);
+        let pg_auth_type_param = format!("pg_auth_type='{}'", self.conf.pg_auth_type);
-
+        let listen_pg_addr_param = format!("listen_pg_addr='{}'", self.conf.listen_pg_addr);
        let http_auth_type_param = format!("http_auth_type='{}'", http_auth_type);
        let listen_http_addr_param = format!("listen_http_addr='{}'", listen_http_addr);
        let pg_auth_type_param = format!("pg_auth_type='{}'", pg_auth_type);
        let listen_pg_addr_param = format!("listen_pg_addr='{}'", listen_pg_addr);
        let virtual_file_io_engine = if let Some(virtual_file_io_engine) = virtual_file_io_engine {
            format!("virtual_file_io_engine='{virtual_file_io_engine}'")
        } else {
            String::new()
        };
        let get_vectored_impl = if let Some(get_vectored_impl) = get_vectored_impl {
            format!("get_vectored_impl='{get_vectored_impl}'")
        } else {
            String::new()
        };
        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());
@@ -122,8 +101,6 @@ impl PageServerNode {
            listen_http_addr_param,
            listen_pg_addr_param,
            broker_endpoint_param,
            virtual_file_io_engine,
            get_vectored_impl,
        ];
        if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -134,7 +111,7 @@ impl PageServerNode {
            // Storage controller uses the same auth as pageserver: if JWT is enabled
            // for us, we will also need it to talk to them.
-            if matches!(http_auth_type, AuthType::NeonJWT) {
+            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
                let jwt_token = self
                    .env
                    .generate_auth_token(&Claims::new(None, Scope::GenerationsApi))
@@ -152,7 +129,8 @@ impl PageServerNode {
            ));
        }
-        if *http_auth_type != AuthType::Trust || *pg_auth_type != AuthType::Trust {
+        if self.conf.http_auth_type != AuthType::Trust || self.conf.pg_auth_type != AuthType::Trust
        {
            // Keys are generated in the toplevel repo dir, pageservers' workdirs
            // are one level below that, so refer to keys with ../
            overrides.push("auth_validation_public_key_path='../auth_public_key.pem'".to_owned());
@@ -389,10 +367,6 @@ impl PageServerNode {
                .remove("image_creation_threshold")
                .map(|x| x.parse::<usize>())
                .transpose()?,
            image_layer_creation_check_threshold: settings
                .remove("image_layer_creation_check_threshold")
                .map(|x| x.parse::<u8>())
                .transpose()?,
            pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
            walreceiver_connect_timeout: settings
                .remove("walreceiver_connect_timeout")
@@ -410,6 +384,11 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'trace_read_requests' as bool")?,
            image_layer_compression: settings
                .remove("image_layer_compression")
                .map(serde_json::from_str)
                .transpose()
                .context("Failed to parse 'image_layer_compression' json")?,
            eviction_policy: settings
                .remove("eviction_policy")
                .map(serde_json::from_str)
@@ -505,12 +484,6 @@ impl PageServerNode {
                    .map(|x| x.parse::<usize>())
                    .transpose()
                    .context("Failed to parse 'image_creation_threshold' as non zero integer")?,
                image_layer_creation_check_threshold: settings
                    .remove("image_layer_creation_check_threshold")
                    .map(|x| x.parse::<u8>())
                    .transpose()
                    .context("Failed to parse 'image_creation_check_threshold' as integer")?,
                pitr_interval: settings.remove("pitr_interval").map(|x| x.to_string()),
                walreceiver_connect_timeout: settings
                    .remove("walreceiver_connect_timeout")
@@ -528,6 +501,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<bool>())
                    .transpose()
                    .context("Failed to parse 'trace_read_requests' as bool")?,
                image_layer_compression: settings
                    .remove("image_layer_compression")
                    .map(serde_json::from_str)
                    .transpose()
                    .context("Failed to parse 'image_layer_compression' json")?,
                eviction_policy: settings
                    .remove("eviction_policy")
                    .map(serde_json::from_str)
@@ -586,6 +564,13 @@ impl PageServerNode {
        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
    }
    /// Forwards to the pageserver HTTP client's `tenant_secondary_download`
    /// for the given tenant shard, converting any client error into
    /// `anyhow::Error`.
    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
        let shard_id = *tenant_id;
        self.http_client.tenant_secondary_download(shard_id).await?;
        Ok(())
    }
    pub async fn timeline_create(
        &self,
        tenant_shard_id: TenantShardId,
--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
    pub pg_connection_config: PgConnectionConfig,
    pub env: LocalEnv,
    pub http_client: reqwest::Client,
    pub listen_addr: String,
    pub http_base_url: String,
 }
 impl SafekeeperNode {
    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
        let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
            listen_addr.clone()
        } else {
            "127.0.0.1".to_string()
        };
        SafekeeperNode {
            id: conf.id,
            conf: conf.clone(),
-            pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
+            pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
            env: env.clone(),
            http_client: reqwest::Client::new(),
-            http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
+            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
            listen_addr,
        }
    }
    /// Construct libpq connection string for connecting to this safekeeper.
-    fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
+    fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
-        PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
+        PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
    }
    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -118,8 +111,8 @@ impl SafekeeperNode {
        );
        io::stdout().flush().unwrap();
-        let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
+        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
-        let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
+        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
        let id = self.id;
        let datadir = self.datadir_path();
@@ -146,7 +139,7 @@ impl SafekeeperNode {
            availability_zone,
        ];
        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
-            let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
+            let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
            args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
        }
        if !self.conf.sync {
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -38,9 +38,6 @@ const COMMAND: &str = "storage_controller";
 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;
 // Use a shorter pageserver unavailability interval than the default to speed up tests.
 const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -272,18 +269,13 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;
        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
        let mut args = vec![
            "-l",
            &self.listen,
            "-p",
            self.path.as_ref(),
            "--dev",
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
            &max_unavailable.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
@@ -472,21 +464,11 @@ impl StorageController {
            .await
    }
    /// Issues `POST debug/v1/tenant/{tenant_id}/import` with no request body
    /// and decodes the reply as a `TenantCreateResponse`.
    #[instrument(skip(self))]
    pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
        let import_path = format!("debug/v1/tenant/{tenant_id}/import");
        let no_body: Option<()> = None;
        self.dispatch::<(), TenantCreateResponse>(Method::POST, import_path, no_body)
            .await
    }
    #[instrument(skip(self))]
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(
            Method::GET,
-            format!("debug/v1/tenant/{tenant_id}/locate"),
+            format!("control/v1/tenant/{tenant_id}/locate"),
            None,
        )
        .await
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -1,23 +0,0 @@
 [package]
 name = "storcon_cli"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
 [dependencies]
 anyhow.workspace = true
 clap.workspace = true
 comfy-table.workspace = true
 hyper.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 reqwest.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
 thiserror.workspace = true
 tokio.workspace = true
 tracing.workspace = true
 utils.workspace = true
 workspace_hack.workspace = true
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,681 +0,0 @@
 use std::{collections::HashMap, str::FromStr, time::Duration};
 use clap::{Parser, Subcommand};
 use hyper::{Method, StatusCode};
 use pageserver_api::{
    controller_api::{
        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
        TenantDescribeResponse, TenantPolicyRequest,
    },
    models::{
        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
    },
    shard::{ShardStripeSize, TenantShardId},
 };
 use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
 use reqwest::Url;
 use serde::{de::DeserializeOwned, Serialize};
 use utils::id::{NodeId, TenantId};
 use pageserver_api::controller_api::{
    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
    TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
 };
 // Subcommands of this CLI: each variant is one subcommand and its fields are
 // that subcommand's `--long` arguments.
 // NOTE(review): the `///` comments below are presumably picked up by clap's
 // derive as user-facing help text, so editing them changes CLI output - confirm
 // before rewording.
 #[derive(Subcommand, Debug)]
 enum Command {
    /// Register a pageserver with the storage controller.  This shouldn't usually be necessary,
    /// since pageservers auto-register when they start up
    NodeRegister {
        #[arg(long)]
        node_id: NodeId,
        #[arg(long)]
        listen_pg_addr: String,
        #[arg(long)]
        listen_pg_port: u16,
        #[arg(long)]
        listen_http_addr: String,
        #[arg(long)]
        listen_http_port: u16,
    },
    /// Modify a node's configuration in the storage controller
    NodeConfigure {
        #[arg(long)]
        node_id: NodeId,
        /// Availability is usually auto-detected based on heartbeats.  Set 'offline' here to
        /// manually mark a node offline
        #[arg(long)]
        availability: Option<NodeAvailabilityArg>,
        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
        #[arg(long)]
        scheduling: Option<NodeSchedulingPolicy>,
    },
    /// Modify a tenant's policies in the storage controller
    TenantPolicy {
        #[arg(long)]
        tenant_id: TenantId,
        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
        /// or is in the normal attached state with N secondary locations (`attached:N`)
        #[arg(long)]
        placement: Option<PlacementPolicyArg>,
        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant.  `active` is normal,
        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
        /// all reconciliation activity including for scheduling changes already made.  `pause` and `stop` can make a tenant
        /// unavailable, and are only for use in emergencies.
        #[arg(long)]
        scheduling: Option<ShardSchedulingPolicyArg>,
    },
    /// List nodes known to the storage controller
    Nodes {},
    /// List tenants known to the storage controller
    Tenants {},
    /// Create a new tenant in the storage controller, and by extension on pageservers.
    TenantCreate {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Delete a tenant in the storage controller, and by extension on pageservers.
    TenantDelete {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Split an existing tenant into a higher number of shards than its current shard count.
    TenantShardSplit {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        shard_count: u8,
        /// Optional, in 8kiB pages.  e.g. set 2048 for 16MB stripes.
        #[arg(long)]
        stripe_size: Option<u32>,
    },
    /// Migrate the attached location for a tenant shard to a specific pageserver.
    TenantShardMigrate {
        #[arg(long)]
        tenant_shard_id: TenantShardId,
        #[arg(long)]
        node: NodeId,
    },
    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
    /// that is passed through to pageservers, and does not affect storage controller behavior.
    TenantConfig {
        #[arg(long)]
        tenant_id: TenantId,
        #[arg(long)]
        config: String,
    },
    /// Attempt to balance the locations for a tenant across pageservers.  This is a client-side
    /// alternative to the storage controller's scheduling optimization behavior.
    TenantScatter {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// Print details about a particular tenant, including all its shards' states.
    TenantDescribe {
        #[arg(long)]
        tenant_id: TenantId,
    },
    /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
    /// mode so that it can warm up content on a pageserver.
    TenantWarmup {
        #[arg(long)]
        tenant_id: TenantId,
    },
 }
 // Top-level command-line arguments: where the storage controller lives, an
 // optional JWT for authentication, and the subcommand to run. `main` obtains
 // this via `Cli::parse()` and builds its `Client` from `api` and `jwt`.
 // NOTE(review): the `///` comments on the fields are presumably used by clap
 // as help text - confirm before rewording them.
 #[derive(Parser)]
 #[command(
    author,
    version,
    about,
    long_about = "CLI for Storage Controller Support/Debug"
 )]
 #[command(arg_required_else_help(true))]
 struct Cli {
    #[arg(long)]
    /// URL to storage controller.  e.g. http://127.0.0.1:1234 when using `neon_local`
    api: Url,
    #[arg(long)]
    /// JWT token for authenticating with storage controller.  Depending on the API used, this
    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
    /// a token with both scopes to use with this tool.
    jwt: Option<String>,
    #[command(subcommand)]
    command: Command,
 }
 /// Command-line wrapper around `PlacementPolicy`, parsed from a string.
 #[derive(Debug, Clone)]
 struct PlacementPolicyArg(PlacementPolicy);
 impl FromStr for PlacementPolicyArg {
    type Err = anyhow::Error;
    /// Parses `detached`, `secondary`, or `attached:<n>` where `<n>` is an
    /// unsigned integer.
    ///
    /// Everything after `attached:` must be the integer: input with trailing
    /// segments such as `attached:1:2` is rejected instead of having the
    /// extra text silently ignored.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "detached" => Ok(Self(PlacementPolicy::Detached)),
            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
            _ => match s.strip_prefix("attached:") {
                Some(count) => match count.parse::<usize>() {
                    Ok(n) => Ok(Self(PlacementPolicy::Attached(n))),
                    Err(_) => Err(anyhow::anyhow!(
                        "Invalid format '{s}', a valid example is 'attached:1'"
                    )),
                },
                None => Err(anyhow::anyhow!(
                    "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
                )),
            },
        }
    }
 }
 /// Command-line wrapper around `ShardSchedulingPolicy`, parsed from a string.
 #[derive(Debug, Clone)]
 struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
 impl FromStr for ShardSchedulingPolicyArg {
    type Err = anyhow::Error;
    /// Maps `active`, `essential`, `pause` and `stop` to the matching policy;
    /// any other input is an error.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let policy = match s {
            "active" => ShardSchedulingPolicy::Active,
            "essential" => ShardSchedulingPolicy::Essential,
            "pause" => ShardSchedulingPolicy::Pause,
            "stop" => ShardSchedulingPolicy::Stop,
            _ => {
                return Err(anyhow::anyhow!(
                    "Unknown scheduling policy '{s}', try active,essential,pause,stop"
                ))
            }
        };
        Ok(Self(policy))
    }
 }
 /// Command-line wrapper around `NodeAvailabilityWrapper`, parsed from a string.
 #[derive(Debug, Clone)]
 struct NodeAvailabilityArg(NodeAvailabilityWrapper);
 impl FromStr for NodeAvailabilityArg {
    type Err = anyhow::Error;
    /// Maps `active` and `offline` to the matching availability state; any
    /// other input is an error.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let state = match s {
            "active" => Some(NodeAvailabilityWrapper::Active),
            "offline" => Some(NodeAvailabilityWrapper::Offline),
            _ => None,
        };
        state
            .map(Self)
            .ok_or_else(|| anyhow::anyhow!("Unknown availability state '{s}'"))
    }
 }
 /// Thin HTTP client used to call the storage controller API.
 struct Client {
    /// URL the CLI was pointed at; only its scheme, host and port are used for requests.
    base_url: Url,
    /// Optional JWT, sent as a `Bearer` token on every request when present.
    jwt_token: Option<String>,
    client: reqwest::Client,
 }
 impl Client {
    /// Build a client for `base_url`, optionally authenticating with `jwt_token`.
    ///
    /// Panics if the underlying `reqwest` client cannot be constructed.
    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
        Self {
            base_url,
            jwt_token,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
        }
    }
    /// Simple HTTP request wrapper for calling into storage controller
    ///
    /// Sends `method` to `path` (relative, without a leading slash) on the configured
    /// host, with `body` serialized as JSON when given, and deserializes the JSON
    /// response into `RS`.
    ///
    /// Panics if the configured base URL has no host, or no port that is either explicit
    /// or implied by its scheme.
    async fn dispatch<RQ, RS>(
        &self,
        method: hyper::Method,
        path: String,
        body: Option<RQ>,
    ) -> mgmt_api::Result<RS>
    where
        RQ: Serialize + Sized,
        RS: DeserializeOwned + Sized,
    {
        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
        // for general purpose API access.
        //
        // `Url::port()` returns `None` when the port is the scheme's default (or was
        // omitted), so use `port_or_known_default()` to avoid panicking on URLs such as
        // `http://host/`. The scheme is carried over so that the default port matches it.
        let url = Url::from_str(&format!(
            "{}://{}:{}/{path}",
            self.base_url.scheme(),
            self.base_url
                .host_str()
                .expect("Storage controller URL must have a host"),
            self.base_url
                .port_or_known_default()
                .expect("Storage controller URL must have a known port")
        ))
        .expect("Failed to construct request URL");
        let mut builder = self.client.request(method, url);
        if let Some(body) = body {
            builder = builder.json(&body)
        }
        if let Some(jwt_token) = &self.jwt_token {
            builder = builder.header(
                reqwest::header::AUTHORIZATION,
                format!("Bearer {jwt_token}"),
            );
        }
        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
        // NOTE(review): `error_from_body` is defined outside this file; presumably it turns
        // non-success HTTP statuses into `mgmt_api::Error` -- confirm against its definition.
        let response = response.error_from_body().await?;
        response
            .json()
            .await
            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
    }
 }
 /// Entry point: parse the command line and run the selected subcommand against the
 /// storage controller.
 ///
 /// Two clients are used: `storcon_client` for the `control/v1/...` endpoints, and
 /// `vps_client`, a `mgmt_api::Client` pointed at the same base URL, for the
 /// pageserver-style tenant API calls.
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
    let cli = Cli::parse();
    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
    // Drop the trailing slash of the serialized URL, but only if there is one: an
    // unconditional `pop()` would corrupt a URL that ends in any other character.
    let mut trimmed = cli.api.to_string();
    if trimmed.ends_with('/') {
        trimmed.pop();
    }
    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
    match cli.command {
        Command::NodeRegister {
            node_id,
            listen_pg_addr,
            listen_pg_port,
            listen_http_addr,
            listen_http_port,
        } => {
            storcon_client
                .dispatch::<_, ()>(
                    Method::POST,
                    "control/v1/node".to_string(),
                    Some(NodeRegisterRequest {
                        node_id,
                        listen_pg_addr,
                        listen_pg_port,
                        listen_http_addr,
                        listen_http_port,
                    }),
                )
                .await?;
        }
        Command::TenantCreate { tenant_id } => {
            vps_client
                .tenant_create(&TenantCreateRequest {
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters::default(),
                    placement_policy: Some(PlacementPolicy::Attached(1)),
                    config: TenantConfig::default(),
                })
                .await?;
        }
        Command::TenantDelete { tenant_id } => {
            let status = vps_client
                .tenant_delete(TenantShardId::unsharded(tenant_id))
                .await?;
            tracing::info!("Delete status: {}", status);
        }
        Command::Nodes {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
            for node in resp {
                table.add_row([
                    format!("{}", node.id),
                    node.listen_http_addr,
                    format!("{:?}", node.scheduling),
                    format!("{:?}", node.availability),
                ]);
            }
            println!("{table}");
        }
        Command::NodeConfigure {
            node_id,
            availability,
            scheduling,
        } => {
            let req = NodeConfigureRequest {
                node_id,
                availability: availability.map(|a| a.0),
                scheduling,
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/node/{node_id}/config"),
                    Some(req),
                )
                .await?;
        }
        Command::Tenants {} => {
            let resp = storcon_client
                .dispatch::<(), Vec<TenantDescribeResponse>>(
                    Method::GET,
                    "control/v1/tenant".to_string(),
                    None,
                )
                .await?;
            let mut table = comfy_table::Table::new();
            table.set_header([
                "TenantId",
                "ShardCount",
                "StripeSize",
                "Placement",
                "Scheduling",
            ]);
            for tenant in resp {
                let tenant_id = format!("{}", tenant.tenant_id);
                // The first shard stands in for the whole tenant in the per-shard columns.
                // Report a tenant with no shards as an error instead of panicking.
                let shard_zero = tenant
                    .shards
                    .into_iter()
                    .next()
                    .ok_or_else(|| anyhow::anyhow!("Tenant {tenant_id} has no shards"))?;
                table.add_row([
                    tenant_id,
                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
                    format!("{:?}", tenant.stripe_size),
                    format!("{:?}", tenant.policy),
                    format!("{:?}", shard_zero.scheduling_policy),
                ]);
            }
            println!("{table}");
        }
        Command::TenantPolicy {
            tenant_id,
            placement,
            scheduling,
        } => {
            let req = TenantPolicyRequest {
                scheduling: scheduling.map(|s| s.0),
                placement: placement.map(|p| p.0),
            };
            storcon_client
                .dispatch::<_, ()>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/policy"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantShardSplit {
            tenant_id,
            shard_count,
            stripe_size,
        } => {
            let req = TenantShardSplitRequest {
                new_shard_count: shard_count,
                new_stripe_size: stripe_size.map(ShardStripeSize),
            };
            let response = storcon_client
                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_id}/shard_split"),
                    Some(req),
                )
                .await?;
            println!(
                "Split tenant {} into {} shards: {}",
                tenant_id,
                shard_count,
                response
                    .new_shards
                    .iter()
                    .map(|s| format!("{:?}", s))
                    .collect::<Vec<_>>()
                    .join(",")
            );
        }
        Command::TenantShardMigrate {
            tenant_shard_id,
            node,
        } => {
            let req = TenantShardMigrateRequest {
                tenant_shard_id,
                node_id: node,
            };
            storcon_client
                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                    Method::PUT,
                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
                    Some(req),
                )
                .await?;
        }
        Command::TenantConfig { tenant_id, config } => {
            let tenant_conf = serde_json::from_str(&config)?;
            vps_client
                .tenant_config(&TenantConfigRequest {
                    tenant_id,
                    config: tenant_conf,
                })
                .await?;
        }
        Command::TenantScatter { tenant_id } => {
            // Find the shards
            let locate_response = storcon_client
                .dispatch::<(), TenantLocateResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}/locate"),
                    None,
                )
                .await?;
            let shards = locate_response.shards;
            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
            let shard_count = shards.len();
            for s in shards {
                let entry = node_to_shards.entry(s.node_id).or_default();
                entry.push(s.shard_id);
            }
            // Load list of available nodes
            let nodes_resp = storcon_client
                .dispatch::<(), Vec<NodeDescribeResponse>>(
                    Method::GET,
                    "control/v1/node".to_string(),
                    None,
                )
                .await?;
            for node in nodes_resp {
                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
                    node_to_shards.entry(node.id).or_default();
                }
            }
            // With neither shards nor active nodes the division below would panic.
            if node_to_shards.is_empty() {
                anyhow::bail!("Tenant {tenant_id} has no shards and there are no active nodes");
            }
            let max_shard_per_node = shard_count / node_to_shards.len();
            // Spread the shards across the nodes, one migration per loop iteration
            loop {
                let mut migrate_shard = None;
                for shards in node_to_shards.values_mut() {
                    if shards.len() > max_shard_per_node {
                        // Take one shard from the first over-full node and stop looking:
                        // popping from several nodes here would drop all but the last
                        // shard from our bookkeeping without ever migrating them.
                        migrate_shard = Some(shards.pop().unwrap());
                        break;
                    }
                }
                let Some(migrate_shard) = migrate_shard else {
                    break;
                };
                // Pick the emptiest node to migrate to
                let mut destinations = node_to_shards
                    .iter()
                    .map(|(k, v)| (k, v.len()))
                    .collect::<Vec<_>>();
                destinations.sort_by_key(|i| i.1);
                let (destination_node, destination_count) = *destinations.first().unwrap();
                if destination_count + 1 > max_shard_per_node {
                    // Even the emptiest destination doesn't have space: we're done
                    break;
                }
                let destination_node = *destination_node;
                node_to_shards
                    .get_mut(&destination_node)
                    .unwrap()
                    .push(migrate_shard);
                println!("Migrate {} -> {} ...", migrate_shard, destination_node);
                storcon_client
                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
                        Method::PUT,
                        format!("control/v1/tenant/{migrate_shard}/migrate"),
                        Some(TenantShardMigrateRequest {
                            tenant_shard_id: migrate_shard,
                            node_id: destination_node,
                        }),
                    )
                    .await?;
                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
            }
        }
        Command::TenantDescribe { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;
            let shards = describe_response.shards;
            let mut table = comfy_table::Table::new();
            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
            for shard in shards {
                let secondary = shard
                    .node_secondary
                    .iter()
                    .map(|n| format!("{}", n))
                    .collect::<Vec<_>>()
                    .join(",");
                let mut status_parts = Vec::new();
                if shard.is_reconciling {
                    status_parts.push("reconciling");
                }
                if shard.is_pending_compute_notification {
                    status_parts.push("pending_compute");
                }
                if shard.is_splitting {
                    status_parts.push("splitting");
                }
                let status = status_parts.join(",");
                table.add_row([
                    format!("{}", shard.tenant_shard_id),
                    shard
                        .node_attached
                        .map(|n| format!("{}", n))
                        .unwrap_or(String::new()),
                    secondary,
                    shard.last_error,
                    status,
                ]);
            }
            println!("{table}");
        }
        Command::TenantWarmup { tenant_id } => {
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await;
            match describe_response {
                Ok(describe) => {
                    if matches!(describe.policy, PlacementPolicy::Secondary) {
                        // Fine: it's already known to controller in secondary mode: calling
                        // again to put it into secondary mode won't cause problems.
                    } else {
                        anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
                    }
                }
                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
                    // Fine: this tenant isn't known to the storage controller yet.
                }
                Err(e) => {
                    // Unexpected API error
                    return Err(e.into());
                }
            }
            vps_client
                .location_config(
                    TenantShardId::unsharded(tenant_id),
                    pageserver_api::models::LocationConfig {
                        mode: pageserver_api::models::LocationConfigMode::Secondary,
                        generation: None,
                        secondary_conf: Some(LocationConfigSecondary { warm: true }),
                        shard_number: 0,
                        shard_count: 0,
                        shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
                        tenant_conf: TenantConfig::default(),
                    },
                    None,
                    true,
                )
                .await?;
            let describe_response = storcon_client
                .dispatch::<(), TenantDescribeResponse>(
                    Method::GET,
                    format!("control/v1/tenant/{tenant_id}"),
                    None,
                )
                .await?;
            // Report a missing shard or secondary location as an error rather than panicking.
            let secondary_ps_id = describe_response
                .shards
                .first()
                .and_then(|shard| shard.node_secondary.first())
                .ok_or_else(|| {
                    anyhow::anyhow!("Tenant {tenant_id} has no secondary location to warm up")
                })?;
            println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
            // Poll the secondary download, printing progress, until it reports completion.
            loop {
                let (status, progress) = vps_client
                    .tenant_secondary_download(
                        TenantShardId::unsharded(tenant_id),
                        Some(Duration::from_secs(10)),
                    )
                    .await?;
                println!(
                    "Progress: {}/{} layers, {}/{} bytes",
                    progress.layers_downloaded,
                    progress.layers_total,
                    progress.bytes_downloaded,
                    progress.bytes_total
                );
                match status {
                    StatusCode::OK => {
                        println!("Download complete");
                        break;
                    }
                    StatusCode::ACCEPTED => {
                        // Loop
                    }
                    _ => {
                        anyhow::bail!("Unexpected download status: {status}");
                    }
                }
            }
        }
    }
    Ok(())
 }
--- a/diesel.toml
+++ b/diesel.toml
@@ -2,8 +2,8 @@
 # see https://diesel.rs/guides/configuring-diesel-cli
 [print_schema]
-file = "storage_controller/src/schema.rs"
+file = "control_plane/attachment_service/src/schema.rs"
 custom_type_derives = ["diesel::query_builder::QueryId"]
 [migrations_directory]
-dir = "storage_controller/migrations"
+dir = "control_plane/attachment_service/migrations"
--- a/docs/rfcs/031-sharding-static.md
+++ b/docs/rfcs/031-sharding-static.md
@@ -1,408 +0,0 @@
 # Sharding Phase 1: Static Key-space Sharding
 ## Summary
 To enable databases with sizes approaching the capacity of a pageserver's disk,
 it is necessary to break up the storage for the database, or _shard_ it.
 Sharding in general is a complex area. This RFC aims to define an initial
 capability that will permit creating large-capacity databases using a static configuration
 defined at time of Tenant creation.
 ## Motivation
 Currently, all data for a Tenant, including all its timelines, is stored on a single
 pageserver. The local storage required may be several times larger than the actual
 database size, due to LSM write inflation.
 If a database is larger than what one pageserver can hold, then it becomes impossible
 for the pageserver to hold it in local storage, as it must do to provide service to
 clients.
 ### Prior art
 In Neon:
 - Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Konstantin-21fd9b11b618475da5f39c61dd8ab7a4
 - Layer File Spreading: https://www.notion.so/neondatabase/One-Pager-Layer-File-Spreading-Christian-eb6b64182a214e11b3fceceee688d843
 - Key Space partitioning: https://www.notion.so/neondatabase/One-Pager-Key-Space-Partitioning-Stas-8e3a28a600a04a25a68523f42a170677
 Prior art in other distributed systems is too broad to capture here: pretty much
 any scale out storage system does something like this.
 ## Requirements
 - Enable creating a large (for example, 16TiB) database without requiring dedicated
  pageserver nodes.
 - Share read/write bandwidth costs for large databases across pageservers, as well
  as storage capacity, in order to avoid large capacity databases acting as I/O hotspots
  that disrupt service to other tenants.
 - Our data distribution scheme should handle sparse/nonuniform keys well, since postgres
  does not write out a single contiguous range of page numbers.
 _Note: the definition of 'large database' is arbitrary, but the lower bound is to ensure that a database
 that a user might create on a current-gen enterprise SSD should also work well on
 Neon. The upper bound is whatever postgres can handle: i.e. we must make sure that the
 pageserver backend is not the limiting factor in the database size_.
 ## Non Goals
 - Independently distributing timelines within the same tenant. If a tenant has many
  timelines, then sharding may be a less efficient mechanism for distributing load than
  sharing out timelines between pageservers.
 - Distributing work in the LSN dimension: this RFC focuses on the Key dimension only,
  based on the idea that separate mechanisms will make sense for each dimension.
 ## Impacted Components
 pageserver, control plane, postgres/smgr
 ## Terminology
 **Key**: a postgres page number, qualified by relation. In the sense that the pageserver is a versioned key-value store,
 the page number is the key in that store. `Key` is a literal data type in existing code.
 **LSN dimension**: this just means the range of LSNs (history), when talking about the range
 of keys and LSNs as a two dimensional space.
 ## Implementation
 ### Key sharding vs. LSN sharding
 When we think of sharding across the two dimensional key/lsn space, this is an
 opportunity to think about how the two dimensions differ:
 - Sharding the key space distributes the _write_ workload of ingesting data
  and compacting. This work must be carefully managed so that exactly one
  node owns a given key.
 - Sharding the LSN space distributes the _historical read_ workload. This work
  can be done by anyone without any special coordination, as long as they can
  see the remote index and layers.
 The key sharding is the harder part, and also the more urgent one, to support larger
 capacity databases. Because distributing historical LSN read work is a relatively
 simpler problem that most users don't have, we defer it to future work. It is anticipated
 that some quite simple P2P offload model will enable distributing work for historical
 reads: a node which is low on space can call out to a peer to ask it to download and
 serve reads from a historical layer.
 ### Key mapping scheme
 Having decided to focus on key sharding, we must next decide how we will map
 keys to shards. It is proposed to use a "wide striping" approach, to obtain a good compromise
 between data locality and avoiding entire large relations mapping to the same shard.
 We will define two spaces:
 - Key space: unsigned integer
 - Shard space: integer from 0 to N-1, where we have N shards.
 ### Key -> Shard mapping
 Keys are currently defined in the pageserver's getpage@lsn interface as follows:
 ```
 pub struct Key {
    pub field1: u8,
    pub field2: u32,
    pub field3: u32,
    pub field4: u32,
    pub field5: u8,
    pub field6: u32,
 }
 fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
        field3: rel.dbnode,
        field4: rel.relnode,
        field5: rel.forknum,
        field6: blknum,
    }
 }
 ```
 _Note: keys for relation metadata are ignored here, as this data will be mirrored to all
 shards. For distribution purposes, we only care about user data keys_
 The properties we want from our Key->Shard mapping are:
 - Locality in `blknum`, such that adjacent `blknum` will usually map to
  the same stripe and consequently land on the same shard, even though the overall
  collection of blocks in a relation will be spread over many stripes and therefore
  many shards.
 - Avoid the same blknum on different relations landing on the same stripe, so that
  with many small relations we do not end up aliasing data to the same stripe/shard.
 - Avoid vulnerability to aliasing in the values of relation identity fields, such that
  if there are patterns in the value of `relnode`, these do not manifest as patterns
  in data placement.
 To accomplish this, the blknum is used to select a stripe, and stripes are
 assigned to shards in a pseudorandom order via a hash. The motivation for
 pseudo-random distribution (rather than sequential mapping of stripe to shard)
 is to avoid I/O hotspots when sequentially reading multiple relations: we don't want
 all relations' stripes to touch pageservers in the same order.
 To map a `Key` to a shard:
 - Hash the `Key` field 4 (relNode).
 - Divide field 6 (`blknum`) by the stripe size in pages, and combine the
  hash of this with the hash from the previous step.
 - The total hash modulo the shard count gives the shard holding this key.
 Why don't we use the other fields in the Key?
 - We ignore `forknum` for key mapping, because it distinguishes different classes of data
  in the same relation, and we would like to keep the data in a relation together.
 - We would like to use spcNode and dbNode, but cannot. Postgres database creation operations can refer to an existing database as a template, such that the created
  database's blocks differ only by spcNode and dbNode from the original. To enable running
  this type of creation without cross-pageserver communication, we must ensure that these
  blocks map to the same shard -- we do this by excluding spcNode and dbNode from the hash.
 ### Data placement examples
 For example, consider these extreme large-database cases of postgres data layout in a system with 8 shards
 and a stripe size of 32k pages:
 - A single large relation: `blknum` division will break the data up into 4096
  stripes, which will be scattered across the shards.
 - 4096 relations of 32k pages each: each relation will map to exactly one stripe,
  and that stripe will be placed according to the hash of key field 4. The
  data placement will be statistically uniform across shards.
 Data placement will be more uneven on smaller databases:
 - A tenant with 2 shards and 2 relations of one stripe size each: there is a 50% chance
  that both relations land on the same shard and no data lands on the other shard.
 - A tenant with 8 shards and one relation of size 12 stripes: 4 shards will have double
  the data of the other four shards.
 These uneven cases for small amounts of data do not matter, as long as the stripe size
 is an order of magnitude smaller than the amount of data we are comfortable holding
 in a single shard: if our system handles shard sizes up to 10-100GB, then it is not an issue if
 a tenant has some shards with 256MB size and some shards with 512MB size, even though
 the standard deviation of shard size within the tenant is very high. Our key mapping
 scheme provides a statistical guarantee that as the tenant's overall data size increases,
 uniformity of placement will improve.
 ### Important Types
 #### `ShardIdentity`
 Provides the information needed to know whether a particular key belongs
 to a particular shard:
 - Layout version
 - Stripe size
 - Shard count
 - Shard index
 This structure's size is constant. Note that if we had used a different key
 mapping scheme such as consistent hashing with explicit hash ranges assigned
 to each shard, then the ShardIdentity's size would grow with the shard count: the simpler
 key mapping scheme used here enables a small fixed size ShardIdentity.
 ### Pageserver changes
 #### Structural
 Everywhere the Pageserver currently deals with Tenants, it will move to dealing with
 `TenantShard`s, which are just a `Tenant` plus a `ShardIdentity` telling it which part
 of the keyspace it owns. An un-sharded tenant is just a `TenantShard` whose `ShardIdentity`
 covers the whole keyspace.
 When the pageserver writes layers and index_part.json to remote storage, it must
 include the shard index & count in the name, to avoid collisions (the count is
 necessary for future-proofing: the count will vary in time). These keys
 will also include a generation number: the [generation numbers](025-generation-numbers.md) system will work
 exactly the same for TenantShards as it does for Tenants today: each shard will have
 its own generation number.
 #### Storage Format: Keys
 For tenants with >1 shard, layer files implicitly become sparse: within the key
 range described in the layer name, the layer file for a shard will only hold the
 content relevant to stripes assigned to the shard.
 For this reason, the LayerFileName within a tenant is no longer unique: different shards
 may use the same LayerFileName to refer to different data. We may solve this simply
 by including the shard number in the keys used for layers.
 The shard number will be included as a prefix (as part of tenant ID), like this:
 `pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/<layer file name>-<generation>`
 `pageserver/v1/tenants/<tenant_id>-<shard_number><shard_count>/timelines/<timeline id>/index_part.json-<generation>`
 Reasons for this particular format:
 - Use of a prefix is convenient for implementation (no need to carry the shard ID everywhere
  we construct a layer file name), and enables efficient listing of index_parts within
  a particular shard-timeline prefix.
 - Including the shard _count_ as well as shard number means that in future when we implement
  shard splitting, it will be possible for a parent shard and one of its children to write
  the same layer file without a name collision. For example, a parent shard 0_1 might split
  into two (0_2, 1_2), and in the process of splitting shard 0_2 could write a layer or index_part
  that is distinct from what shard 0_1 would have written at the same place.
 In practice, we expect shard counts to be relatively small, so a `u8` will be sufficient,
 and therefore the shard part of the path can be a fixed-length hex string like `{:02X}{:02X}`,
 for example a single-shard tenant's prefix will be `0001`.
 For backward compatibility, we may define a special `ShardIdentity` that has shard_count==0,
 and use this as a cue to construct paths with no prefix at all.
 #### Storage Format: Indices
 In the phase 1 described in this RFC, shards only reference layers they write themselves. However,
 when we implement shard splitting in future, it will be useful to enable shards to reference layers
 written by other shards (specifically the parent shard during a split), so that shards don't
 have to exhaustively copy all data into their own shard-prefixed keys.
 To enable this, the `IndexPart` structure will be extended to store the (shard number, shard count)
 tuple on each layer, such that it can construct paths for layers written by other shards. This
 naturally raises the question of who "owns" such layers written by ancestral shards: this problem
 will be addressed in phase 2.
 For backward compatibility, any index entry without shard information will be assumed to be
 in the legacy `ShardIdentity`.
 #### WAL Ingest
 In Phase 1, all shards will subscribe to the safekeeper to download WAL content. They will filter
 it down to the pages relevant to their shard:
 - For ordinary user data writes, only retain a write if it matches the ShardIdentity
 - For metadata describing relations etc, all shards retain these writes.
 The pageservers must somehow give the safekeeper correct feedback on remote_consistent_lsn:
 one solution here is for the 0th shard to periodically peek at the IndexParts for all the other shards,
 and have only the 0th shard populate remote_consistent_lsn. However, this is relatively
 expensive: if the safekeeper can be made shard-aware then it could be taught to use
 the max() of all shards' remote_consistent_lsns to decide when to trim the WAL.
 #### Compaction/GC
 No changes needed.
 The pageserver doesn't have to do anything special during compaction
 or GC. It is implicitly operating on the subset of keys that map to its ShardIdentity.
 This will result in sparse layer files, containing keys only in the stripes that this
 shard owns. Where optimizations currently exist in compaction for spotting "gaps" in
 the key range, these should be updated to ignore gaps that are due to sharding, to
 avoid spuriously splitting up layers into stripe-sized pieces.
 ### Compute Endpoints
 Compute endpoints will need to:
 - Accept a vector of connection strings as part of their configuration from the control plane
 - Route pageserver requests according to mapping the hash of key to the correct
  entry in the vector of connection strings.
 Doing this in compute rather than routing requests via a single pageserver is
 necessary to enable sharding tenants without adding latency from extra hops.
 ### Control Plane
 Tenants, or _Projects_ in the control plane, will each own a set of TenantShards (this will
 be 1 for small tenants). Logic for placement of tenant shards is just the same as the current logic for placing
 tenants.
 Tenant lifecycle operations like deletion will require fanning-out to all the shards
 in the tenant. The same goes for timeline creation and deletion: a timeline should
 not be considered created until it has been created in all shards.
 #### Selectively enabling sharding for large tenants
 Initially, we will explicitly enable sharding for large tenants only.
 In future, this hint mechanism will become optional when we implement automatic
 re-sharding of tenants.
 ## Future Phases
 This section exists to indicate what will likely come next after this phase.
 Phases 2a and 2b are amenable to execution in parallel.
 ### Phase 2a: WAL fan-out
 **Problem**: when all shards consume the whole WAL, the network bandwidth used
 for transmitting the WAL from safekeeper to pageservers is multiplied by a factor
 of the shard count.
 Network bandwidth is not our most pressing bottleneck, but it is likely to become
 a problem if we set a modest shard count (~8) on a significant number of tenants,
 especially as those larger tenants which we shard are also likely to have higher
 write bandwidth than average.
 ### Phase 2b: Shard Splitting
 **Problem**: the number of shards in a tenant is defined at creation time and cannot
 be changed. This causes excessive sharding for most small tenants, and an upper
 bound on scale for very large tenants.
 To address this, a _splitting_ feature will later be added. One shard can split its
 data into a number of children by doing a special compaction operation to generate
 image layers broken up child-shard-wise, and then writing out an `index_part.json` for
 each child. This will then require external coordination (by the control plane) to
 safely attach these new child shards and then move them around to distribute work.
 The opposite _merging_ operation can also be imagined, but is unlikely to be implemented:
 once a Tenant has been sharded, the marginal efficiency benefit of merging is unlikely to justify
 the risk/complexity of implementing such a rarely-encountered scenario.
 ### Phase N (future): distributed historical reads
 **Problem**: while sharding based on key is good for handling changes in overall
 database size, it is less suitable for spiky/unpredictable changes in the read
 workload to historical layers. Sudden increases in historical reads could result
 in sudden increases in local disk capacity required for a TenantShard.
 Example: the extreme case of this would be to run a tenant for a year, then create branches
 with ancestors at monthly intervals. This could lead to a sudden 12x inflation in
 the on-disk capacity footprint of a TenantShard, since it would be serving reads
 from all those disparate historical layers.
 If we can respond fast enough, then key-sharding a tenant more finely can help with
 this, but splitting may be a relatively expensive operation and the increased historical
 read load may be transient.
 A separate mechanism for handling heavy historical reads could be something like
 a gossip mechanism for pageservers to communicate
 about their workload, and then a getpageatlsn offload mechanism where one pageserver can
 ask another to go read the necessary layers from remote storage to serve the read. This
 requires relatively little coordination because it is read-only: any node can service any
 read. All reads to a particular shard would still flow through one node, but the
 disk capacity & I/O impact of servicing the read would be distributed.
 ## FAQ/Alternatives
 ### Why stripe the data, rather than using contiguous ranges of keyspace for each shard?
 When a database is growing under a write workload, writes may predominantly hit the
 end of the keyspace, creating a bandwidth hotspot on that shard. Similarly, if the user
 is intensively re-writing a particular relation, if that relation lived in a particular
 shard then it would not achieve our goal of distributing the write work across shards.
 ### Why not proxy read requests through one pageserver, so that endpoints don't have to change?
 1. This would not achieve scale-out of network bandwidth: a busy tenant with a large
   database would still cause a load hotspot on the pageserver routing its read requests.
 2. The additional hop through the "proxy" pageserver would add latency and overall
   resource cost (CPU, network bandwidth)
 ### Layer File Spreading: use one pageserver as the owner of a tenant, and have it spread out work on a per-layer basis to peers
 In this model, there would be no explicit sharding of work, but the pageserver to which
 a tenant is attached would not hold all layers on its disk: instead, it would call out
 to peers to have them store some layers, and call out to those peers to request reads
 in those layers.
 This mechanism will work well for distributing work in the LSN dimension, but in the key
 space dimension it has the major limitation of requiring one node to handle all
 incoming writes, and compactions. Even if the write workload for a large database
 fits in one pageserver, it will still be a hotspot and such tenants may still
 de-facto require their own pageserver.
--- a/docs/rfcs/032-shard-splitting.md
+++ b/docs/rfcs/032-shard-splitting.md
@@ -1,479 +0,0 @@
 # Shard splitting
 ## Summary
 This RFC describes a new pageserver API for splitting an existing tenant shard into
 multiple shards, and describes how to use this API to safely increase the total
 shard count of a tenant.
 ## Motivation
 In the [sharding RFC](031-sharding-static.md), a mechanism was introduced to scale
 tenants beyond the capacity of a single pageserver by breaking up the key space
 into stripes, and distributing these stripes across many pageservers. However,
 the shard count was defined once at tenant creation time and not varied thereafter.
 In practice, the expected size of a database is rarely known at creation time, and
 it is inefficient to enable sharding for very small tenants: we need to be
 able to create a tenant with a small number of shards (such as 1), and later expand
 when it becomes clear that the tenant has grown in size to a point where sharding
 is beneficial.
 ### Prior art
 Many distributed systems have the problem of choosing how many shards to create for
 tenants that do not specify an expected size up-front. There are a couple of general
 approaches:
 - Write to a key space in order, and start a new shard when the highest key advances
  past some point. This doesn't work well for Neon, because we write to our key space
  in many different contiguous ranges (per relation), rather than in one contiguous
  range. To adapt to this kind of model, we would need a sharding scheme where each
  relation had its own range of shards, which would be inefficient for the common
  case of databases with many small relations.
 - Monitor the system, and automatically re-shard at some size threshold. For
  example in Ceph, the [pg_autoscaler](https://github.com/ceph/ceph/blob/49c27499af4ee9a90f69fcc6bf3597999d6efc7b/src/pybind/mgr/pg_autoscaler/module.py)
  component monitors the size of each RADOS Pool, and adjusts the number of Placement
  Groups (Ceph's shard equivalent).
 ## Requirements
 - A configurable capacity limit per-shard is enforced.
 - Changes in shard count do not interrupt service beyond requiring postgres
  to reconnect (i.e. milliseconds).
 - A human being does not have to choose the shard count.
 ## Non Goals
 - Shard splitting is always a tenant-global operation: we will not enable splitting
  one shard while leaving others intact.
 - The inverse operation (shard merging) is not described in this RFC. This is a lower
  priority than splitting, because databases grow more often than they shrink, and
  a database with many shards will still work properly if the stored data shrinks, just
  with slightly more overhead (e.g. redundant WAL replication)
 - Shard splitting is only initiated based on capacity bounds, not load. Splitting
  a tenant based on load will make sense for some medium-capacity, high-load workloads,
  but is more complex to reason about and likely is not desirable until we have
  shard merging to reduce the shard count again if the database becomes less busy.
 ## Impacted Components
 pageserver, storage controller
 (the _storage controller_ is the evolution of what was called `attachment_service` in our test environment)
 ## Terminology
 **Parent** shards are the shards that exist before a split. **Child** shards are
 the new shards created during a split.
 **Shard** is synonymous with _tenant shard_.
 **Shard Index** is the 2-tuple of shard number and shard count, written in
 paths as {:02x}{:02x}, e.g. `0001`.
 ## Background
 In the implementation section, a couple of existing aspects of sharding are important
 to remember:
 - Shard identifiers contain the shard number and count, so that "shard 0 of 1" (`0001`) is
  a distinct shard from "shard 0 of 2" (`0002`). This is the case in key paths, local
  storage paths, and remote index metadata.
 - Remote layer file paths contain the shard index of the shard that created them, and
  remote indices contain the same index to enable building the layer file path. A shard's
  index may reference layers that were created by another shard.
 - Local tenant shard directories include the shard index. All layers downloaded by
  a tenant shard are stored in this shard-prefixed path, even if those layers were
  initially created by another shard: tenant shards do not read and write one another's
  paths.
 - The `Tenant` pageserver type represents one tenant _shard_, not the whole tenant.
  This is for historical reasons and will be cleaned up in future, but the existing
  name is used here to help comprehension when reading code.
 ## Implementation
 Note: this section focuses on the correctness of the core split process. This will
 be fairly inefficient in a naive implementation, and several important optimizations
 are described in a later section.
 There are broadly two parts to the implementation:
 1. The pageserver split API, which splits one shard on one pageserver
 2. The overall tenant split process which is coordinated by the storage controller,
   and calls into the pageserver split API as needed.
 ### Pageserver Split API
 The pageserver will expose a new API endpoint at `/v1/tenant/:tenant_shard_id/shard_split`
 that takes the new total shard count in the body.
 The pageserver split API operates on one tenant shard, on one pageserver. External
 coordination is required to use it safely: this is described in the later
 'Split procedure' section.
 #### Preparation
 First identify the shard indices for the new child shards. These are deterministic,
 calculated from the parent shard's index, and the number of children being created (this
 is an input to the API, and validated to be a power of two). In a trivial example, splitting
 0001 in two always results in 0002 and 0102.
 Child shard indices are chosen such that the children's parts of the keyspace will
 be subsets of the parent's parts of the keyspace.
 #### Step 1: write new remote indices
 In remote storage, splitting is very simple: we may just write new index_part.json
 objects for each child shard, containing exactly the same layers as the parent shard.
 The children will have more data than they need, but this avoids any exhaustive
 re-writing or copying of layer files.
 The index key path includes a generation number: the parent shard's current
 attached generation number will also be used for the child shards' indices. This
 makes the operation safely retryable: if everything crashes and restarts, we may
 call the split API again on the parent shard, and the result will be some new remote
 indices for the child shards, under a higher generation number.
 #### Step 2: start new `Tenant` objects
 A new `Tenant` object may be instantiated for each child shard, while the parent
 shard still exists. When calling the tenant_spawn function for this object,
 the remote index from step 1 will be read, and the child shard will start
 to ingest WAL to catch up from whatever was in the remote storage at step 1.
 We now wait for child shards' WAL ingestion to catch up with the parent shard,
 so that we can safely tear down the parent shard without risking an availability
 gap to clients reading recent LSNs.
 #### Step 3: tear down parent `Tenant` object
 Once child shards are running and have caught up with WAL ingest, we no longer
 need the parent shard. Note that clients may still be using it -- when we
 shut it down, any page_service handlers will also shut down, causing clients
 to disconnect. When the client reconnects, it will re-lookup the tenant,
 and hit the child shard instead of the parent (shard lookup from page_service
 should bias toward higher ShardCount shards).
 Note that at this stage the page service client has not yet been notified of
 any split. In the trivial single split example:
 - Shard 0001 is gone: Tenant object torn down
 - Shards 0002 and 0102 are running on the same pageserver where Shard 0001 used to live.
 - Clients will continue to connect to that server thinking that shard 0001 is there,
  and all requests will work, because any key that was in shard 0001 is definitely
  available in either shard 0002 or shard 0102.
 - Eventually, the storage controller (not the pageserver) will decide to migrate
  some child shards away: at that point it will do a live migration, ensuring
  that the client has an updated configuration before it detaches anything
  from the original server.
 #### Complete
 When we send a 200 response to the split request, we are promising the caller:
 - That the child shards are persistent in remote storage
 - That the parent shard has been shut down
 This enables the caller to proceed with the overall shard split operation, which
 may involve other shards on other pageservers.
 ### Storage Controller Split procedure
 Splitting a tenant requires calling the pageserver split API, and tracking
 enough state to ensure recovery + completion in the event of any component (pageserver
 or storage controller) crashing (or request timing out) during the split.
 1. call the split API on all existing shards. Ensure that the resulting
   child shards are pinned to their pageservers until _all_ the split calls are done.
   This pinning may be implemented as a "split bit" on the tenant shards, that
   blocks any migrations, and also acts as a sign that if we restart, we must go
   through some recovery steps to resume the split.
 2. Once all the split calls are done, we may unpin the child shards (clear
   the split bit). The split is now complete: subsequent steps are just migrations,
   not strictly part of the split.
 3. Try to schedule new pageserver locations for the child shards, using
   a soft anti-affinity constraint to place shards from the same tenant onto different
   pageservers.
 Updating computes about the new shard count is not necessary until we migrate
 any of the child shards away from the parent's location.
 ### Recovering from failures
 #### Rolling back an incomplete split
 An incomplete shard split may be rolled back quite simply, by attaching the parent shards to pageservers,
 and detaching child shards. This will lose any WAL ingested into the children after the parents
 were detached earlier, but the parents will catch up.
 No special pageserver API is needed for this. From the storage controller's point of view, the
 procedure is:
 1. For all parent shards in the tenant, ensure they are attached
 2. For all child shards, ensure they are not attached
 3. Drop child shards from the storage controller's database, and clear the split bit on the parent shards.
 Any remote storage content for child shards is left behind. This is similar to other cases where
 we may leave garbage objects in S3 (e.g. when we upload a layer but crash before uploading an
 index that references it). Future online scrub/cleanup functionality can remove these objects, or
 they will be removed when the tenant is deleted, as tenant deletion lists all objects in the prefix,
 which would include any child shards that were rolled back.
 If any timelines had been created on child shards, they will be lost when rolling back. To mitigate
 this, we will **block timeline creation during splitting**, so that we can safely roll back until
 the split is complete, without risking losing timelines.
 Rolling back an incomplete split will happen automatically if a split fails due to some fatal
 reason, and will not be accessible via an API:
 - A pageserver fails to complete its split API request after too many retries
 - A pageserver returns a fatal unexpected error such as 400 or 500
 - The storage controller database returns a non-retryable error
 - Some internal invariant is violated in the storage controller split code
 #### Rolling back a complete split
 A complete shard split may be rolled back similarly to an incomplete split, with the following
 modifications:
 - The parent shards will no longer exist in the storage controller database, so these must
  be re-synthesized somehow: the hard part of this is figuring the parent shards' generations. This
  may be accomplished either by probing in S3, or by retaining some tombstone state for deleted
  shards in the storage controller database.
 - Any timelines that were created after the split completed will disappear when rolling back
  to the tenant shards. For this reason, rolling back after a complete split should only
  be done due to serious issues where loss of recently created timelines is acceptable, or
  in cases where we have confirmed that no timelines were created in the intervening period.
 - Parent shards' layers must not have been deleted: this property will come "for free" when
  we first roll out sharding, by simply not implementing deletion of parent layers after
  a split. When we do implement such deletion (see "Cleaning up parent-shard layers" in the
  Optimizations section), it should apply a TTL to layers such that we have a
  defined walltime window in which rollback will be possible.
 The storage controller will expose an API for rolling back a complete split, for use
 in the field if we encounter some critical bug with a post-split tenant.
 #### Retrying API calls during Pageserver Restart
 When a pageserver restarts during a split API call, it may witness on-disk content for both parent and
 child shards from an ongoing split. This does not intrinsically break anything, and the
 pageserver may include all these shards in its `/re-attach` request to the storage controller.
 In order to support such restarts, it is important that the storage controller stores
 persistent records of each child shard before it calls into a pageserver, as these child shards
 may require generation increments via a `/re-attach` request.
 The pageserver restart will also result in a failed API call from the storage controller's point
 of view. Recall that if _any_ pageserver fails to split, the overall split operation may not
 complete, and all shards must remain pinned to their current pageserver locations until the
 split is done.
 The pageserver API calls during splitting will retry on transient errors, so that
 short availability gaps do not result in a failure of the overall operation. The
 split in progress will be automatically rolled back if the threshold for API
 retries is reached (e.g. if a pageserver stays offline for longer than a typical
 restart).
 #### Rollback on Storage Controller Restart
 On startup, the storage controller will inspect the split bit for tenant shards that
 it loads from the database. If any splits are in progress:
 - Database content will be reverted to the parent shards
 - Child shards will be dropped from memory
 - The parent and child shards will be included in the general startup reconciliation that
  the storage controller does: any child shards will be detached from pageservers because
  they don't exist in the storage controller's expected set of shards, and parent shards
  will be attached if they aren't already.
 #### Storage controller API request failures/retries
 The split request handler will implement idempotency: if the [`Tenant`] requested to split
 doesn't exist, we will check for the would-be child shards, and if they already exist,
 we consider the request complete.
 If a request is retried while the original request is still underway, then the split
 request handler will notice an InProgress marker in TenantManager, and return 503
 to encourage the client to backoff/retry. This is the same as the general pageserver
 API handling for calls that try to act on an InProgress shard.
 #### Compute start/restart during a split
 If a compute starts up during split, it will be configured with the old sharding
 configuration. This will work for reads irrespective of the progress of the split
 as long as no child shards have been migrated away from their original location, and
 this is guaranteed in the split procedure (see earlier section).
 #### Pageserver fails permanently during a split
 If a pageserver permanently fails (i.e. the storage controller availability state for it
 goes to Offline) while a split is in progress, the splitting operation will roll back, and
 during the roll back it will skip any API calls to the offline pageserver. If the offline
 pageserver becomes available again, any stale locations will be cleaned up via the normal reconciliation process (the `/re-attach` API).
 ### Handling secondary locations
 For correctness, it is not necessary to split secondary locations. We can simply detach
 the secondary locations for parent shards, and then attach new secondary locations
 for child shards.
 Clearly this is not optimal, as it will result in re-downloads of layer files that
 were already present on disk. See "Splitting secondary locations" in the Optimizations section.
 ### Conditions to trigger a split
 The pageserver will expose a new API for reporting on shards that are candidates
 for split: this will return a top-N report of the largest tenant shards by
 physical size (remote size). This should exclude any tenants that are already
 at the maximum configured shard count.
 The API would look something like:
 `/v1/top_n_tenant?shard_count_lt=8&sort_by=resident_size`
 The storage controller will poll that API across all pageservers it manages at some appropriate interval (e.g. 60 seconds).
 A split operation will be started when the tenant exceeds some threshold. This threshold
 should be _less than_ how large we actually want shards to be, perhaps much less. That's to
 minimize the amount of work involved in splitting -- if we want 100GiB shards, we shouldn't
 wait for a tenant to exceed 100GiB before we split anything. Some data analysis of existing
 tenant size distribution may be useful here: if we can make a statement like "usually, if
 a tenant has exceeded 20GiB they're probably going to exceed 100GiB later", then we might
 make our policy to split a tenant at 20GiB.
 The finest split we can do is by factors of two, but we can do higher-cardinality splits
 too, and this will help to reduce the overhead of repeatedly re-splitting a tenant
 as it grows. An example of a very simple heuristic for early deployment of the splitting
 feature would be: "Split tenants into 8 shards when their physical size exceeds 64GiB": that
 would give us two kinds of tenant (1 shard and 8 shards), and the confidence that once we had
 split a tenant, it will not need re-splitting soon after.
 ## Optimizations
 ### Flush parent shard to remote storage during split
 Any data that is in WAL but not remote storage at time of split will need
 to be replayed by child shards when they start for the first time. To minimize
 this work, we may flush the parent shard to remote storage before writing the
 remote indices for child shards.
 It is important that this flush is subject to some time bounds: we may be splitting
 in response to a surge of write ingest, so it may be time-critical to split. A
 few seconds to flush latest data should be sufficient to optimize common cases without
 running the risk of holding up a split for a harmful length of time when a parent
 shard is being written heavily. If the flush doesn't complete in time, we may proceed
 to shut down the parent shard and carry on with the split.
 ### Hard linking parent layers into child shard directories
 Before we start the Tenant objects for child shards, we may pre-populate their
 local storage directories with hard links to the layer files already present
 in the parent shard's local directory. When the child shard starts and downloads
 its remote index, it will find all those layer files already present on local disk.
 This avoids wasting download capacity and makes splitting faster, but more importantly
 it avoids taking up a factor of N more disk space when splitting 1 shard into N.
 This mechanism will work well in typical flows where shards are migrated away
 promptly after a split, but for the general case including what happens when
 layers are evicted and re-downloaded after a split, see the 'Proactive compaction'
 section below.
 ### Filtering during compaction
 Compaction, especially image layer generation, should skip any keys that are
 present in a shard's layer files, but do not match the shard's ShardIdentity's
 is_key_local() check. This avoids carrying around data for longer than necessary
 in post-split compactions.
 This was already implemented in https://github.com/neondatabase/neon/pull/6246
 ### Proactive compaction
 In remote storage, there is little reason to rewrite any data on a shard split:
 all the children can reference parent layers via the very cheap write of the child
 index_part.json.
 In local storage, things are more nuanced. During the initial split there is no
 capacity cost to duplicating parent layers, if we implement the hard linking
 optimization described above. However, as soon as any layers are evicted from
 local disk and re-downloaded, the downloaded layers will not be hard-links any more:
 they'll have real capacity footprint. That isn't a problem if we migrate child shards
 away from the parent node swiftly, but it risks a significant over-use of local disk
 space if we do not.
 For example, if we did an 8-way split of a shard, and then _didn't_ migrate 7 of
 the shards elsewhere, then churned all the layers in all the shards via eviction,
 then we would blow up the storage capacity used on the node by 8x. If we're splitting
 a 100GB shard, that could take the pageserver to the point of exhausting disk space.
 To avoid this scenario, we could implement a special compaction mode where we just
 read historic layers, drop unwanted keys, and write back the layer file. This
 is pretty expensive, but useful if we have split a large shard and are not going to
 migrate the child shards away.
 The heuristic conditions for triggering such a compaction are:
 - A) eviction plus time: if a child shard
  has existed for more than a time threshold, and has been requested to perform at least one eviction, then it becomes urgent for this child shard to execute a proactive compaction to reduce its storage footprint, at the cost of I/O load.
 - B) resident size plus time: we may inspect the resident layers and calculate how
  many of them include the overhead of storing pre-split keys. If, after some time
  threshold (different to the one in case A), we still have such layers occupying
  local disk space, then we should proactively compact them.
 ### Cleaning up parent-shard layers
 It is functionally harmless to leave parent shard layers in remote storage indefinitely.
 They would be cleaned up in the event of the tenant's deletion.
 As an optimization to avoid leaking remote storage capacity (which costs money), we may
 lazily clean up parent shard layers once no child shards reference them.
 This may be done _very_ lazily: e.g. check every PITR interval. The cleanup procedure is:
 - list all the key prefixes beginning with the tenant ID, and select those shard prefixes
  which do not belong to the most-recently-split set of shards (_ancestral shards_, i.e. `shard_count < max(shard_count)` over all shards), and those shard prefixes which do have the latest shard count (_current shards_)
 - If there are no _ancestral shard_ prefixes found, we have nothing to clean up and
  may drop out now.
 - find the latest-generation index for each _current shard_, read all and accumulate the set of layers belonging to ancestral shards referenced by these indices.
 - for all ancestral shards, list objects in the prefix and delete any layer which was not
  referenced by a current shard.
 If this cleanup is scheduled for 1-2 PITR periods after the split, there is a good chance that child shards will have written their own image layers covering the whole keyspace, such that all parent shard layers will be deletable.
 The cleanup may be done by the scrubber (external process), or we may choose to have
 the zeroth shard in the latest generation do the work -- there is no obstacle to one shard
 reading the other shard's indices at runtime, and we do not require visibility of the
 latest index writes.
 Cleanup should be artificially delayed by some period (for example 24 hours) to ensure
 that we retain the option to roll back a split in case of bugs.
 ### Splitting secondary locations
 We may implement a pageserver API similar to the main splitting API, which does a simpler
 operation for secondary locations: it would not write anything to S3, instead it would simply
 create the child shard directory on local disk, hard link in directories from the parent,
 and set up the in memory (TenantSlot) state for the children.
 Similar to attached locations, a subset of secondary locations will probably need re-locating
 after the split is complete, to avoid leaving multiple child shards on the same pageservers,
 where they may use excessive space for the tenant.
 ## FAQ/Alternatives
 ### What should the thresholds be set to?
 Shard size limit: the pre-sharding default capacity quota for databases was 200GiB, so this could be a starting point for the per-shard size limit.
 Max shard count:
 - The safekeeper overhead to sharding is currently O(N) network bandwidth because
  the un-filtered WAL is sent to all shards. To avoid this growing out of control,
  a limit of 8 shards should be temporarily imposed until WAL filtering is implemented
  on the safekeeper.
 - there is also little benefit to increasing the shard count beyond the number
  of pageservers in a region.
 ### Is it worth just rewriting all the data during a split to simplify reasoning about space?
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
 Neon storage broker, providing messaging between safekeepers and pageservers.
 [storage_broker.md](./storage_broker.md)
 `storage_controller`:
 Neon storage controller, manages a cluster of pageservers and exposes an API that enables
 managing a many-sharded tenant as a single entity.
 `/control_plane`:
 Local control plane.
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -1,150 +0,0 @@
 # Storage Controller
 ## Concepts
 The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
 which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
 It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
 the underlying details of how data is spread across multiple nodes.
 The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
 ## APIs
 The storage controller’s HTTP server implements four logically separate APIs:
 - `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver.
 - `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and managing pageservers, or executing shard splits.
 - `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
 - `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
  to ensure data safety with generation numbers.
 The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs).
 See the `http.rs` file in the source for where the HTTP APIs are implemented.
 ## Database
 The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
 persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
 rebuilt on startup.
 The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
 The `diesel` crate is used for defining models & migrations.
 Running a local cluster with `cargo neon` automatically starts a vanilla postgres process to host the storage controller’s database.
 ### Diesel tip: migrations
 If you need to modify the database schema, here’s how to create a migration:
 - Install the diesel CLI with `cargo install diesel_cli`
 - Use `diesel migration generate <name>` to create a new migration
 - Populate the SQL files in the `migrations/` subdirectory
 - Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `schema.rs` file automatically.
  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
 - Commit the migration files and the changes to schema.rs
 - If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
 - The migrations are built into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
 ## storcon_cli
 The `storcon_cli` tool enables interactive management of the storage controller. This is usually
 only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).
 `storcon_cli --help` includes details on commands.
 # Deploying
 This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
 part of a self-hosted system.
 _General note: since the default `neon_local` environment includes a storage controller, this is a useful
 reference when figuring out deployment._
 ## Database
 It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
 local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
 The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
 Set the URL to the database using the `--database-url` CLI option.
 There is no need to run migrations manually: the storage controller automatically applies migrations
 when it starts up.
 ## Configure pageservers to use the storage controller
 1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
   point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
 2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
   with the storage controller when it starts up. See the example below for the format of this file.
 ### Example `metadata.json`
 ```
 {"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
 ```
 - `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
  postgres runs.
 - `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where
  the storage controller runs.
 ## Handle compute notifications.
 The storage controller independently moves tenant attachments between pageservers in response to
 changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
 postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
 location changes.
 The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
 JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
 In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
 the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
 the compute hook.
 When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
 the request body has the format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
 ```
 struct ComputeHookNotifyRequestShard {
    node_id: NodeId,
    shard_number: ShardNumber,
 }
 struct ComputeHookNotifyRequest {
    tenant_id: TenantId,
    stripe_size: Option<ShardStripeSize>,
    shards: Vec<ComputeHookNotifyRequestShard>,
 }
 ```
 When a notification is received:
 1. Modify postgres configuration for this tenant:
   - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
     shards identified by `NodeId` must be converted to the address+port of the node.
   - if stripe_size is not None, set `neon.stripe_size` to this value
 2. Send SIGHUP to postgres to reload configuration
 3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
   will retry the notification until it succeeds.
 ### Example notification body
 ```
 {
  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
  "stripe_size": 32768,
  "shards": [
      {"node_id": 344, "shard_number": 0},
      {"node_id": 722, "shard_number": 1}
  ]
 }
 ```
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -10,13 +10,11 @@ libc.workspace = true
 once_cell.workspace = true
 chrono.workspace = true
 twox-hash.workspace = true
 measured.workspace = true
 workspace_hack.workspace = true
 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
 measured-process.workspace = true
 [dev-dependencies]
 rand = "0.8"
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -7,19 +7,14 @@
 //! use significantly less memory than this, but can only approximate the cardinality.
 use std::{
-    hash::{BuildHasher, BuildHasherDefault, Hash},
+    collections::HashMap,
-    sync::atomic::AtomicU8,
+    hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
    sync::{atomic::AtomicU8, Arc, RwLock},
 };
-use measured::{
+use prometheus::{
-    label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
+    core::{self, Describer},
-    metric::{
+    proto, Opts,
        group::{Encoding, MetricValue},
        name::MetricNameEncoder,
        Metric, MetricType, MetricVec,
    },
    text::TextEncoder,
    LabelGroup,
 };
 use twox_hash::xxh3;
@@ -45,7 +40,7 @@ macro_rules! register_hll {
    }};
    ($N:literal, $NAME:expr, $HELP:expr $(,)?) => {{
-        $crate::register_hll!($N, $crate::opts!($NAME, $HELP))
+        $crate::register_hll!($N, $crate::opts!($NAME, $HELP), $LABELS_NAMES)
    }};
 }
@@ -98,25 +93,203 @@ macro_rules! register_hll {
 /// ```
 ///
 /// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
-pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
+#[derive(Clone)]
-pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
+pub struct HyperLogLogVec<const N: usize> {
-
+    core: Arc<HyperLogLogVecCore<N>>,
 pub struct HyperLogLogState<const N: usize> {
    shards: [AtomicU8; N],
 }
-impl<const N: usize> Default for HyperLogLogState<N> {
+
-    fn default() -> Self {
+struct HyperLogLogVecCore<const N: usize> {
-        #[allow(clippy::declare_interior_mutable_const)]
+    pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
-        const ZERO: AtomicU8 = AtomicU8::new(0);
+    pub desc: core::Desc,
-        Self { shards: [ZERO; N] }
+    pub opts: Opts,
 }
 // Exposes the whole labelled vector as a single prometheus collector.
 impl<const N: usize> core::Collector for HyperLogLogVec<N> {
    /// Returns the one descriptor shared by every child of this vector.
    fn desc(&self) -> Vec<&core::Desc> {
        vec![&self.core.desc]
    }
    /// Builds a single GAUGE metric family, named and described by the vector's
    /// descriptor, containing the samples of every child currently in the map.
    ///
    /// Panics if the `children` lock is poisoned (`unwrap` on the read guard).
    fn collect(&self) -> Vec<proto::MetricFamily> {
        let mut m = proto::MetricFamily::default();
        m.set_name(self.core.desc.fq_name.clone());
        m.set_help(self.core.desc.help.clone());
        m.set_field_type(proto::MetricType::GAUGE);
        let mut metrics = Vec::new();
        // The read lock is held for the whole loop; children are visited in
        // `HashMap` iteration order, so output order across children is unspecified.
        for child in self.core.children.read().unwrap().values() {
            child.core.collect_into(&mut metrics);
        }
        m.set_metric(metrics);
        vec![m]
    }
 }
-impl<const N: usize> MetricType for HyperLogLogState<N> {
+impl<const N: usize> HyperLogLogVec<N> {
-    type Metadata = ();
+    /// Create a new [`HyperLogLogVec`] based on the provided
    /// [`Opts`] and partitioned by the given label names. At least one label name must be
    /// provided.
    pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
        assert!(N.is_power_of_two());
        let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
        let opts = opts.variable_labels(variable_names);
        let desc = opts.describe()?;
        let v = HyperLogLogVecCore {
            children: RwLock::new(HashMap::default()),
            desc,
            opts,
        };
        Ok(Self { core: Arc::new(v) })
    }
    /// `get_metric_with_label_values` returns the [`HyperLogLog<N>`] for the given slice
    /// of label values (same order as the VariableLabels in Desc). If that combination of
    /// label values is accessed for the first time, a new [`HyperLogLog<N>`] is created.
    ///
    /// An error is returned if the number of label values is not the same as the
    /// number of VariableLabels in Desc.
    pub fn get_metric_with_label_values(
        &self,
        vals: &[&str],
    ) -> prometheus::Result<HyperLogLog<N>> {
        // Thin wrapper: lookup and creation both live on the shared core.
        self.core.get_metric_with_label_values(vals)
    }
    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
    /// occurs.
    pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
        // `unwrap` turns any error from the fallible variant into the panic described above.
        self.get_metric_with_label_values(vals).unwrap()
    }
 }
-impl<const N: usize> HyperLogLogState<N> {
+impl<const N: usize> HyperLogLogVecCore<N> {
    /// Returns the child for `vals`, creating it if it does not exist yet.
    ///
    /// Returns an error if `vals` does not have one entry per variable label
    /// (propagated from `hash_label_values`), or if creating the child fails.
    /// Panics if the `children` lock is poisoned.
    pub fn get_metric_with_label_values(
        &self,
        vals: &[&str],
    ) -> prometheus::Result<HyperLogLog<N>> {
        let h = self.hash_label_values(vals)?;
        // Common case: the child already exists, so only the read lock is taken.
        // The read guard is a temporary of the `if let` scrutinee; it is released
        // before the write lock is requested in `get_or_create_metric`.
        if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
            return Ok(metric);
        }
        self.get_or_create_metric(h, vals)
    }
    /// Computes the `children` map key for a set of label values.
    ///
    /// Returns `Error::InconsistentCardinality` if `vals` does not contain exactly
    /// one value per variable label of the descriptor.
    pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
        if vals.len() != self.desc.variable_labels.len() {
            return Err(prometheus::Error::InconsistentCardinality {
                expect: self.desc.variable_labels.len(),
                got: vals.len(),
            });
        }
        let mut h = xxh3::Hash64::default();
        // NOTE(review): values are fed to the hasher back to back with no delimiter,
        // so e.g. ["ab", "c"] and ["a", "bc"] produce the same key. The map is keyed
        // by this hash alone, so such label sets would share one child - confirm
        // whether this is acceptable for the label values in use.
        for val in vals {
            h.write(val.as_bytes());
        }
        Ok(h.finish())
    }
    /// Slow path of the lookup: takes the write lock and inserts a new child
    /// under `hash` unless one is already present.
    ///
    /// `hash` is expected to be the value `hash_label_values` returned for
    /// `label_values`; this function does not re-derive or verify it.
    /// Panics if the `children` lock is poisoned.
    fn get_or_create_metric(
        &self,
        hash: u64,
        label_values: &[&str],
    ) -> prometheus::Result<HyperLogLog<N>> {
        let mut children = self.children.write().unwrap();
        // Check exist first.
        // (Another thread may have inserted the child between the caller's
        // read-locked lookup and this write lock being acquired.)
        if let Some(metric) = children.get(&hash).cloned() {
            return Ok(metric);
        }
        let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
        // The map keeps one handle and the caller gets a clone of it.
        children.insert(hash, metric.clone());
        Ok(metric)
    }
 }
 /// HLL is a probabilistic cardinality measure.
 ///
 /// How to use this time-series for a metric name `my_metrics_total_hll`:
 ///
 /// ```promql
 /// # harmonic mean
 /// 1 / (
 ///     sum (
 ///         2 ^ -(
 ///             # HLL merge operation
 ///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
 ///         )
 ///     ) without (hll_shard)
 /// )
 /// * alpha
 /// * shards_count
 /// * shards_count
 /// ```
 ///
 /// If you want an estimate over time, you can use the following query:
 ///
 /// ```promql
 /// # harmonic mean
 /// 1 / (
 ///     sum (
 ///         2 ^ -(
 ///             # HLL merge operation
 ///             max (
 ///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
 ///             ) by (hll_shard, other_labels...)
 ///         )
 ///     ) without (hll_shard)
 /// )
 /// * alpha
 /// * shards_count
 /// * shards_count
 /// ```
 ///
 /// In the case of low cardinality, you might want to use the linear counting approximation:
 ///
 /// ```promql
 /// # LinearCounting(m, V) = m log (m / V)
 /// shards_count * ln(shards_count /
 ///     # calculate V = how many shards contain a 0
 ///     count(max (my_metrics_total_hll{}) by (hll_shard, other_labels...) == 0) without (hll_shard)
 /// )
 /// ```
 ///
 /// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
 #[derive(Clone)]
 pub struct HyperLogLog<const N: usize> {
    // Shared state; cloning a `HyperLogLog` clones the `Arc`, so all clones
    // record into and collect from the same shards.
    core: Arc<HyperLogLogCore<N>>,
 }
 impl<const N: usize> HyperLogLog<N> {
    /// Create a [`HyperLogLog`] with the `name` and `help` arguments.
    ///
    /// Panics if `N` is not a power of two. Returns an error if the resulting
    /// options cannot be turned into a descriptor.
    pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
        assert!(N.is_power_of_two());
        let opts = Opts::new(name, help);
        Self::with_opts(opts)
    }
    /// Create a [`HyperLogLog`] with the `opts` options.
    ///
    /// No label values are supplied, so `opts` must not declare variable labels
    /// (otherwise the label-count check in `make_label_pairs` fails).
    // NOTE(review): unlike `new`, this entry point does not assert that `N` is a
    // power of two - confirm whether callers are expected to guarantee that.
    pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
        Self::with_opts_and_label_values(&opts, &[])
    }
    /// Shared constructor: derives the descriptor from `opts`, pairs its labels
    /// with `label_values`, and starts with every one of the `N` shards at zero.
    ///
    /// Returns an error if `opts.describe()` or `make_label_pairs` fails.
    fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
        let desc = opts.describe()?;
        let labels = make_label_pairs(&desc, label_values)?;
        let v = HyperLogLogCore {
            // Array of plain zeros mapped element-wise into atomics.
            shards: [0; N].map(AtomicU8::new),
            desc,
            labels,
        };
        Ok(Self { core: Arc::new(v) })
    }
    pub fn measure(&self, item: &impl Hash) {
        // changing the hasher will break compatibility with previous measurements.
        self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
@@ -126,11 +299,42 @@ impl<const N: usize> HyperLogLogState<N> {
        let p = N.ilog2() as u8;
        let j = hash & (N as u64 - 1);
        let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
-        self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
+        self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
    }
 }
 /// State behind a single [`HyperLogLog`] handle.
 struct HyperLogLogCore<const N: usize> {
    // One register per shard; updated with `fetch_max` when recording and
    // swapped back to 0 when collected.
    shards: [AtomicU8; N],
    // Descriptor (name/help) reported when this metric is collected on its own.
    desc: core::Desc,
    // Label pairs attached to every emitted sample, in addition to `hll_shard`.
    labels: Vec<proto::LabelPair>,
 }
 impl<const N: usize> core::Collector for HyperLogLog<N> {
    fn desc(&self) -> Vec<&core::Desc> {
        vec![&self.core.desc]
    }
-    fn take_sample(&self) -> [u8; N] {
+    fn collect(&self) -> Vec<proto::MetricFamily> {
-        self.shards.each_ref().map(|x| {
+        let mut m = proto::MetricFamily::default();
        m.set_name(self.core.desc.fq_name.clone());
        m.set_help(self.core.desc.help.clone());
        m.set_field_type(proto::MetricType::GAUGE);
        let mut metrics = Vec::new();
        self.core.collect_into(&mut metrics);
        m.set_metric(metrics);
        vec![m]
    }
 }
 impl<const N: usize> HyperLogLogCore<N> {
    fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
        self.shards.iter().enumerate().for_each(|(i, x)| {
            let mut shard_label = proto::LabelPair::default();
            shard_label.set_name("hll_shard".to_owned());
            shard_label.set_value(format!("{i}"));
            // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.
            // This seems like it would be a race condition,
@@ -140,90 +344,85 @@ impl<const N: usize> HyperLogLogState<N> {
            // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
            // this would mean that a dev port-forwarding the metrics url won't break the sampling.
-            x.swap(0, std::sync::atomic::Ordering::Relaxed)
+            let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
            let mut m = proto::Metric::default();
            let mut c = proto::Gauge::default();
            c.set_value(v as f64);
            m.set_gauge(c);
            let mut labels = Vec::with_capacity(self.labels.len() + 1);
            labels.extend_from_slice(&self.labels);
            labels.push(shard_label);
            m.set_label(labels);
            metrics.push(m);
        })
    }
 }
-impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
+
-    for HyperLogLogState<N>
+fn make_label_pairs(
-{
+    desc: &core::Desc,
-    fn write_type(
+    label_values: &[&str],
-        name: impl MetricNameEncoder,
+) -> prometheus::Result<Vec<proto::LabelPair>> {
-        enc: &mut TextEncoder<W>,
+    if desc.variable_labels.len() != label_values.len() {
-    ) -> Result<(), std::io::Error> {
+        return Err(prometheus::Error::InconsistentCardinality {
-        enc.write_type(&name, measured::text::MetricType::Gauge)
+            expect: desc.variable_labels.len(),
            got: label_values.len(),
        });
    }
    fn collect_into(
        &self,
        _: &(),
        labels: impl LabelGroup,
        name: impl MetricNameEncoder,
        enc: &mut TextEncoder<W>,
    ) -> Result<(), std::io::Error> {
        struct I64(i64);
        impl LabelValue for I64 {
            fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
                v.write_int(self.0)
            }
        }
-        struct HllShardLabel {
+    let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
-            hll_shard: i64,
+    if total_len == 0 {
-        }
+        return Ok(vec![]);
        impl LabelGroup for HllShardLabel {
            fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
                const LE: &LabelName = LabelName::from_str("hll_shard");
                v.write_value(LE, &I64(self.hll_shard));
            }
        }
        self.take_sample()
            .into_iter()
            .enumerate()
            .try_for_each(|(hll_shard, val)| {
                enc.write_metric_value(
                    name.by_ref(),
                    labels.by_ref().compose_with(HllShardLabel {
                        hll_shard: hll_shard as i64,
                    }),
                    MetricValue::Int(val as i64),
                )
            })
    }
    if desc.variable_labels.is_empty() {
        return Ok(desc.const_label_pairs.clone());
    }
    let mut label_pairs = Vec::with_capacity(total_len);
    for (i, n) in desc.variable_labels.iter().enumerate() {
        let mut label_pair = proto::LabelPair::default();
        label_pair.set_name(n.clone());
        label_pair.set_value(label_values[i].to_owned());
        label_pairs.push(label_pair);
    }
    for label_pair in &desc.const_label_pairs {
        label_pairs.push(label_pair.clone());
    }
    label_pairs.sort();
    Ok(label_pairs)
 }
 #[cfg(test)]
 mod tests {
    use std::collections::HashSet;
-    use measured::{label::StaticLabelSet, FixedCardinalityLabel};
+    use prometheus::{proto, Opts};
    use rand::{rngs::StdRng, Rng, SeedableRng};
    use rand_distr::{Distribution, Zipf};
    use crate::HyperLogLogVec;
-    #[derive(FixedCardinalityLabel, Clone, Copy)]
+    fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
-    #[label(singleton = "x")]
+        let mut metrics = vec![];
-    enum Label {
+        hll.core
-        A,
+            .children
-        B,
+            .read()
            .unwrap()
            .values()
            .for_each(|c| c.core.collect_into(&mut metrics));
        metrics
    }
-
+    fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
    fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
        // cannot go through the `hll.collect_family_into` interface yet...
        // need to see if I can fix the conflicting impls problem in measured.
        (
            hll.get_metric(hll.with_labels(Label::A)).take_sample(),
            hll.get_metric(hll.with_labels(Label::B)).take_sample(),
        )
    }
    fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
        let mut buckets = [0.0; 32];
-        for &sample in samples {
+        for metric in metrics.chunks_exact(32) {
-            for (i, m) in sample.into_iter().enumerate() {
+            if filter(&metric[0]) {
-                buckets[i] = f64::max(buckets[i], m as f64);
+                for (i, m) in metric.iter().enumerate() {
                    buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
                }
            }
        }
@@ -238,7 +437,7 @@ mod tests {
    }
    fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
-        let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();
+        let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();
        let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
        let mut set_a = HashSet::new();
@@ -246,20 +445,18 @@ mod tests {
        for x in iter.by_ref().take(n) {
            set_a.insert(x.to_bits());
-            hll.get_metric(hll.with_labels(Label::A))
+            hll.with_label_values(&["a"]).measure(&x.to_bits());
                .measure(&x.to_bits());
        }
        for x in iter.by_ref().take(n) {
            set_b.insert(x.to_bits());
-            hll.get_metric(hll.with_labels(Label::B))
+            hll.with_label_values(&["b"]).measure(&x.to_bits());
                .measure(&x.to_bits());
        }
        let merge = &set_a | &set_b;
-        let (a, b) = collect(&hll);
+        let metrics = collect(&hll);
-        let len = get_cardinality(&[a, b]);
+        let len = get_cardinality(&metrics, |_| true);
-        let len_a = get_cardinality(&[a]);
+        let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
-        let len_b = get_cardinality(&[b]);
+        let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");
        ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
    }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -4,17 +4,6 @@
 //! a default registry.
 #![deny(clippy::undocumented_unsafe_blocks)]
 use measured::{
    label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
    metric::{
        counter::CounterState,
        gauge::GaugeState,
        group::{Encoding, MetricValue},
        name::{MetricName, MetricNameEncoder},
        MetricEncoding, MetricFamilyEncoding,
    },
    FixedCardinalityLabel, LabelGroup, MetricGroup,
 };
 use once_cell::sync::Lazy;
 use prometheus::core::{
    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -22,7 +11,6 @@ use prometheus::core::{
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
 use prometheus::Registry;
 pub use prometheus::{core, default_registry, proto};
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -35,12 +23,13 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
 pub use prometheus::{register_int_gauge, IntGauge};
 pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
 pub use prometheus::{Encoder, TextEncoder};
 use prometheus::{Registry, Result};
 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
-pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
+pub use hll::{HyperLogLog, HyperLogLogVec};
 #[cfg(target_os = "linux")]
 pub mod more_process_metrics;
@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
 /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
 /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
 /// while holding the lock.
-pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
+pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
    INTERNAL_REGISTRY.register(c)
 }
@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
    0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];
 /// Build identification, exported as the `revision` and `build_tag` labels
 /// of the build-info gauge.
 pub struct BuildInfo {
    // Value of the `revision` label.
    pub revision: &'static str,
    // Value of the `build_tag` label.
    pub build_tag: &'static str,
 }
 // todo: allow label group without the set
 impl LabelGroup for BuildInfo {
    /// Writes the two labels in a fixed order: `revision`, then `build_tag`.
    fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
        const REVISION: &LabelName = LabelName::from_str("revision");
        v.write_value(REVISION, &self.revision);
        const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
        v.write_value(BUILD_TAG, &self.build_tag);
    }
 }
 // Encodes `BuildInfo` as a metric family of its own: a gauge whose value is
 // always 1 and whose labels carry the build information.
 impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
 where
    GaugeState: MetricEncoding<T>,
 {
    /// Writes the help text and gauge type for `name`, then a single sample
    /// with value 1 labelled by `self`.
    fn collect_family_into(
        &self,
        name: impl measured::metric::name::MetricNameEncoder,
        enc: &mut T,
    ) -> Result<(), T::Err> {
        enc.write_help(&name, "Build/version information")?;
        GaugeState::write_type(&name, enc)?;
        // A throwaway gauge fixed at 1 is built on every collection; the
        // information is in the labels, not the value.
        GaugeState {
            count: std::sync::atomic::AtomicI64::new(1),
        }
        .collect_into(&(), self, name, enc)
    }
 }
 /// Top-level metric group: process metrics (Linux only) plus the metrics
 /// provided by this library.
 // The derive generates a `new(build_info: BuildInfo)` constructor per the
 // `#[metric(new(...))]` attribute - presumably; confirm against the
 // `measured` derive documentation.
 #[derive(MetricGroup)]
 #[metric(new(build_info: BuildInfo))]
 pub struct NeonMetrics {
    // Only compiled on Linux; initialised for the current process.
    #[cfg(target_os = "linux")]
    #[metric(namespace = "process")]
    #[metric(init = measured_process::ProcessCollector::for_self())]
    process: measured_process::ProcessCollector,
    // Library-level metrics, initialised from the supplied build info.
    #[metric(namespace = "libmetrics")]
    #[metric(init = LibMetrics::new(build_info))]
    libmetrics: LibMetrics,
 }
 /// Metrics owned by this library: build info, resource usage and a counter
 /// of metric collections.
 #[derive(MetricGroup)]
 #[metric(new(build_info: BuildInfo))]
 pub struct LibMetrics {
    // Initialised from the constructor argument of the same name.
    #[metric(init = build_info)]
    build_info: BuildInfo,
    // `Rusage` is itself a metric group; its metrics are flattened into this one.
    #[metric(flatten)]
    rusage: Rusage,
    // Incremented each time it is collected (see `CollectionCounter`).
    serve_count: CollectionCounter,
 }
 /// Writes one integer sample `x` for metric `name` with the given `labels`.
 ///
 /// Only the value is written; help text and metric type must be emitted by
 /// the caller. Any encoder error is returned unchanged.
 fn write_gauge<Enc: Encoding>(
    x: i64,
    labels: impl LabelGroup,
    name: impl MetricNameEncoder,
    enc: &mut Enc,
 ) -> Result<(), Enc::Err> {
    enc.write_metric_value(name, labels, MetricValue::Int(x))
 }
 /// Stateless metric group; its values are read from `get_rusage_stats()`
 /// each time the group is collected.
 #[derive(Default)]
 struct Rusage;
 /// Label distinguishing read from write for the disk I/O metric.
 // `singleton = "io_operation"` presumably sets the label name - confirm
 // against the `measured` derive documentation.
 #[derive(FixedCardinalityLabel, Clone, Copy)]
 #[label(singleton = "io_operation")]
 enum IoOp {
    Read,
    Write,
 }
 // Emits `disk_io_bytes_total` (read and write samples) and `maxrss_kb`
 // from a fresh `get_rusage_stats()` reading on every collection.
 impl<T: Encoding> MetricGroup<T> for Rusage
 where
    GaugeState: MetricEncoding<T>,
 {
    /// Writes help, type and samples for both metrics; stops at the first
    /// encoder error and returns it.
    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
        const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
        const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
        let ru = get_rusage_stats();
        enc.write_help(
            DISK_IO,
            "Bytes written and read from disk, grouped by the operation (read|write)",
        )?;
        GaugeState::write_type(DISK_IO, enc)?;
        // Block counts are converted to bytes with `BYTES_IN_BLOCK`.
        write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
        write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
        enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
        GaugeState::write_type(MAXRSS, enc)?;
        // NOTE(review): the max-RSS sample is written with the `IoOp::Read`
        // label even though it is not an I/O operation - confirm whether this
        // is intentional or whether it should carry no label.
        write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
        Ok(())
    }
 }
 /// Counter that is incremented as a side effect of being collected.
 #[derive(Default)]
 struct CollectionCounter(CounterState);
 impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
 where
    CounterState: MetricEncoding<T>,
 {
    /// Increments the counter, then writes its help text and current value
    /// with no labels.
    fn collect_family_into(
        &self,
        name: impl measured::metric::name::MetricNameEncoder,
        enc: &mut T,
    ) -> Result<(), T::Err> {
        // Incremented before encoding, so the emitted value includes this collection.
        self.0.inc();
        enc.write_help(&name, "Number of metric requests made")?;
        // NOTE(review): unlike the `BuildInfo` encoder in this file, no
        // `write_type` call is made here - confirm whether that is intended.
        self.0.collect_into(&(), NoLabels, name, enc)
    }
 }
 pub fn set_build_info_metric(revision: &str, build_tag: &str) {
    let metric = register_int_gauge_vec!(
        "libmetrics_build_info",
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
    .expect("Failed to register build info metric");
    metric.with_label_values(&[revision, build_tag]).set(1);
 }
 const BYTES_IN_BLOCK: i64 = 512;
 // Records I/O stats in a "cross-platform" way.
 // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -250,6 +117,7 @@ const BYTES_IN_BLOCK: i64 = 512;
 fn update_rusage_metrics() {
    let rusage_stats = get_rusage_stats();
    const BYTES_IN_BLOCK: i64 = 512;
    DISK_IO_BYTES
        .with_label_values(&["read"])
        .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
@@ -283,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
        }
    }};
 }
 /// Create an [`IntCounterPair`] and registers to default registry.
 #[macro_export(local_inner_macros)]
 macro_rules! register_int_counter_pair {
@@ -321,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
    ///
    /// An error is returned if the number of label values is not the same as the
    /// number of VariableLabels in Desc.
-    pub fn get_metric_with_label_values(
+    pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
        &self,
        vals: &[&str],
    ) -> prometheus::Result<GenericCounterPair<P>> {
        Ok(GenericCounterPair {
            inc: self.inc.get_metric_with_label_values(vals)?,
            dec: self.dec.get_metric_with_label_values(vals)?,
@@ -337,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
        self.get_metric_with_label_values(vals).unwrap()
    }
-    pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
+    pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
        res[0] = self.inc.remove_label_values(vals);
        res[1] = self.dec.remove_label_values(vals);
    }
@@ -421,171 +285,3 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;
 /// A guard for [`IntCounterPair`] that will decrement the gauge on drop
 pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
 /// Static description of a counter pair: the metric names and help texts of
 /// its "inc" and "dec" counters, plus the label set used to key the pairs.
 ///
 /// [`CounterPairVec`] reads these constants when encoding the two families.
 pub trait CounterPairAssoc {
    /// Metric name under which the increment counter is written.
    const INC_NAME: &'static MetricName;
    /// Metric name under which the decrement counter is written.
    const DEC_NAME: &'static MetricName;
    /// Help text written for the increment counter.
    const INC_HELP: &'static str;
    /// Help text written for the decrement counter.
    const DEC_HELP: &'static str;
    /// Label set that identifies an individual counter pair inside the vec.
    type LabelGroupSet: LabelGroupSet;
 }
 /// A labelled collection of inc/dec counter pairs, described by `A`.
 ///
 /// Each label group maps to one [`MeasuredCounterPairState`].
 pub struct CounterPairVec<A: CounterPairAssoc> {
    // One `MeasuredCounterPairState` per label group from `A::LabelGroupSet`.
    vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
 }
 /// `Default` is available whenever the label group set itself is `Default`.
 impl<A: CounterPairAssoc> Default for CounterPairVec<A>
 where
    A::LabelGroupSet: Default,
 {
    /// Builds the vec from the default value of the inner `MetricVec`.
    fn default() -> Self {
        Self {
            vec: Default::default(),
        }
    }
 }
 impl<A: CounterPairAssoc> CounterPairVec<A> {
    /// Increments the `inc` counter for `labels` and returns a guard that
    /// increments the matching `dec` counter when it is dropped.
    pub fn guard(
        &self,
        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
    ) -> MeasuredCounterPairGuard<'_, A> {
        let id = self.vec.with_labels(labels);
        self.vec.get_metric(id).inc.inc();
        MeasuredCounterPairGuard { vec: &self.vec, id }
    }
    /// Increments the `inc` counter for `labels` without creating a guard.
    pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
        let id = self.vec.with_labels(labels);
        self.vec.get_metric(id).inc.inc();
    }
    /// Records a decrement for `labels`. Both halves of the pair only ever
    /// count upwards, so this increments the `dec` counter.
    pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
        let id = self.vec.with_labels(labels);
        self.vec.get_metric(id).dec.inc();
    }
    /// Removes the pair stored for `labels` and returns the result of
    /// `MetricVec::remove_metric` (presumably the removed state, or `None`
    /// if no pair existed for those labels; confirm against `measured`).
    pub fn remove_metric(
        &self,
        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
    ) -> Option<MeasuredCounterPairState> {
        let id = self.vec.with_labels(labels);
        self.vec.remove_metric(id)
    }
 }
 /// Encodes the pair vec as two metric families: one named `A::DEC_NAME`
 /// holding the `dec` counters and one named `A::INC_NAME` holding the `inc`
 /// counters.
 impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
 where
    T: ::measured::metric::group::Encoding,
    A: CounterPairAssoc,
    ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
 {
    /// Writes help text and values for the `dec` family, then for the `inc`
    /// family. The `Dec` / `Inc` encoder wrappers select which half of each
    /// pair is written.
    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
        // write decrement first to avoid a race condition where inc - dec < 0
        T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
        self.vec
            .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
        T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
        self.vec
            .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
        Ok(())
    }
 }
 /// State of one counter pair: two independent counters, one for increments
 /// and one for decrements.
 #[derive(MetricGroup, Default)]
 pub struct MeasuredCounterPairState {
    /// Incremented by `CounterPairVec::inc` and `CounterPairVec::guard`.
    pub inc: CounterState,
    /// Incremented by `CounterPairVec::dec` and when a guard is dropped.
    pub dec: CounterState,
 }
 /// Lets [`MeasuredCounterPairState`] be stored in a `MetricVec`.
 impl measured::metric::MetricType for MeasuredCounterPairState {
    // The pair carries no per-metric metadata.
    type Metadata = ();
 }
 /// Guard returned by `CounterPairVec::guard`; dropping it increments the
 /// `dec` counter of the pair it was created for.
 pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
    // Borrow of the vec that owns the pair.
    vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
    // Identifies which pair in `vec` to update on drop.
    id: measured::metric::LabelId<A::LabelGroupSet>,
 }
 impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
    /// Increments the `dec` counter of the guarded pair, balancing the `inc`
    /// performed when the guard was created.
    fn drop(&mut self) {
        self.vec.get_metric(self.id).dec.inc();
    }
 }
 /// Encoder wrapper around an inner encoder `T`. The [`MetricEncoding`] impl of
 /// [`MeasuredCounterPairState`] for `Inc<T>` only writes the inc counter to the inner encoder.
 struct Inc<T>(T);
 /// Encoder wrapper around an inner encoder `T`. The [`MetricEncoding`] impl of
 /// [`MeasuredCounterPairState`] for `Dec<T>` only writes the dec counter to the inner encoder.
 struct Dec<T>(T);
 /// `Inc<T>` is a transparent encoder: every call is forwarded unchanged to
 /// the wrapped encoder.
 impl<T: Encoding> Encoding for Inc<T> {
    type Err = T::Err;
    /// Forwards the help text to the inner encoder.
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
    /// Forwards the metric value to the inner encoder.
    fn write_metric_value(
        &mut self,
        name: impl MetricNameEncoder,
        labels: impl LabelGroup,
        value: MetricValue,
    ) -> Result<(), Self::Err> {
        self.0.write_metric_value(name, labels, value)
    }
 }
 /// Write the inc counter to the encoder
 impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
 where
    CounterState: MetricEncoding<T>,
 {
    /// Writes the type line of a plain counter to the inner encoder.
    fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
        CounterState::write_type(name, &mut enc.0)
    }
    /// Collects only `self.inc` into the inner encoder; `self.dec` is ignored.
    fn collect_into(
        &self,
        metadata: &(),
        labels: impl LabelGroup,
        name: impl MetricNameEncoder,
        enc: &mut Inc<T>,
    ) -> Result<(), T::Err> {
        self.inc.collect_into(metadata, labels, name, &mut enc.0)
    }
 }
 /// `Dec<T>` is a transparent encoder: every call is forwarded unchanged to
 /// the wrapped encoder.
 impl<T: Encoding> Encoding for Dec<T> {
    type Err = T::Err;
    /// Forwards the help text to the inner encoder.
    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
        self.0.write_help(name, help)
    }
    /// Forwards the metric value to the inner encoder.
    fn write_metric_value(
        &mut self,
        name: impl MetricNameEncoder,
        labels: impl LabelGroup,
        value: MetricValue,
    ) -> Result<(), Self::Err> {
        self.0.write_metric_value(name, labels, value)
    }
 }
 /// Write the dec counter to the encoder
 impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
 where
    CounterState: MetricEncoding<T>,
 {
    /// Writes the type line of a plain counter to the inner encoder.
    fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
        CounterState::write_type(name, &mut enc.0)
    }
    /// Collects only `self.dec` into the inner encoder; `self.inc` is ignored.
    fn collect_into(
        &self,
        metadata: &(),
        labels: impl LabelGroup,
        name: impl MetricNameEncoder,
        enc: &mut Dec<T>,
    ) -> Result<(), T::Err> {
        self.dec.collect_into(metadata, labels, name, &mut enc.0)
    }
 }
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -2,14 +2,11 @@ use std::str::FromStr;
 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
-/// in [`storage_controller::http`]
+/// in [`attachment_service::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::NodeId;
-use crate::{
+use crate::{models::ShardParameters, shard::TenantShardId};
    models::{ShardParameters, TenantConfig},
    shard::{ShardStripeSize, TenantShardId},
 };
 #[derive(Serialize, Deserialize)]
 pub struct TenantCreateResponseShard {
@@ -38,16 +35,10 @@ pub struct NodeRegisterRequest {
 pub struct NodeConfigureRequest {
    pub node_id: NodeId,
-    pub availability: Option<NodeAvailabilityWrapper>,
+    pub availability: Option<NodeAvailability>,
    pub scheduling: Option<NodeSchedulingPolicy>,
 }
 /// Request body carrying a tenant's placement and shard scheduling policies.
 ///
 /// Both fields are optional.
 /// NOTE(review): presumably a `None` field leaves that policy unchanged;
 /// confirm against the request handler.
 #[derive(Serialize, Deserialize)]
 pub struct TenantPolicyRequest {
    /// Placement policy to apply, if any.
    pub placement: Option<PlacementPolicy>,
    /// Shard scheduling policy to apply, if any.
    pub scheduling: Option<ShardSchedulingPolicy>,
 }
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantLocateResponseShard {
    pub shard_id: TenantShardId,
@@ -66,48 +57,6 @@ pub struct TenantLocateResponse {
    pub shard_params: ShardParameters,
 }
 /// Description of one tenant: its shards plus tenant-wide settings.
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
    pub tenant_id: TenantId,
    /// Per-shard details, one entry per shard.
    pub shards: Vec<TenantDescribeResponseShard>,
    pub stripe_size: ShardStripeSize,
    pub policy: PlacementPolicy,
    pub config: TenantConfig,
 }
 /// Description of one node: its identity, availability and scheduling
 /// state, and its listen endpoints.
 #[derive(Serialize, Deserialize)]
 pub struct NodeDescribeResponse {
    pub id: NodeId,
    /// Availability in its externally visible form, i.e. without the
    /// utilization score carried by `NodeAvailability::Active`.
    pub availability: NodeAvailabilityWrapper,
    pub scheduling: NodeSchedulingPolicy,
    // HTTP listen endpoint of the node.
    pub listen_http_addr: String,
    pub listen_http_port: u16,
    // `pg` endpoint of the node (presumably the Postgres-protocol listener; confirm).
    pub listen_pg_addr: String,
    pub listen_pg_port: u16,
 }
 /// Per-shard entry of a [`TenantDescribeResponse`].
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
    pub tenant_shard_id: TenantShardId,
    /// Node the shard is attached to, if any.
    pub node_attached: Option<NodeId>,
    /// Nodes holding secondary locations for the shard.
    pub node_secondary: Vec<NodeId>,
    /// NOTE(review): presumably empty when there is no error; confirm with the producer.
    pub last_error: String,
    /// A task is currently running to reconcile this tenant's intent state with the state on pageservers
    pub is_reconciling: bool,
    /// This shard failed in sending a compute notification to the cloud control plane, and a retry is pending.
    pub is_pending_compute_notification: bool,
    /// A shard split is currently underway
    pub is_splitting: bool,
    pub scheduling_policy: ShardSchedulingPolicy,
 }
 /// Explicitly migrating a particular shard is a low level operation
 /// TODO: higher level "Reschedule tenant" operation where the request
 /// specifies some constraints, e.g. asking it to get off particular node(s)
@@ -117,94 +66,29 @@ pub struct TenantShardMigrateRequest {
    pub node_id: NodeId,
 }
-/// Utilisation score indicating how good a candidate a pageserver
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
 /// Lower values are better.
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
 pub struct UtilizationScore(pub u64);
 impl UtilizationScore {
    /// Returns the worst possible score. Lower scores are better, so this is
    /// `u64::MAX`.
    pub fn worst() -> Self {
        UtilizationScore(u64::MAX)
    }
 }
 #[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
-    Active(UtilizationScore),
+    Active,
    // Offline: Tenants shouldn't try to attach here, but they may assume that their
    // secondary locations on this node still exist.  Newly added nodes are in this
    // state until we successfully contact them.
    Offline,
 }
-impl PartialEq for NodeAvailability {
+impl FromStr for NodeAvailability {
-    fn eq(&self, other: &Self) -> bool {
+    type Err = anyhow::Error;
        use NodeAvailability::*;
        matches!((self, other), (Active(_), Active(_)) | (Offline, Offline))
    }
 }
-impl Eq for NodeAvailability {}
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
-
+        match s {
-// This wrapper provides serde functionality and it should only be used to
+            "active" => Ok(Self::Active),
-// communicate with external callers which don't know or care about the
+            "offline" => Ok(Self::Offline),
-// utilisation score of the pageserver it is targeting.
+            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
 #[derive(Serialize, Deserialize, Clone, Copy, Debug)]
 pub enum NodeAvailabilityWrapper {
    Active,
    Offline,
 }
 /// Converts the score-less wrapper into a [`NodeAvailability`]. Because the
 /// wrapper has no utilization score, `Active` is given a placeholder score.
 impl From<NodeAvailabilityWrapper> for NodeAvailability {
    fn from(val: NodeAvailabilityWrapper) -> Self {
        match val {
            // Assume the worst utilisation score to begin with. It will later be updated by
            // the heartbeats.
            NodeAvailabilityWrapper::Active => NodeAvailability::Active(UtilizationScore::worst()),
            NodeAvailabilityWrapper::Offline => NodeAvailability::Offline,
        }
    }
 }
-impl From<NodeAvailability> for NodeAvailabilityWrapper {
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
    fn from(val: NodeAvailability) -> Self {
        match val {
            NodeAvailability::Active(_) => NodeAvailabilityWrapper::Active,
            NodeAvailability::Offline => NodeAvailabilityWrapper::Offline,
        }
    }
 }
 /// How much freedom there is to reschedule or reconcile a shard. The
 /// variants are listed from least to most restrictive. The default is
 /// [`ShardSchedulingPolicy::Active`].
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum ShardSchedulingPolicy {
    // Normal mode: the tenant's scheduled locations may be updated at will, including
    // for non-essential optimization.
    Active,
    // Disable optimizations, but permit scheduling when necessary to fulfil the PlacementPolicy.
    // For example, this still permits a node's attachment location to change to a secondary in
    // response to a node failure, or to assign a new secondary if a node was removed.
    Essential,
    // No scheduling: leave the shard running wherever it currently is.  Even if the shard is
    // unavailable, it will not be rescheduled to another node.
    Pause,
    // No reconciling: we will make no location_conf API calls to pageservers at all.  If the
    // shard is unavailable, it stays that way.  If a node fails, this shard doesn't get failed over.
    Stop,
 }
 impl Default for ShardSchedulingPolicy {
    /// Shards default to the unrestricted `Active` policy.
    fn default() -> Self {
        Self::Active
    }
 }
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum NodeSchedulingPolicy {
    Active,
    Filling,
@@ -243,8 +127,11 @@ impl From<NodeSchedulingPolicy> for String {
 /// to create secondary locations.
 #[derive(Clone, Serialize, Deserialize, Debug, PartialEq, Eq)]
 pub enum PlacementPolicy {
-    /// Normal live state: one attached pageserver and zero or more secondaries.
+    /// Cheapest way to attach a tenant: just one pageserver, no secondary
-    Attached(usize),
+    Single,
    /// Production-ready way to attach a tenant: one attached pageserver and
    /// some number of secondaries.
    Double(usize),
    /// Create one secondary mode location. This is useful when onboarding
    /// a tenant, or for an idle tenant that we might want to bring online quickly.
    Secondary,
@@ -266,14 +153,14 @@ mod test {
    /// Check stability of PlacementPolicy's serialization
    #[test]
    fn placement_policy_encoding() -> anyhow::Result<()> {
-        let v = PlacementPolicy::Attached(1);
+        let v = PlacementPolicy::Double(1);
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "{\"Attached\":1}");
+        assert_eq!(encoded, "{\"Double\":1}");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
-        let v = PlacementPolicy::Detached;
+        let v = PlacementPolicy::Single;
        let encoded = serde_json::to_string(&v)?;
-        assert_eq!(encoded, "\"Detached\"");
+        assert_eq!(encoded, "\"Single\"");
        assert_eq!(serde_json::from_str::<PlacementPolicy>(&encoded)?, v);
        Ok(())
    }
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -4,7 +4,6 @@ pub mod utilization;
 pub use utilization::PageserverUtilization;
 use std::{
    borrow::Cow,
    collections::HashMap,
    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
@@ -20,7 +19,6 @@ use utils::{
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
    serde_system_time,
 };
 use crate::controller_api::PlacementPolicy;
@@ -296,13 +294,13 @@ pub struct TenantConfig {
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
    pub trace_read_requests: Option<bool>,
    pub image_layer_compression: Option<CompressionAlgorithm>,
    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
 }
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -330,6 +328,23 @@ pub enum CompactionAlgorithm {
    Tiered,
 }
 /// Compression algorithm selector, used by `TenantConfig::image_layer_compression`.
 ///
 /// The enum is `#[repr(u8)]` and derives `strum_macros::FromRepr`, so a
 /// variant can be recovered from its `u8` discriminant. No explicit
 /// discriminants are given, so reordering the variants changes their numeric
 /// values.
 #[derive(
    Debug,
    Clone,
    Copy,
    PartialEq,
    Eq,
    Serialize,
    Deserialize,
    strum_macros::FromRepr,
    enum_map::Enum,
 )]
 #[repr(u8)]
 pub enum CompressionAlgorithm {
    /// Discriminant 0.
    NoCompression,
    /// Discriminant 1.
    LZ4,
 }
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 pub struct EvictionPolicyLayerAccessThreshold {
    #[serde(with = "humantime_serde")]
@@ -580,7 +595,7 @@ pub struct TimelineInfo {
    pub walreceiver_status: String,
 }
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize)]
 pub struct LayerMapInfo {
    pub in_memory_layers: Vec<InMemoryLayerInfo>,
    pub historic_layers: Vec<HistoricLayerInfo>,
@@ -598,7 +613,7 @@ pub enum LayerAccessKind {
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct LayerAccessStatFullDetails {
    pub when_millis_since_epoch: u64,
-    pub task_kind: Cow<'static, str>,
+    pub task_kind: &'static str,
    pub access_kind: LayerAccessKind,
 }
@@ -657,23 +672,23 @@ impl LayerResidenceEvent {
    }
 }
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize)]
 pub struct LayerAccessStats {
    pub access_count_by_access_kind: HashMap<LayerAccessKind, u64>,
-    pub task_kind_access_flag: Vec<Cow<'static, str>>,
+    pub task_kind_access_flag: Vec<&'static str>,
    pub first: Option<LayerAccessStatFullDetails>,
    pub accesses_history: HistoryBufferWithDropCounter<LayerAccessStatFullDetails, 16>,
    pub residence_events_history: HistoryBufferWithDropCounter<LayerResidenceEvent, 16>,
 }
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum InMemoryLayerInfo {
    Open { lsn_start: Lsn },
    Frozen { lsn_start: Lsn, lsn_end: Lsn },
 }
-#[derive(Debug, Clone, Serialize, Deserialize)]
+#[derive(Debug, Clone, Serialize)]
 #[serde(tag = "kind")]
 pub enum HistoricLayerInfo {
    Delta {
@@ -695,32 +710,6 @@ pub enum HistoricLayerInfo {
    },
 }
 /// Accessors for the fields shared by the `Delta` and `Image` variants.
 impl HistoricLayerInfo {
    /// Returns the layer's file name, whichever variant this is.
    pub fn layer_file_name(&self) -> &str {
        match self {
            HistoricLayerInfo::Delta {
                layer_file_name, ..
            } => layer_file_name,
            HistoricLayerInfo::Image {
                layer_file_name, ..
            } => layer_file_name,
        }
    }
    /// Returns the value of the variant's `remote` flag.
    pub fn is_remote(&self) -> bool {
        match self {
            HistoricLayerInfo::Delta { remote, .. } => *remote,
            HistoricLayerInfo::Image { remote, .. } => *remote,
        }
    }
    /// Overwrites the variant's `remote` flag with `value`.
    pub fn set_remote(&mut self, value: bool) {
        // Borrow the flag from whichever variant we are, then assign once.
        let field = match self {
            HistoricLayerInfo::Delta { remote, .. } => remote,
            HistoricLayerInfo::Image { remote, .. } => remote,
        };
        *field = value;
    }
 }
 #[derive(Debug, Serialize, Deserialize)]
 pub struct DownloadRemoteLayersTaskSpawnRequest {
    pub max_concurrent_downloads: NonZeroUsize,
@@ -747,48 +736,10 @@ pub struct TimelineGcRequest {
    pub gc_horizon: Option<u64>,
 }
 /// Status of a walredo process: its pid and its kind.
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerProcessStatus {
    /// Process id of the walredo process.
    pub pid: u32,
    /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
    /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
    pub kind: Cow<'static, str>,
 }
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerStatus {
    pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
-    pub process: Option<WalRedoManagerProcessStatus>,
+    pub pid: Option<u32>,
 }
 /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
 /// a download job, timing out while waiting for it to run, and then inspecting this status to understand
 /// what's happening.
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
 pub struct SecondaryProgress {
    /// The remote storage LastModified time of the heatmap object we last downloaded.
    pub heatmap_mtime: Option<serde_system_time::SystemTime>,
    /// The number of layers currently on-disk
    pub layers_downloaded: usize,
    /// The number of layers in the most recently seen heatmap
    pub layers_total: usize,
    /// The number of layer bytes currently on-disk
    pub bytes_downloaded: u64,
    /// The number of layer bytes in the most recently seen heatmap
    pub bytes_total: u64,
 }
 /// One shard entry in a [`TenantScanRemoteStorageResponse`].
 #[derive(Serialize, Deserialize, Debug)]
 pub struct TenantScanRemoteStorageShard {
    pub tenant_shard_id: TenantShardId,
    /// NOTE(review): presumably the generation found for this shard in
    /// remote storage, `None` if there is none; confirm with the producer.
    pub generation: Option<u32>,
 }
 /// Response listing the shards reported by a remote storage scan of a
 /// tenant. The `Default` value has an empty shard list.
 #[derive(Serialize, Deserialize, Debug, Default)]
 pub struct TenantScanRemoteStorageResponse {
    pub shards: Vec<TenantScanRemoteStorageShard>,
 }
 pub mod virtual_file {
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -1,4 +1,4 @@
-use utils::serde_system_time::SystemTime;
+use std::time::SystemTime;
 /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
 /// the next tenant.
@@ -7,7 +7,7 @@ use utils::serde_system_time::SystemTime;
 ///
 /// `format: int64` fields must use `ser_saturating_u63` because openapi generated clients might
 /// not handle full u64 values properly.
-#[derive(serde::Serialize, serde::Deserialize, Debug, Clone)]
+#[derive(serde::Serialize, Debug)]
 pub struct PageserverUtilization {
    /// Used disk space
    #[serde(serialize_with = "ser_saturating_u63")]
@@ -21,9 +21,17 @@ pub struct PageserverUtilization {
    /// When was this snapshot captured, pageserver local time.
    ///
    /// Use millis to give confidence that the value is regenerated often enough.
    #[serde(serialize_with = "ser_rfc3339_millis")]
    pub captured_at: SystemTime,
 }
 /// Serde `serialize_with` helper: writes a [`SystemTime`] as an RFC 3339
 /// string with millisecond precision, using
 /// `humantime::format_rfc3339_millis`.
 fn ser_rfc3339_millis<S: serde::Serializer>(
    ts: &SystemTime,
    serializer: S,
 ) -> Result<S::Ok, S::Error> {
    // `collect_str` serializes the formatter's `Display` output as a string.
    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
 }
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
 ///
 /// Instead of a newtype, use this because a newtype would require handling deserializing values
@@ -50,9 +58,7 @@ mod tests {
            disk_usage_bytes: u64::MAX,
            free_space_bytes: 0,
            utilization_score: u64::MAX,
-            captured_at: SystemTime(
+            captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
                std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
            ),
        };
        let s = serde_json::to_string(&doc).unwrap();
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -8,89 +8,12 @@ use hex::FromHex;
 use serde::{Deserialize, Serialize};
 use utils::id::TenantId;
 /// See docs/rfcs/031-sharding-static.md for an overview of sharding.
 ///
 /// This module contains a variety of types used to represent the concept of sharding
 /// a Neon tenant across multiple physical shards.  Since there are quite a few of these,
 /// we provide a summary here.
 ///
 /// Types used to describe shards:
 /// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
 ///   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
 ///   a shard suffix.
 /// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
 /// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
 ///   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
 ///   tenant, such as layer files.
 /// - [`ShardIdentity`] is the full description of a particular shard's parameters, in sufficient
 ///   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
 /// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
 ///   four hex digits.  An unsharded tenant is `0000`.
 /// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
 ///
 /// Types used to describe the parameters for data distribution in a sharded tenant:
 /// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
 ///   multiple shards.  Its value is given in 8kiB pages.
 /// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
 ///   always zero: this is provided for future upgrades that might introduce different
 ///   data distribution schemes.
 ///
 /// Examples:
 /// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
 /// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
 /// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
 ///   and their slugs are 0004, 0104, 0204, and 0304.
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardNumber(pub u8);
 /// Number of shards that make up a tenant. The value `0` is the special
 /// "unsharded" count (see `ShardCount::is_unsharded`).
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardCount(u8);
 /// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
 /// when we need to know which shard we're dealing with, but do not need to know the full
 /// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
 /// the fully qualified TenantShardId.
 #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct ShardIndex {
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
 }
 /// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
 /// and to check whether that [`ShardNumber`] is the same as the current shard.
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardIdentity {
    pub number: ShardNumber,
    pub count: ShardCount,
    pub stripe_size: ShardStripeSize,
    layout: ShardLayout,
 }
 /// Formatting helper, for generating the `shard_id` label in traces.
 struct ShardSlug<'a>(&'a TenantShardId);
 /// TenantShardId globally identifies a particular shard in a particular tenant.
 ///
 /// These are written as `<TenantId>-<ShardSlug>`, for example:
 ///   # The second shard in a two-shard tenant
 ///   072f1291a5310026820b2fe4b2968934-0102
 ///
 /// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
 /// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
 /// an unsharded [`TenantShardId`] to be used interchangeably with a [`TenantId`].
 ///
 /// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
 /// is both forward and backward compatible with TenantId: a legacy TenantId can be
 /// decoded as a TenantShardId, and when re-encoded it will be parseable
 /// as a TenantId.
 #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct TenantShardId {
    pub tenant_id: TenantId,
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
 }
 impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);
@@ -115,7 +38,6 @@ impl ShardCount {
        self.0
    }
    /// Returns true if this is the special "unsharded" count, i.e. the inner
    /// value is zero. This is distinct from a count of one shard.
    pub fn is_unsharded(&self) -> bool {
        self.0 == 0
    }
@@ -131,6 +53,33 @@ impl ShardNumber {
    pub const MAX: Self = Self(u8::MAX);
 }
 /// TenantShardId identifies the units of work for the Pageserver.
 ///
 /// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
 ///
 ///   # The second shard in a two-shard tenant
 ///   072f1291a5310026820b2fe4b2968934-0102
 ///
 /// Historically, tenants could not have multiple shards, and were identified
 /// by TenantId.  To support this, TenantShardId has a special legacy
 /// mode where `shard_count` is equal to zero: this represents a single-sharded
 /// tenant which should be written as a TenantId with no suffix.
 ///
 /// The human-readable encoding of TenantShardId, such as used in API URLs,
 /// is both forward and backward compatible: a legacy TenantId can be
 /// decoded as a TenantShardId, and when re-encoded it will be parseable
 /// as a TenantId.
 ///
 /// Note that the binary encoding is _not_ backward compatible, because
 /// at the time sharding is introduced, there are no existing binary structures
 /// containing TenantId that we need to handle.
 #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct TenantShardId {
    pub tenant_id: TenantId,
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
 }
 impl TenantShardId {
    pub fn unsharded(tenant_id: TenantId) -> Self {
        Self {
@@ -162,13 +111,10 @@ impl TenantShardId {
    }
    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_shard_zero(&self) -> bool {
+    pub fn is_zero(&self) -> bool {
        self.shard_number == ShardNumber(0)
    }
    /// The "unsharded" value is distinct from simply having a single shard: it represents
    /// a tenant which is not shard-aware at all, and whose storage paths will not include
    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
    }
@@ -204,6 +150,9 @@ impl TenantShardId {
    }
 }
 /// Formatting helper
 struct ShardSlug<'a>(&'a TenantShardId);
 impl<'a> std::fmt::Display for ShardSlug<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
@@ -273,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
    }
 }
 /// For use within the context of a particular tenant, when we need to know which
 /// shard we're dealing with, but do not need to know the full ShardIdentity (because
 /// we won't be doing any page->shard mapping), and do not need to know the fully qualified
 /// TenantShardId.
 #[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct ShardIndex {
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
 }
 impl ShardIndex {
    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
        Self {
@@ -287,9 +246,6 @@ impl ShardIndex {
        }
    }
    /// The "unsharded" value is distinct from simply having a single shard: it represents
    /// a tenant which is not shard-aware at all, and whose storage paths will not include
    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }
@@ -357,8 +313,6 @@ impl Serialize for TenantShardId {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            // Note: while human encoding of [`TenantShardId`] is backward and forward
            // compatible, this binary encoding is not.
            let mut packed: [u8; 18] = [0; 18];
            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
            packed[16] = self.shard_number.0;
@@ -436,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 /// The ShardIdentity contains the information needed for one member of map
 /// to resolve a key to a shard, and then check whether that shard is ==self.
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardIdentity {
    pub number: ShardNumber,
    pub count: ShardCount,
    pub stripe_size: ShardStripeSize,
    layout: ShardLayout,
 }
 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
 pub enum ShardConfigError {
    #[error("Invalid shard count")]
@@ -475,9 +439,6 @@ impl ShardIdentity {
        }
    }
    /// The "unsharded" value is distinct from simply having a single shard: it represents
    /// a tenant which is not shard-aware at all, and whose storage paths will not include
    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.number == ShardNumber(0) && self.count == ShardCount(0)
    }
@@ -526,8 +487,6 @@ impl ShardIdentity {
    }
    /// Return true if the key should be ingested by this shard
    ///
    /// Shards must ingest _at least_ keys which return true from this check.
    pub fn is_key_local(&self, key: &Key) -> bool {
        assert!(!self.is_broken());
        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -538,9 +497,7 @@ impl ShardIdentity {
    }
    /// Return true if the key should be discarded if found in this shard's
-    /// data store, e.g. during compaction after a split.
+    /// data store, e.g. during compaction after a split
    ///
    /// Shards _may_ drop keys which return false here, but are not obliged to.
    pub fn is_key_disposable(&self, key: &Key) -> bool {
        if key_is_shard0(key) {
            // Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -566,7 +523,7 @@ impl ShardIdentity {
    /// Convenience for checking if this identity is the 0th shard in a tenant,
    /// for special cases on shard 0 such as ingesting relation sizes.
-    pub fn is_shard_zero(&self) -> bool {
+    pub fn is_zero(&self) -> bool {
        self.number == ShardNumber(0)
    }
 }
--- a/libs/pageserver_api/src/upcall_api.rs
+++ b/libs/pageserver_api/src/upcall_api.rs
@@ -6,9 +6,7 @@
 use serde::{Deserialize, Serialize};
 use utils::id::NodeId;
-use crate::{
+use crate::{controller_api::NodeRegisterRequest, shard::TenantShardId};
    controller_api::NodeRegisterRequest, models::LocationConfigMode, shard::TenantShardId,
 };
 /// Upcall message sent by the pageserver to the configured `control_plane_api` on
 /// startup.
@@ -22,20 +20,12 @@ pub struct ReAttachRequest {
    pub register: Option<NodeRegisterRequest>,
 }
-fn default_mode() -> LocationConfigMode {
+#[derive(Serialize, Deserialize)]
    LocationConfigMode::AttachedSingle
 }
 #[derive(Serialize, Deserialize, Debug)]
 pub struct ReAttachResponseTenant {
    pub id: TenantShardId,
-    /// Mandatory if LocationConfigMode is None or set to an Attached* mode
+    pub gen: u32,
    pub gen: Option<u32>,
    /// Default value only for backward compat: this field should be set
    #[serde(default = "default_mode")]
    pub mode: LocationConfigMode,
 }
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponse {
    pub tenants: Vec<ReAttachResponseTenant>,
--- a/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
+++ b/libs/postgres_ffi/wal_craft/src/bin/wal_craft.rs
@@ -1,6 +1,5 @@
 use anyhow::*;
 use clap::{value_parser, Arg, ArgMatches, Command};
 use postgres::Client;
 use std::{path::PathBuf, str::FromStr};
 use wal_craft::*;
@@ -9,8 +8,8 @@ fn main() -> Result<()> {
        .init();
    let arg_matches = cli().get_matches();
-    let wal_craft = |arg_matches: &ArgMatches, client: &mut Client| {
+    let wal_craft = |arg_matches: &ArgMatches, client| {
-        let intermediate_lsns = match arg_matches
+        let (intermediate_lsns, end_of_wal_lsn) = match arg_matches
            .get_one::<String>("type")
            .map(|s| s.as_str())
            .context("'type' is required")?
@@ -26,7 +25,6 @@ fn main() -> Result<()> {
            LastWalRecordCrossingSegment::NAME => LastWalRecordCrossingSegment::craft(client)?,
            a => panic!("Unknown --type argument: {a}"),
        };
        let end_of_wal_lsn = client.pg_current_wal_insert_lsn()?;
        for lsn in intermediate_lsns {
            println!("intermediate_lsn = {lsn}");
        }
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -5,6 +5,7 @@ use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
 use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
 use std::cmp::Ordering;
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -231,52 +232,59 @@ pub fn ensure_server_config(client: &mut impl postgres::GenericClient) -> anyhow
 pub trait Crafter {
    const NAME: &'static str;
-    /// Generates WAL using the client `client`. Returns a vector of some valid
+    /// Generates WAL using the client `client`. Returns a pair of:
-    /// "interesting" intermediate LSNs which one may start reading from.
+    /// * A vector of some valid "interesting" intermediate LSNs which one may start reading from.
-    /// test_end_of_wal uses this to check various starting points.
+    ///   May include or exclude Lsn(0) and the end-of-wal.
-    ///
+    /// * The expected end-of-wal LSN.
-    /// Note that postgres is generally keen about writing some WAL. While we
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)>;
    /// try to disable it (autovacuum, big wal_writer_delay, etc) it is always
    /// possible, e.g. xl_running_xacts are dumped each 15s. So checks about
    /// stable WAL end would be flaky unless postgres is shut down. For this
    /// reason returning potential end of WAL here is pointless. Most of the
    /// time this doesn't happen though, so it is reasonable to create needed
    /// WAL structure and immediately kill postgres like test_end_of_wal does.
    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>>;
 }
 /// Wraps some WAL craft function, providing current LSN to it before the
 /// insertion and flushing WAL afterwards. Also pushes initial LSN to the
 /// result.
 fn craft_internal<C: postgres::GenericClient>(
    client: &mut C,
-    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<Vec<PgLsn>>,
+    f: impl Fn(&mut C, PgLsn) -> anyhow::Result<(Vec<PgLsn>, Option<PgLsn>)>,
-) -> anyhow::Result<Vec<PgLsn>> {
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
    ensure_server_config(client)?;
    let initial_lsn = client.pg_current_wal_insert_lsn()?;
    info!("LSN initial = {}", initial_lsn);
-    let mut intermediate_lsns = f(client, initial_lsn)?;
+    let (mut intermediate_lsns, last_lsn) = f(client, initial_lsn)?;
    let last_lsn = match last_lsn {
        None => client.pg_current_wal_insert_lsn()?,
        Some(last_lsn) => {
            let insert_lsn = client.pg_current_wal_insert_lsn()?;
            match last_lsn.cmp(&insert_lsn) {
                Ordering::Less => bail!(
                    "Some records were inserted after the crafted WAL: {} vs {}",
                    last_lsn,
                    insert_lsn
                ),
                Ordering::Equal => last_lsn,
                Ordering::Greater => bail!("Reported LSN is greater than insert_lsn"),
            }
        }
    };
    if !intermediate_lsns.starts_with(&[initial_lsn]) {
        intermediate_lsns.insert(0, initial_lsn);
    }
    // Some records may be not flushed, e.g. non-transactional logical messages.
    //
    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
    // because pg_current_wal_insert_lsn skips page headers.
    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
-    Ok(intermediate_lsns)
+    match last_lsn.cmp(&client.pg_current_wal_flush_lsn()?) {
        Ordering::Less => bail!("Some records were flushed after the crafted WAL"),
        Ordering::Equal => {}
        Ordering::Greater => bail!("Reported LSN is greater than flush_lsn"),
    }
    Ok((intermediate_lsns, last_lsn))
 }
 pub struct Simple;
 impl Crafter for Simple {
    const NAME: &'static str = "simple";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        craft_internal(client, |client, _| {
            client.execute("CREATE table t(x int)", &[])?;
-            Ok(Vec::new())
+            Ok((Vec::new(), None))
        })
    }
 }
@@ -284,36 +292,29 @@ impl Crafter for Simple {
 pub struct LastWalRecordXlogSwitch;
 impl Crafter for LastWalRecordXlogSwitch {
    const NAME: &'static str = "last_wal_record_xlog_switch";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
-        // Do not use craft_internal because here we end up with flush_lsn exactly on
+        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
        client.execute("CREATE table t(x int)", &[])?;
        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        // pg_switch_wal returns end of last record of the switched segment,
+        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
-        // i.e. end of SWITCH itself.
+        let next_segment = PgLsn::from(0x0200_0000);
        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
        let before_xlog_switch_u64 = u64::from(before_xlog_switch);
        let next_segment = PgLsn::from(
            before_xlog_switch_u64 - (before_xlog_switch_u64 % WAL_SEGMENT_SIZE as u64)
                + WAL_SEGMENT_SIZE as u64,
        );
        ensure!(
-            xlog_switch_record_end <= next_segment,
+            after_xlog_switch <= next_segment,
-            "XLOG_SWITCH record ended after the expected segment boundary: {} > {}",
+            "XLOG_SWITCH message ended after the expected segment boundary: {} > {}",
-            xlog_switch_record_end,
+            after_xlog_switch,
            next_segment
        );
-        Ok(vec![before_xlog_switch, xlog_switch_record_end])
+        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
 }
 pub struct LastWalRecordXlogSwitchEndsOnPageBoundary;
 /// Craft xlog SWITCH record ending at page boundary.
 impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
    const NAME: &'static str = "last_wal_record_xlog_switch_ends_on_page_boundary";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
        // Do not use generate_internal because here we end up with flush_lsn exactly on
        // the segment boundary and insert_lsn after the initial page header, which is unusual.
        ensure_server_config(client)?;
@@ -360,29 +361,28 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {
        // Emit the XLOG_SWITCH
        let before_xlog_switch = client.pg_current_wal_insert_lsn()?;
-        let xlog_switch_record_end: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
+        let after_xlog_switch: PgLsn = client.query_one("SELECT pg_switch_wal()", &[])?.get(0);
        let next_segment = PgLsn::from(0x0200_0000);
        ensure!(
-            xlog_switch_record_end < next_segment,
+            after_xlog_switch < next_segment,
-            "XLOG_SWITCH record ended on or after the expected segment boundary: {} > {}",
+            "XLOG_SWITCH message ended on or after the expected segment boundary: {} > {}",
-            xlog_switch_record_end,
+            after_xlog_switch,
            next_segment
        );
        ensure!(
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
+            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ == XLOG_SIZE_OF_XLOG_SHORT_PHD,
            "XLOG_SWITCH message ended not on page boundary: {}, offset = {}",
-            xlog_switch_record_end,
+            after_xlog_switch,
-            u64::from(xlog_switch_record_end) as usize % XLOG_BLCKSZ
+            u64::from(after_xlog_switch) as usize % XLOG_BLCKSZ
        );
-        Ok(vec![before_xlog_switch, xlog_switch_record_end])
+        Ok((vec![before_xlog_switch, after_xlog_switch], next_segment))
    }
 }
-/// Write ~16MB logical message; it should cross WAL segment.
+fn craft_single_logical_message(
 fn craft_seg_size_logical_message(
    client: &mut impl postgres::GenericClient,
    transactional: bool,
-) -> anyhow::Result<Vec<PgLsn>> {
+) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
    craft_internal(client, |client, initial_lsn| {
        ensure!(
            initial_lsn < PgLsn::from(0x0200_0000 - 1024 * 1024),
@@ -405,24 +405,34 @@ fn craft_seg_size_logical_message(
            "Logical message crossed two segments"
        );
-        Ok(vec![message_lsn])
+        if transactional {
            // Transactional logical messages are part of a transaction, so the one above is
            // followed by a small COMMIT record.
            let after_message_lsn = client.pg_current_wal_insert_lsn()?;
            ensure!(
                message_lsn < after_message_lsn,
                "No record found after the emitted message"
            );
            Ok((vec![message_lsn], Some(after_message_lsn)))
        } else {
            Ok((Vec::new(), Some(message_lsn)))
        }
    })
 }
 pub struct WalRecordCrossingSegmentFollowedBySmallOne;
 impl Crafter for WalRecordCrossingSegmentFollowedBySmallOne {
    const NAME: &'static str = "wal_record_crossing_segment_followed_by_small_one";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
-        // Transactional message crossing WAL segment will be followed by small
+        craft_single_logical_message(client, true)
        // commit record.
        craft_seg_size_logical_message(client, true)
    }
 }
 pub struct LastWalRecordCrossingSegment;
 impl Crafter for LastWalRecordCrossingSegment {
    const NAME: &'static str = "last_wal_record_crossing_segment";
-    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<Vec<PgLsn>> {
+    fn craft(client: &mut impl postgres::GenericClient) -> anyhow::Result<(Vec<PgLsn>, PgLsn)> {
-        craft_seg_size_logical_message(client, false)
+        craft_single_logical_message(client, false)
    }
 }
--- a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -11,15 +11,13 @@ use utils::const_assert;
 use utils::lsn::Lsn;
 fn init_logging() {
-    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(format!(
+    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
-        "crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"
+        format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
-    )))
+    ))
    .is_test(true)
    .try_init();
 }
 /// Test that find_end_of_wal returns the same results as pg_dump on various
 /// WALs created by Crafter.
 fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    use crate::*;
@@ -40,13 +38,13 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    }
    cfg.initdb().unwrap();
    let srv = cfg.start_server().unwrap();
-    let intermediate_lsns = C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
+    let (intermediate_lsns, expected_end_of_wal_partial) =
        C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
    let intermediate_lsns: Vec<Lsn> = intermediate_lsns
        .iter()
        .map(|&lsn| u64::from(lsn).into())
        .collect();
-    // Kill postgres. Note that it might have inserted to WAL something after
+    let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
    // 'craft' did its job.
    srv.kill();
    // Check find_end_of_wal on the initial WAL
@@ -58,7 +56,7 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
        .filter(|fname| IsXLogFileName(fname))
        .max()
        .unwrap();
-    let expected_end_of_wal = find_pg_waldump_end_of_wal(&cfg, &last_segment);
+    check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
    for start_lsn in intermediate_lsns
        .iter()
        .chain(std::iter::once(&expected_end_of_wal))
@@ -93,7 +91,11 @@ fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
    }
 }
-fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
+fn check_pg_waldump_end_of_wal(
    cfg: &crate::Conf,
    last_segment: &str,
    expected_end_of_wal: Lsn,
 ) {
    // Get the actual end of WAL by pg_waldump
    let waldump_output = cfg
        .pg_waldump("000000010000000000000001", last_segment)
@@ -111,8 +113,11 @@ fn find_pg_waldump_end_of_wal(cfg: &crate::Conf, last_segment: &str) -> Lsn {
        }
    };
    let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
-    info!("waldump erred on {}", waldump_wal_end);
+    info!(
-    waldump_wal_end
+        "waldump erred on {}, expected wal end at {}",
        waldump_wal_end, expected_end_of_wal
    );
    assert_eq!(waldump_wal_end, expected_end_of_wal);
 }
 fn check_end_of_wal(
@@ -205,9 +210,9 @@ pub fn test_update_next_xid() {
 #[test]
 pub fn test_encode_logical_message() {
    let expected = [
-        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255, 38,
+        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
-        0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114, 101, 102,
+        38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
-        105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
+        101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
    ];
    let actual = encode_logical_message("prefix", "message");
    assert_eq!(expected, actual[..]);
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -18,7 +18,6 @@ camino.workspace = true
 humantime.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
 rand.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -157,8 +157,9 @@ impl AzureBlobStorage {
            let mut bufs = Vec::new();
            while let Some(part) = response.next().await {
                let part = part?;
                let etag_str: &str = part.blob.properties.etag.as_ref();
                if etag.is_none() {
-                    etag = Some(part.blob.properties.etag);
+                    etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
                }
                if last_modified.is_none() {
                    last_modified = Some(part.blob.properties.last_modified.into());
@@ -173,16 +174,6 @@ impl AzureBlobStorage {
                    .map_err(|e| DownloadError::Other(e.into()))?;
                bufs.push(data);
            }
            if bufs.is_empty() {
                return Err(DownloadError::Other(anyhow::anyhow!(
                    "Azure GET response contained no buffers"
                )));
            }
            // unwrap safety: if these were None, bufs would be empty and we would have returned an error already
            let etag = etag.unwrap();
            let last_modified = last_modified.unwrap();
            Ok(Download {
                download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
                etag,
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -42,9 +42,6 @@ pub use self::{
 };
 use s3_bucket::RequestKind;
 /// Azure SDK's ETag type is a simple String wrapper: we use this internally instead of repeating it here.
 pub use azure_core::Etag;
 pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
@@ -134,11 +131,6 @@ impl RemotePath {
    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
    pub fn add_trailing_slash(&self) -> Self {
        // Unwrap safety inputs are guararnteed to be valid UTF-8
        Self(format!("{}/", self.0).try_into().unwrap())
    }
 }
 /// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -162,21 +154,47 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
+    /// Lists all top level subdirectories for a given prefix
-    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
+    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
-    ///
+    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
-    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
+    /// so this method doesnt need to.
-    /// from the absolute root of the bucket.
+    async fn list_prefixes(
-    ///
+        &self,
-    /// `mode` configures whether to use a delimiter.  Without a delimiter all keys
+        prefix: Option<&RemotePath>,
-    /// within the prefix are listed in the `keys` of the result.  With a delimiter, any "directories" at the top level of
+        cancel: &CancellationToken,
-    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
+    ) -> Result<Vec<RemotePath>, DownloadError> {
-    /// returned in `keys` ().
+        let result = self
-    ///
+            .list(prefix, ListingMode::WithDelimiter, None, cancel)
-    /// `max_keys` controls the maximum number of keys that will be returned.  If this is None, this function
+            .await?
-    /// will iteratively call listobjects until it runs out of keys.  Note that this is not safe to use on
+            .prefixes;
-    /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
+        Ok(result)
    }
    /// Lists all files in directory "recursively"
    /// (not really recursively, because AWS has a flat namespace)
    /// Note: This is subtely different than list_prefixes,
    /// because it is for listing files instead of listing
    /// names sharing common prefixes.
    /// For example,
    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
    /// whereas,
    /// list_prefixes("foo/bar/") = ["cat", "dog"]
    /// See `test_real_s3.rs` for more details.
    ///
    /// max_keys limits max number of keys returned; None means unlimited.
    async fn list_files(
        &self,
        prefix: Option<&RemotePath>,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        let result = self
            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
            .await?
            .keys;
        Ok(result)
    }
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
@@ -273,9 +291,9 @@ pub type DownloadStream =
 pub struct Download {
    pub download_stream: DownloadStream,
    /// The last time the file was modified (`last-modified` HTTP header)
-    pub last_modified: SystemTime,
+    pub last_modified: Option<SystemTime>,
    /// A way to identify this specific version of the resource (`etag` HTTP header)
-    pub etag: Etag,
+    pub etag: Option<String>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
 }
@@ -315,6 +333,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }
    // A function for listing all the files in a "directory"
    // Example:
    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
    //
    // max_keys limits max number of keys returned; None means unlimited.
    pub async fn list_files(
        &self,
        folder: Option<&RemotePath>,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        match self {
            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
        }
    }
    // lists common *prefixes*, if any of files
    // Example:
    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        match self {
            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
        }
    }
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -509,16 +562,6 @@ impl GenericRemoteStorage {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct StorageMetadata(HashMap<String, String>);
 impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
    fn from(arr: [(&str, &str); N]) -> Self {
        let map: HashMap<String, String> = arr
            .iter()
            .map(|(k, v)| (k.to_string(), v.to_string()))
            .collect();
        Self(map)
    }
 }
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,10 +5,12 @@
 //! volume is mounted to the local FS.
 use std::{
-    collections::HashSet,
+    borrow::Cow,
    future::Future,
    io::ErrorKind,
    num::NonZeroU32,
-    time::{Duration, SystemTime, UNIX_EPOCH},
+    pin::Pin,
    time::{Duration, SystemTime},
 };
 use anyhow::{bail, ensure, Context};
@@ -20,15 +22,14 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tokio_util::{io::ReaderStream, sync::CancellationToken};
-use utils::crashsafe::path_with_suffix_extension;
+use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 use crate::{
    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };
 use super::{RemoteStorage, StorageMetadata};
 use crate::Etag;
 const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
@@ -91,47 +92,7 @@ impl LocalFs {
    #[cfg(test)]
    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
-        use std::{future::Future, pin::Pin};
+        Ok(get_all_files(&self.storage_root, true)
        fn get_all_files<'a, P>(
            directory_path: P,
        ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
        where
            P: AsRef<Utf8Path> + Send + Sync + 'a,
        {
            Box::pin(async move {
                let directory_path = directory_path.as_ref();
                if directory_path.exists() {
                    if directory_path.is_dir() {
                        let mut paths = Vec::new();
                        let mut dir_contents = fs::read_dir(directory_path).await?;
                        while let Some(dir_entry) = dir_contents.next_entry().await? {
                            let file_type = dir_entry.file_type().await?;
                            let entry_path =
                                Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
                                    anyhow::Error::msg(format!(
                                        "non-Unicode path: {}",
                                        pb.to_string_lossy()
                                    ))
                                })?;
                            if file_type.is_symlink() {
                                tracing::debug!("{entry_path:?} is a symlink, skipping")
                            } else if file_type.is_dir() {
                                paths.extend(get_all_files(&entry_path).await?.into_iter())
                            } else {
                                paths.push(entry_path);
                            }
                        }
                        Ok(paths)
                    } else {
                        bail!("Path {directory_path:?} is not a directory")
                    }
                } else {
                    Ok(Vec::new())
                }
            })
        }
        Ok(get_all_files(&self.storage_root)
            .await?
            .into_iter()
            .map(|path| {
@@ -153,20 +114,11 @@ impl LocalFs {
            None => self.storage_root.clone(),
        };
        eprintln!("local_fs list: searching from {full_path} for initial_dir");
        // If we were given a directory, we may use it as our starting point.
        // Otherwise, we must go up to the first ancestor dir that exists.  This is because
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
        // If there's no trailing slash, we have to start looking from one above: even if
        // `initial_dir` is a directory, we should still list any prefixes in the parent
        // that start with the same string.
        if !full_path.to_string().ends_with('/') {
            initial_dir.pop();
        }
        loop {
            // Did we make it to the root?
            if initial_dir.parent().is_none() {
@@ -197,8 +149,6 @@ impl LocalFs {
        // starts_with later.
        let prefix = full_path.as_str();
        eprintln!("local_fs list: initial_dir={initial_dir}");
        let mut files = vec![];
        let mut directory_queue = vec![initial_dir];
        while let Some(cur_folder) = directory_queue.pop() {
@@ -212,8 +162,6 @@ impl LocalFs {
                    if full_file_name.is_dir() {
                        directory_queue.push(full_file_name);
                    }
                } else {
                    eprintln!("Drop {full_file_name}, not in prefix");
                }
            }
        }
@@ -249,7 +197,6 @@ impl LocalFs {
            fs::OpenOptions::new()
                .write(true)
                .create(true)
                .truncate(true)
                .open(&temp_file_path)
                .await
                .with_context(|| {
@@ -343,76 +290,64 @@ impl RemoteStorage for LocalFs {
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<Listing, DownloadError> {
        if let Some(prefix) = prefix {
            eprintln!("local_fs list: prefix={}", prefix);
        }
        let op = async {
            let mut result = Listing::default();
-            // Filter out directories: in S3 directories don't exist, only the keys within them do.
+            if let ListingMode::NoDelimiter = mode {
-            let keys = self
+                let keys = self
-                .list_recursive(prefix)
+                    .list_recursive(prefix)
                    .await
                    .map_err(DownloadError::Other)?;
                result.keys = keys
                    .into_iter()
                    .filter(|k| {
                        let path = k.with_base(&self.storage_root);
                        !path.is_dir()
                    })
                    .collect();
                if let Some(max_keys) = max_keys {
                    result.keys.truncate(max_keys.get() as usize);
                }
                return Ok(result);
            }
            let path = match prefix {
                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
                None => Cow::Borrowed(&self.storage_root),
            };
            let prefixes_to_filter = get_all_files(path.as_ref(), false)
                .await
                .map_err(DownloadError::Other)?;
            let keys = keys
                .into_iter()
                .filter(|k| {
                    let path = k.with_base(&self.storage_root);
                    !path.is_dir()
                })
                .collect();
-            if let ListingMode::NoDelimiter = mode {
+            // filter out empty directories to mirror s3 behavior.
-                result.keys = keys;
+            for prefix in prefixes_to_filter {
-            } else {
+                if prefix.is_dir()
-                let mut prefixes = HashSet::new();
+                    && is_directory_empty(&prefix)
-                for key in keys {
+                        .await
-                    eprintln!("key: {key}");
+                        .map_err(DownloadError::Other)?
-                    // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
+                {
-                    let relative_key = if let Some(prefix) = prefix {
+                    continue;
-                        let mut prefix = prefix.clone();
+                }
-                        // We only strip the dirname of the prefix, so that when we strip it from the start of keys we
+
-                        // end up with full file/dir names.
+                let stripped = prefix
-                        let prefix_full_local_path = prefix.with_base(&self.storage_root);
+                    .strip_prefix(&self.storage_root)
-                        let has_slash = prefix.0.to_string().ends_with('/');
+                    .context("Failed to strip prefix")
-                        let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
+                    .and_then(RemotePath::new)
-                            prefix
+                    .expect(
-                        } else {
+                        "We list files for storage root, hence should be able to remote the prefix",
-                            prefix.0.pop();
+                    );
-                            prefix
+
-                        };
+                if prefix.is_dir() {
-                        eprintln!("strip_prefix={strip_prefix}");
+                    result.prefixes.push(stripped);
-
+                } else {
-                        RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
+                    result.keys.push(stripped);
                    } else {
                        key
                    };
                    eprintln!("relative_key: {relative_key}");
                    let relative_key = format!("{}", relative_key);
                    if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
                        let first_part = relative_key
                            .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
                            .next()
                            .unwrap()
                            .to_owned();
                        prefixes.insert(first_part);
                    } else {
                        result
                            .keys
                            .push(RemotePath::from_string(&relative_key).unwrap());
                    }
                }
                result.prefixes = prefixes
                    .into_iter()
                    .map(|s| RemotePath::from_string(&s).unwrap())
                    .collect();
            }
            if let Some(max_keys) = max_keys {
                result.keys.truncate(max_keys.get() as usize);
            }
            Ok(result)
        };
@@ -471,37 +406,35 @@ impl RemoteStorage for LocalFs {
        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
            let source = ReaderStream::new(
                fs::OpenOptions::new()
                    .read(true)
                    .open(&target_path)
                    .await
                    .with_context(|| {
                        format!("Failed to open source file {target_path:?} to use in the download")
                    })
                    .map_err(DownloadError::Other)?,
            );
-        let file_metadata = file_metadata(&target_path).await?;
+            let metadata = self
-
+                .read_storage_metadata(&target_path)
        let source = ReaderStream::new(
            fs::OpenOptions::new()
                .read(true)
                .open(&target_path)
                .await
-                .with_context(|| {
+                .map_err(DownloadError::Other)?;
                    format!("Failed to open source file {target_path:?} to use in the download")
                })
                .map_err(DownloadError::Other)?,
        );
-        let metadata = self
+            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
-            .read_storage_metadata(&target_path)
+            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
            .await
            .map_err(DownloadError::Other)?;
-        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+            Ok(Download {
-        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+                metadata,
-
+                last_modified: None,
-        let etag = mock_etag(&file_metadata);
+                etag: None,
-        Ok(Download {
+                download_stream: Box::pin(source),
-            metadata,
+            })
-            last_modified: file_metadata
+        } else {
-                .modified()
+            Err(DownloadError::NotFound)
-                .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
+        }
            etag,
            download_stream: Box::pin(source),
        })
    }
    async fn download_byte_range(
@@ -519,51 +452,50 @@ impl RemoteStorage for LocalFs {
                return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes")));
            }
        }
        let target_path = from.with_base(&self.storage_root);
-        let file_metadata = file_metadata(&target_path).await?;
+        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-        let mut source = tokio::fs::OpenOptions::new()
+            let mut source = tokio::fs::OpenOptions::new()
-            .read(true)
+                .read(true)
-            .open(&target_path)
+                .open(&target_path)
-            .await
+                .await
-            .with_context(|| {
+                .with_context(|| {
-                format!("Failed to open source file {target_path:?} to use in the download")
+                    format!("Failed to open source file {target_path:?} to use in the download")
                })
                .map_err(DownloadError::Other)?;
            let len = source
                .metadata()
                .await
                .context("query file length")
                .map_err(DownloadError::Other)?
                .len();
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
                .context("Failed to seek to the range start in a local storage file")
                .map_err(DownloadError::Other)?;
            let metadata = self
                .read_storage_metadata(&target_path)
                .await
                .map_err(DownloadError::Other)?;
            let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
            let source = ReaderStream::new(source);
            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
            Ok(Download {
                metadata,
                last_modified: None,
                etag: None,
                download_stream: Box::pin(source),
            })
-            .map_err(DownloadError::Other)?;
+        } else {
-
+            Err(DownloadError::NotFound)
-        let len = source
+        }
            .metadata()
            .await
            .context("query file length")
            .map_err(DownloadError::Other)?
            .len();
        source
            .seek(io::SeekFrom::Start(start_inclusive))
            .await
            .context("Failed to seek to the range start in a local storage file")
            .map_err(DownloadError::Other)?;
        let metadata = self
            .read_storage_metadata(&target_path)
            .await
            .map_err(DownloadError::Other)?;
        let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
        let source = ReaderStream::new(source);
        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
        let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
        let etag = mock_etag(&file_metadata);
        Ok(Download {
            metadata,
            last_modified: file_metadata
                .modified()
                .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?,
            etag,
            download_stream: Box::pin(source),
        })
    }
    async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
@@ -623,6 +555,50 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
    path_with_suffix_extension(original_path, "metadata")
 }
 fn get_all_files<'a, P>(
    directory_path: P,
    recursive: bool,
 ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
 where
    P: AsRef<Utf8Path> + Send + Sync + 'a,
 {
    Box::pin(async move {
        let directory_path = directory_path.as_ref();
        if directory_path.exists() {
            if directory_path.is_dir() {
                let mut paths = Vec::new();
                let mut dir_contents = fs::read_dir(directory_path).await?;
                while let Some(dir_entry) = dir_contents.next_entry().await? {
                    let file_type = dir_entry.file_type().await?;
                    let entry_path =
                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
                            anyhow::Error::msg(format!(
                                "non-Unicode path: {}",
                                pb.to_string_lossy()
                            ))
                        })?;
                    if file_type.is_symlink() {
                        debug!("{entry_path:?} is a symlink, skipping")
                    } else if file_type.is_dir() {
                        if recursive {
                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
                        } else {
                            paths.push(entry_path)
                        }
                    } else {
                        paths.push(entry_path);
                    }
                }
                Ok(paths)
            } else {
                bail!("Path {directory_path:?} is not a directory")
            }
        } else {
            Ok(Vec::new())
        }
    })
 }
 async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
    let target_dir = match target_file_path.parent() {
        Some(parent_dir) => parent_dir,
@@ -634,22 +610,13 @@ async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<
    Ok(())
 }
-async fn file_metadata(file_path: &Utf8Path) -> Result<std::fs::Metadata, DownloadError> {
+fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
-    tokio::fs::metadata(&file_path).await.map_err(|e| {
+    if file_path.exists() {
-        if e.kind() == ErrorKind::NotFound {
+        ensure!(file_path.is_file(), "file path '{file_path}' is not a file");
-            DownloadError::NotFound
+        Ok(true)
-        } else {
+    } else {
-            DownloadError::BadInput(e.into())
+        Ok(false)
-        }
+    }
    })
 }
 // Use mtime as stand-in for ETag.  We could calculate a meaningful one by md5'ing the contents of files we
 // read, but that's expensive and the local_fs test helper's whole reason for existence is to run small tests
 // quickly, with less overhead than using a mock S3 server.
 fn mock_etag(meta: &std::fs::Metadata) -> Etag {
    let mtime = meta.modified().expect("Filesystem mtime missing");
    format!("{}", mtime.duration_since(UNIX_EPOCH).unwrap().as_millis()).into()
 }
 #[cfg(test)]
@@ -942,18 +909,13 @@ mod fs_tests {
        // No delimiter: should recursively list everything
        let (storage, cancel) = create_storage()?;
        let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
        let child_sibling =
            upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;
        let listing = storage
            .list(None, ListingMode::NoDelimiter, None, &cancel)
            .await?;
        assert!(listing.prefixes.is_empty());
-        assert_eq!(
+        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());
            listing.keys,
            [uncle.clone(), child.clone(), child_sibling.clone()].to_vec()
        );
        // Delimiter: should only go one deep
        let listing = storage
@@ -966,25 +928,7 @@ mod fs_tests {
        );
        assert!(listing.keys.is_empty());
-        // Delimiter & prefix with a trailing slash
+        // Delimiter & prefix
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
                ListingMode::WithDelimiter,
                None,
                &cancel,
            )
            .await?;
        assert_eq!(
            listing.keys,
            [RemotePath::from_string("uncle").unwrap()].to_vec()
        );
        assert_eq!(
            listing.prefixes,
            [RemotePath::from_string("parent").unwrap()].to_vec()
        );
        // Delimiter and prefix without a trailing slash
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -993,66 +937,12 @@ mod fs_tests {
                &cancel,
            )
            .await?;
        assert_eq!(listing.keys, [].to_vec());
        assert_eq!(
            listing.prefixes,
-            [RemotePath::from_string("grandparent").unwrap()].to_vec()
+            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
-        );
+                .to_vec()
        // Delimiter and prefix that's partway through a path component
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
                ListingMode::WithDelimiter,
                None,
                &cancel,
            )
            .await?;
        assert_eq!(listing.keys, [].to_vec());
        assert_eq!(
            listing.prefixes,
            [RemotePath::from_string("grandparent").unwrap()].to_vec()
        );
        Ok(())
    }
    #[tokio::test]
    async fn list_part_component() -> anyhow::Result<()> {
        // No delimiter: should recursively list everything
        let (storage, cancel) = create_storage()?;
        // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
        // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
        // a freeform prefix.
        let _child_a =
            upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
        let _child_b =
            upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
        // Delimiter and prefix that's partway through a path component
        let listing = storage
            .list(
                Some(
                    &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
                ),
                ListingMode::WithDelimiter,
                None,
                &cancel,
            )
            .await?;
        assert_eq!(listing.keys, [].to_vec());
        let mut found_prefixes = listing.prefixes.clone();
        found_prefixes.sort();
        assert_eq!(
            found_prefixes,
            [
                RemotePath::from_string("tenant").unwrap(),
                RemotePath::from_string("tenant-01").unwrap(),
            ]
            .to_vec()
        );
        assert_eq!(listing.keys, [uncle.clone()].to_vec());
        Ok(())
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -35,8 +35,8 @@ use aws_sdk_s3::{
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
 use aws_smithy_types::byte_stream::ByteStream;
 use aws_smithy_types::{body::SdkBody, DateTime};
 use aws_smithy_types::{byte_stream::ByteStream, date_time::ConversionError};
 use bytes::Bytes;
 use futures::stream::Stream;
 use hyper::Body;
@@ -178,7 +178,10 @@ impl S3Bucket {
    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path.get_path().as_str();
+        let path_string = path
            .get_path()
            .as_str()
            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
        match &self.prefix_in_bucket {
            Some(prefix) => prefix.clone() + "/" + path_string,
            None => path_string.to_string(),
@@ -284,17 +287,8 @@ impl S3Bucket {
        let remaining = self.timeout.saturating_sub(started_at.elapsed());
        let metadata = object_output.metadata().cloned().map(StorageMetadata);
-        let etag = object_output
+        let etag = object_output.e_tag;
-            .e_tag
+        let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
            .ok_or(DownloadError::Other(anyhow::anyhow!("Missing ETag header")))?
            .into();
        let last_modified = object_output
            .last_modified
            .ok_or(DownloadError::Other(anyhow::anyhow!(
                "Missing LastModified header"
            )))?
            .try_into()
            .map_err(|e: ConversionError| DownloadError::Other(e.into()))?;
        let body = object_output.body;
        let body = ByteStreamAsStream::from(body);
@@ -468,7 +462,17 @@ impl RemoteStorage for S3Bucket {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| self.prefix_in_bucket.clone().map(|s| s + "/"));
+            .or_else(|| self.prefix_in_bucket.clone())
            .map(|mut p| {
                // required to end with a separator
                // otherwise request will return only the entry of a prefix
                if matches!(mode, ListingMode::WithDelimiter)
                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
                {
                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
                }
                p
            });
        let _permit = self.permit(kind, cancel).await?;
@@ -536,15 +540,11 @@ impl RemoteStorage for S3Bucket {
                }
            }
-            // S3 gives us prefixes like "foo/", we return them like "foo"
+            result.prefixes.extend(
-            result.prefixes.extend(prefixes.iter().filter_map(|o| {
+                prefixes
-                Some(
+                    .iter()
-                    self.s3_object_to_relative_path(
+                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
-                        o.prefix()?
+            );
                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
                    ),
                )
            }));
            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
@@ -1041,22 +1041,22 @@ mod tests {
            Some("/test/prefix/"),
        ];
        let expected_outputs = [
-            vec!["", "some/path", "some/path/"],
+            vec!["", "some/path", "some/path"],
-            vec!["/", "/some/path", "/some/path/"],
+            vec!["/", "/some/path", "/some/path"],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path/",
+                "test/prefix/some/path",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path/",
+                "test/prefix/some/path",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path/",
+                "test/prefix/some/path",
            ],
        ];
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -107,6 +107,27 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;
 impl RemoteStorage for UnreliableWrapper {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
            .map_err(DownloadError::Other)?;
        self.inner.list_prefixes(prefix, cancel).await
    }
    async fn list_files(
        &self,
        folder: Option<&RemotePath>,
        max_keys: Option<NonZeroU32>,
        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
            .map_err(DownloadError::Other)?;
        self.inner.list_files(folder, max_keys, cancel).await
    }
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,6 +1,5 @@
 use anyhow::Context;
 use camino::Utf8Path;
 use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
@@ -55,9 +54,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
-        .list(None, ListingMode::WithDelimiter, None, &cancel)
+        .list_prefixes(None, &cancel)
-        .await?
+        .await
-        .prefixes
+        .context("client list root prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
@@ -66,14 +65,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    );
    let nested_remote_prefixes = test_client
-        .list(
+        .list_prefixes(Some(&base_prefix), &cancel)
-            Some(&base_prefix.add_trailing_slash()),
+        .await
-            ListingMode::WithDelimiter,
+        .context("client list nested prefixes failure")?
            None,
            &cancel,
        )
        .await?
        .prefixes
        .into_iter()
        .collect::<HashSet<_>>();
    let remote_only_prefixes = nested_remote_prefixes
@@ -96,13 +90,11 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
 ///
 /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
 /// Then performs the following queries:
-///    1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
+///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
 #[tokio::test]
-async fn list_no_delimiter_works(
+async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
 ) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -115,36 +107,29 @@ async fn list_no_delimiter_works(
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
-        .list(None, ListingMode::NoDelimiter, None, &cancel)
+        .list_files(None, None, &cancel)
        .await
        .context("client list root files failure")?
        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_files,
        ctx.remote_blobs.clone(),
-        "remote storage list on root mismatches with the uploads."
+        "remote storage list_files on root mismatches with the uploads."
    );
    // Test that max_keys limit works. In total there are about 21 files (see
    // upload_simple_remote_data call in test_real_s3.rs).
    let limited_root_files = test_client
-        .list(
+        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
            None,
            ListingMode::NoDelimiter,
            Some(NonZeroU32::new(2).unwrap()),
            &cancel,
        )
        .await
        .context("client list root files failure")?;
-    assert_eq!(limited_root_files.keys.len(), 2);
+    assert_eq!(limited_root_files.len(), 2);
    let nested_remote_files = test_client
-        .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
+        .list_files(Some(&base_prefix), None, &cancel)
        .await
        .context("client list nested files failure")?
        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    let trim_remote_blobs: HashSet<_> = ctx
@@ -156,7 +141,7 @@ async fn list_no_delimiter_works(
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
-        "remote storage list on subdirrectory mismatches with the uploads."
+        "remote storage list_files on subdirrectory mismatches with the uploads."
    );
    Ok(())
 }
@@ -214,11 +199,7 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
    ctx.client.delete_objects(&[path1, path2], &cancel).await?;
-    let prefixes = ctx
+    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;
        .client
        .list(None, ListingMode::WithDelimiter, None, &cancel)
        .await?
        .prefixes;
    assert_eq!(prefixes.len(), 1);
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
    Disabled,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -132,6 +134,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }
 // NOTE: the setups for the list_prefixes test and the list_files test are very similar
 // However, they are not idential. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(AzureWithSimpleTestBlobs),
    Disabled,
@@ -142,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -12,8 +12,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures_util::StreamExt;
 use remote_storage::{
-    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
+    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
-    RemoteStorageKind, S3Config,
+    S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -75,14 +75,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
        client: &Arc<GenericRemoteStorage>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(
+        Ok(retry(|| client.list_files(None, None, cancel))
-            retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
+            .await
-                .await
+            .context("list root files failure")?
-                .context("list root files failure")?
+            .into_iter()
-                .keys
+            .collect::<HashSet<_>>())
                .into_iter()
                .collect::<HashSet<_>>(),
        )
    }
    let cancel = CancellationToken::new();
@@ -121,7 +118,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    // A little check to ensure that our clock is not too far off from the S3 clock
    {
        let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
-        let last_modified = dl.last_modified;
+        let last_modified = dl.last_modified.unwrap();
        let half_wt = WAIT_TIME.mul_f32(0.5);
        let t0_hwt = t0 + half_wt;
        let t1_hwt = t1 - half_wt;
@@ -222,6 +219,7 @@ enum MaybeEnabledStorage {
    Disabled,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -250,6 +248,7 @@ struct S3WithTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -297,6 +296,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }
 // NOTE: the setups for the list_prefixes test and the list_files test are very similar
 // However, they are not identical. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
@@ -307,6 +310,7 @@ struct S3WithSimpleTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }
 #[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
--- a/libs/tenant_size_model/tests/tests.rs
+++ b/libs/tenant_size_model/tests/tests.rs
@@ -247,7 +247,7 @@ fn scenario_4() {
    //
    // This is in total 5000 + 1000 + 5000 + 1000 = 12000
    //
-    // (If we used the method from the previous scenario, and
+    // (If we used the the method from the previous scenario, and
    // kept only snapshot at the branch point, we'd need to keep
    // all the WAL between 10000-18000 on the main branch, so
    // the total size would be 5000 + 1000 + 8000 = 14000. The
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -13,7 +13,6 @@ testing = ["fail/failpoints"]
 [dependencies]
 arc-swap.workspace = true
 sentry.workspace = true
 async-compression.workspace = true
 async-trait.workspace = true
 anyhow.workspace = true
 bincode.workspace = true
@@ -22,7 +21,6 @@ camino.workspace = true
 chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
 humantime.workspace = true
 hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
@@ -38,7 +36,6 @@ serde_json.workspace = true
 signal-hook.workspace = true
 thiserror.workspace = true
 tokio.workspace = true
 tokio-tar.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 tracing-error.workspace = true
@@ -49,7 +46,6 @@ strum.workspace = true
 strum_macros.workspace = true
 url.workspace = true
 uuid.workspace = true
 walkdir.workspace = true
 pq_proto.workspace = true
 postgres_connection.workspace = true
--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -1,21 +0,0 @@
 //! Wrapper around `std::env::var` for parsing environment variables.
 use std::{fmt::Display, str::FromStr};
 /// Read environment variable `varname` and parse it into `V`.
 ///
 /// Returns `None` when the variable is not set. Panics when the value is not
 /// valid unicode or when it cannot be parsed as `V`.
 pub fn var<V, E>(varname: &str) -> Option<V>
 where
     V: FromStr<Err = E>,
     E: Display,
 {
     // Fetch the raw string first; an unset variable is the only non-panicking miss.
     let raw = match std::env::var(varname) {
         Ok(raw) => raw,
         Err(std::env::VarError::NotPresent) => return None,
         Err(std::env::VarError::NotUnicode(_)) => {
             panic!("env var {varname} is not unicode")
         }
     };
     // A present-but-unparsable value is treated as fatal: unwrap the annotated error.
     let parsed: Result<V, String> = raw
         .parse()
         .map_err(|e| format!("failed to parse env var {varname}: {e:#}"));
     Some(parsed.unwrap())
 }
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -34,8 +34,6 @@ pub enum Generation {
 /// scenarios where pageservers might otherwise issue conflicting writes to
 /// remote storage
 impl Generation {
    pub const MAX: Self = Self::Valid(u32::MAX);
    /// Create a new Generation that represents a legacy key format with
    /// no generation suffix
    pub fn none() -> Self {
--- a/libs/utils/src/history_buffer.rs
+++ b/libs/utils/src/history_buffer.rs
@@ -47,10 +47,9 @@ impl<T, const L: usize> ops::Deref for HistoryBufferWithDropCounter<T, L> {
    }
 }
-#[derive(serde::Serialize, serde::Deserialize)]
+#[derive(serde::Serialize)]
 struct SerdeRepr<T> {
    buffer: Vec<T>,
    buffer_size: usize,
    drop_count: u64,
 }
@@ -62,7 +61,6 @@ where
        let HistoryBufferWithDropCounter { buffer, drop_count } = value;
        SerdeRepr {
            buffer: buffer.iter().cloned().collect(),
            buffer_size: L,
            drop_count: *drop_count,
        }
    }
@@ -80,52 +78,19 @@ where
    }
 }
 impl<'de, T, const L: usize> serde::de::Deserialize<'de> for HistoryBufferWithDropCounter<T, L>
 where
     T: Clone + serde::Deserialize<'de>,
 {
     /// Deserialize through the intermediate `SerdeRepr`, rejecting input whose
     /// recorded `buffer_size` differs from this type's capacity `L`.
     fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
     where
         D: serde::Deserializer<'de>,
     {
         let repr = SerdeRepr::<T>::deserialize(deserializer)?;
         let buffer_size = repr.buffer_size;
         if buffer_size != L {
             use serde::de::Error;
             return Err(D::Error::custom(format!(
                 "invalid buffer_size, expecting {L} got {buffer_size}"
             )));
         }
         // Refill a fresh ring buffer with the serialized elements, in order.
         let mut buffer = HistoryBuffer::new();
         buffer.extend(repr.buffer);
         let drop_count = repr.drop_count;
         Ok(HistoryBufferWithDropCounter { buffer, drop_count })
     }
 }
 #[cfg(test)]
 mod test {
    use super::HistoryBufferWithDropCounter;
    #[test]
    fn test_basics() {
-        let mut b = HistoryBufferWithDropCounter::<usize, 2>::default();
+        let mut b = HistoryBufferWithDropCounter::<_, 2>::default();
        b.write(1);
        b.write(2);
        b.write(3);
        assert!(b.iter().any(|e| *e == 2));
        assert!(b.iter().any(|e| *e == 3));
        assert!(!b.iter().any(|e| *e == 1));
        // round-trip serde
        let round_tripped: HistoryBufferWithDropCounter<usize, 2> =
            serde_json::from_str(&serde_json::to_string(&b).unwrap()).unwrap();
        assert_eq!(
            round_tripped.iter().cloned().collect::<Vec<_>>(),
            b.iter().cloned().collect::<Vec<_>>()
        );
    }
    #[test]
--- a/libs/utils/src/http/endpoint.rs
+++ b/libs/utils/src/http/endpoint.rs
@@ -245,7 +245,7 @@ impl std::io::Write for ChannelWriter {
    }
 }
-pub async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+async fn prometheus_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    SERVE_METRICS_COUNT.inc();
    let started_at = std::time::Instant::now();
@@ -367,6 +367,7 @@ pub fn make_router() -> RouterBuilder<hyper::Body, ApiError> {
        .middleware(Middleware::post_with_info(
            add_request_id_header_to_response,
        ))
        .get("/metrics", |r| request_span(r, prometheus_metrics_handler))
        .err_handler(route_error_handler)
 }
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -63,7 +63,6 @@ pub mod measured_stream;
 pub mod serde_percent;
 pub mod serde_regex;
 pub mod serde_system_time;
 pub mod pageserver_feedback;
@@ -88,12 +87,6 @@ pub mod failpoint_support;
 pub mod yielding_loop;
 pub mod zstd;
 pub mod env;
 pub mod poison;
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/lock_file.rs
+++ b/libs/utils/src/lock_file.rs
@@ -63,7 +63,6 @@ impl UnwrittenLockFile {
 pub fn create_exclusive(lock_file_path: &Utf8Path) -> anyhow::Result<UnwrittenLockFile> {
    let lock_file = fs::OpenOptions::new()
        .create(true) // O_CREAT
        .truncate(true)
        .write(true)
        .open(lock_file_path)
        .context("open lock file")?;
--- a/libs/utils/src/pageserver_feedback.rs
+++ b/libs/utils/src/pageserver_feedback.rs
@@ -29,10 +29,12 @@ pub struct PageserverFeedback {
    // Serialize with RFC3339 format.
    #[serde(with = "serde_systemtime")]
    pub replytime: SystemTime,
    /// Used to track feedbacks from different shards. Always zero for unsharded tenants.
    pub shard_number: u32,
 }
 // NOTE: Do not forget to increment this number when adding new fields to PageserverFeedback.
 // Do not remove previously available fields because this might be backwards incompatible.
 pub const PAGESERVER_FEEDBACK_FIELDS_NUMBER: u8 = 5;
 impl PageserverFeedback {
    pub fn empty() -> PageserverFeedback {
        PageserverFeedback {
@@ -41,7 +43,6 @@ impl PageserverFeedback {
            remote_consistent_lsn: Lsn::INVALID,
            disk_consistent_lsn: Lsn::INVALID,
            replytime: *PG_EPOCH,
            shard_number: 0,
        }
    }
@@ -58,26 +59,17 @@ impl PageserverFeedback {
    //
    // TODO: change serialized fields names once all computes migrate to rename.
    pub fn serialize(&self, buf: &mut BytesMut) {
-        let buf_ptr = buf.len();
+        buf.put_u8(PAGESERVER_FEEDBACK_FIELDS_NUMBER); // # of keys
        buf.put_u8(0); // # of keys, will be filled later
        let mut nkeys = 0;
        nkeys += 1;
        buf.put_slice(b"current_timeline_size\0");
        buf.put_i32(8);
        buf.put_u64(self.current_timeline_size);
        nkeys += 1;
        buf.put_slice(b"ps_writelsn\0");
        buf.put_i32(8);
        buf.put_u64(self.last_received_lsn.0);
        nkeys += 1;
        buf.put_slice(b"ps_flushlsn\0");
        buf.put_i32(8);
        buf.put_u64(self.disk_consistent_lsn.0);
        nkeys += 1;
        buf.put_slice(b"ps_applylsn\0");
        buf.put_i32(8);
        buf.put_u64(self.remote_consistent_lsn.0);
@@ -88,19 +80,9 @@ impl PageserverFeedback {
            .expect("failed to serialize pg_replytime earlier than PG_EPOCH")
            .as_micros() as i64;
        nkeys += 1;
        buf.put_slice(b"ps_replytime\0");
        buf.put_i32(8);
        buf.put_i64(timestamp);
        if self.shard_number > 0 {
            nkeys += 1;
            buf.put_slice(b"shard_number\0");
            buf.put_i32(4);
            buf.put_u32(self.shard_number);
        }
        buf[buf_ptr] = nkeys;
    }
    // Deserialize PageserverFeedback message
@@ -143,8 +125,9 @@ impl PageserverFeedback {
                }
                b"shard_number" => {
                    let len = buf.get_i32();
-                    assert_eq!(len, 4);
+                    // TODO: this will be implemented in the next update,
-                    rf.shard_number = buf.get_u32();
+                    //  for now, we just skip the value.
                    buf.advance(len as usize);
                }
                _ => {
                    let len = buf.get_i32();
@@ -217,7 +200,10 @@ mod tests {
        rf.serialize(&mut data);
        // Add an extra field to the buffer and adjust number of keys
-        data[0] += 1;
+        if let Some(first) = data.first_mut() {
            *first = PAGESERVER_FEEDBACK_FIELDS_NUMBER + 1;
        }
        data.put_slice(b"new_field_one\0");
        data.put_i32(8);
        data.put_u64(42);
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -1,121 +0,0 @@
 //!  Protect a piece of state from reuse after it is left in an inconsistent state.
 //!
 //!  # Example
 //!
 //!  ```
 //!  # tokio_test::block_on(async {
 //!  use utils::poison::Poison;
 //!  use std::time::Duration;
 //!
 //!  struct State {
 //!    clean: bool,
 //!  }
 //!  let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
 //!
 //!  let mut mutex_guard = state.lock().await;
 //!  let mut poison_guard = mutex_guard.check_and_arm()?;
 //!  let state = poison_guard.data_mut();
 //!  state.clean = false;
 //!  // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
 //!  tokio::time::sleep(Duration::from_secs(10)).await;
 //!  state.clean = true;
 //!  poison_guard.disarm();
 //!  # Ok::<(), utils::poison::Error>(())
 //!  # });
 //!  ```
 use tracing::warn;
 /// Wrapper that pairs a value with a poison [`State`] and a static label.
 pub struct Poison<T> {
     // Label used in the poisoning warning and in `Error::Poisoned`.
     what: &'static str,
     // Current poison state; changed by `check_and_arm`, `Guard::disarm` and `Guard`'s `Drop`.
     state: State,
     // The wrapped value, exposed through `Guard::data` / `Guard::data_mut`.
     data: T,
 }
 /// Poison state tracked by [`Poison`].
 #[derive(Clone, Copy)]
 enum State {
     /// Initial state, and the state restored by `Guard::disarm`.
     Clean,
     /// Set by `Poison::check_and_arm` while a `Guard` is outstanding.
     Armed,
     /// Set when a `Guard` is dropped while still armed; `at` is the time of that drop.
     Poisoned { at: chrono::DateTime<chrono::Utc> },
 }
 impl<T> Poison<T> {
    /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
    pub fn new(what: &'static str, data: T) -> Self {
        Self {
            what,
            state: State::Clean,
            data,
        }
    }
    /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
    pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
        match self.state {
            State::Clean => {
                self.state = State::Armed;
                Ok(Guard(self))
            }
            State::Armed => unreachable!("transient state"),
            State::Poisoned { at } => Err(Error::Poisoned {
                what: self.what,
                at,
            }),
        }
    }
 }
 /// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
 /// Once modifications are done, use [`Self::disarm`].
 /// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
 /// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
 pub struct Guard<'a, T>(&'a mut Poison<T>);
 impl<'a, T> Guard<'a, T> {
    pub fn data(&self) -> &T {
        &self.0.data
    }
    pub fn data_mut(&mut self) -> &mut T {
        &mut self.0.data
    }
    pub fn disarm(self) {
        match self.0.state {
            State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
            State::Armed => {
                self.0.state = State::Clean;
            }
            State::Poisoned { at } => {
                unreachable!("we fail check_and_arm() if it's in that state: {at}")
            }
        }
    }
 }
 impl<'a, T> Drop for Guard<'a, T> {
     /// Poison the wrapped state (and log a warning) when the guard is dropped while armed.
     fn drop(&mut self) {
         let poison = &mut *self.0;
         match poison.state {
             // disarm() already reset the state; nothing left to do
             State::Clean => {}
             // never disarmed => record when the state became unusable
             State::Armed => {
                 let at = chrono::Utc::now();
                 poison.state = State::Poisoned { at };
                 warn!(at=?at, "poisoning {}", poison.what);
             }
             State::Poisoned { at } => {
                 unreachable!("we fail check_and_arm() if it's in that state: {at}")
             }
         }
     }
 }
 /// Error returned by [`Poison::check_and_arm`].
 #[derive(thiserror::Error, Debug)]
 pub enum Error {
     /// The state was poisoned by an earlier [`Guard`] that was dropped while still armed.
     #[error("poisoned at {at}: {what}")]
     Poisoned {
         /// Label that was passed to [`Poison::new`].
         what: &'static str,
         /// Time at which the armed [`Guard`] was dropped.
         at: chrono::DateTime<chrono::Utc>,
     },
 }
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -182,18 +182,6 @@ where
        }
    }
    /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
    pub fn would_wait_for(&self, num: V) -> Result<(), V> {
        let internal = self.internal.lock().unwrap();
        let cnt = internal.current.cnt_value();
        drop(internal);
        if cnt >= num {
            Ok(())
        } else {
            Err(cnt)
        }
    }
    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
--- a/libs/utils/src/serde_system_time.rs
+++ b/libs/utils/src/serde_system_time.rs
@@ -1,55 +0,0 @@
 //! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
 /// Newtype around `std::time::SystemTime` whose serde representation is produced by
 /// `ser_rfc3339_millis` and read back by `deser_rfc3339_millis`.
 #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
 #[serde(transparent)]
 pub struct SystemTime(
     // The custom (de)serializers are attached to the single inner field.
     #[serde(
         deserialize_with = "deser_rfc3339_millis",
         serialize_with = "ser_rfc3339_millis"
     )]
     pub std::time::SystemTime,
 );
 /// Serialize `ts` as the string produced by `humantime::format_rfc3339_millis`.
 fn ser_rfc3339_millis<S: serde::ser::Serializer>(
     ts: &std::time::SystemTime,
     serializer: S,
 ) -> Result<S::Ok, S::Error> {
     let formatted = humantime::format_rfc3339_millis(*ts);
     serializer.collect_str(&formatted)
 }
 fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
 where
    D: serde::de::Deserializer<'de>,
 {
    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
 }
 #[cfg(test)]
 mod tests {
     use super::*;
     /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
     /// Times before the UNIX epoch are returned unchanged.
     fn to_millisecond_precision(time: SystemTime) -> SystemTime {
         match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
             Ok(duration) => {
                 let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
                 SystemTime(
                     std::time::SystemTime::UNIX_EPOCH
                         + std::time::Duration::from_millis(total_millis),
                 )
             }
             Err(_) => time,
         }
     }
     /// Serializing yields the quoted RFC3339-millis string, and deserializing that
     /// string yields the input truncated to millisecond precision.
     #[test]
     fn test_serialize_deserialize() {
         let input = SystemTime(std::time::SystemTime::now());
         let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
         let serialized = serde_json::to_string(&input).unwrap();
         assert_eq!(expected_serialized, serialized);
         let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
         assert_eq!(to_millisecond_precision(input), deserialized);
     }
 }
--- a/libs/utils/src/sync/heavier_once_cell.rs
+++ b/libs/utils/src/sync/heavier_once_cell.rs
@@ -110,49 +110,6 @@ impl<T> OnceCell<T> {
        }
    }
     /// Returns a guard to an existing initialized value, or returns a unique initialization
     /// permit which can be used to initialize this `OnceCell` using `OnceCell::set`.
     ///
     /// `Ok(Guard)` means a value is already present; `Err(InitPermit)` means the caller now
     /// holds the initialization permit.
     pub async fn get_or_init_detached(&self) -> Result<Guard<'_, T>, InitPermit> {
         // It looks like OnceCell::get_or_init could be implemented using this method instead of
         // duplication. However, that makes the future be !Send due to possibly holding on to the
         // MutexGuard over an await point.
         loop {
             // Check for a value under the lock; otherwise clone the current semaphore so the
             // lock is released before awaiting on it.
             let sem = {
                 let guard = self.inner.lock().unwrap();
                 if guard.value.is_some() {
                     return Ok(Guard(guard));
                 }
                 guard.init_semaphore.clone()
             };
             {
                 let permit = {
                     // increment the count for the duration of queued
                     let _guard = CountWaitingInitializers::start(self);
                     sem.acquire().await
                 };
                 // Acquire failed: per the assertion below this means the semaphore was closed.
                 let Ok(permit) = permit else {
                     let guard = self.inner.lock().unwrap();
                     if !Arc::ptr_eq(&sem, &guard.init_semaphore) {
                         // there was a take_and_deinit in between
                         continue;
                     }
                     assert!(
                         guard.value.is_some(),
                         "semaphore got closed, must be initialized"
                     );
                     return Ok(Guard(guard));
                 };
                 // Forget the semaphore permit; the `InitPermit` built below represents it instead.
                 permit.forget();
             }
             let permit = InitPermit(sem);
             return Err(permit);
         }
     }
    /// Assuming a permit is held after previous call to [`Guard::take_and_deinit`], it can be used
    /// to complete initializing the inner value.
    ///
@@ -192,14 +149,6 @@ impl<T> OnceCell<T> {
        }
    }
    /// Like [`Guard::take_and_deinit`], but will return `None` if this OnceCell was never
    /// initialized.
    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
        let inner = self.inner.get_mut().unwrap();
        inner.take_and_deinit()
    }
    /// Return the number of [`Self::get_or_init`] calls waiting for initialization to complete.
    pub fn initializer_count(&self) -> usize {
        self.initializers.load(Ordering::Relaxed)
@@ -253,24 +202,16 @@ impl<'a, T> Guard<'a, T> {
    ///
    /// The permit will be on a semaphore part of the new internal value, and any following
    /// [`OnceCell::get_or_init`] will wait on it to complete.
-    pub fn take_and_deinit(mut self) -> (T, InitPermit) {
+    pub fn take_and_deinit(&mut self) -> (T, InitPermit) {
        self.0
            .take_and_deinit()
            .expect("guard is not created unless value has been initialized")
    }
 }
 impl<T> Inner<T> {
    pub fn take_and_deinit(&mut self) -> Option<(T, InitPermit)> {
        let value = self.value.take()?;
        let mut swapped = Inner::default();
        let sem = swapped.init_semaphore.clone();
        // acquire and forget right away, moving the control over to InitPermit
        sem.try_acquire().expect("we just created this").forget();
-        let permit = InitPermit(sem);
+        std::mem::swap(&mut *self.0, &mut swapped);
-        std::mem::swap(self, &mut swapped);
+        swapped
-        Some((value, permit))
+            .value
            .map(|v| (v, InitPermit(sem)))
            .expect("guard is not created unless value has been initialized")
    }
 }
@@ -279,13 +220,6 @@ impl<T> Inner<T> {
 /// On drop, this type will return the permit.
 pub struct InitPermit(Arc<tokio::sync::Semaphore>);
 impl std::fmt::Debug for InitPermit {
     /// Formats as `InitPermit(<address of the shared semaphore>)`.
     fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
         let semaphore_addr = Arc::as_ptr(&self.0) as *const ();
         let mut tuple = f.debug_tuple("InitPermit");
         tuple.field(&semaphore_addr);
         tuple.finish()
     }
 }
 impl Drop for InitPermit {
    fn drop(&mut self) {
        assert_eq!(
@@ -547,57 +481,4 @@ mod tests {
        assert_eq!("t1", *cell.get().unwrap());
    }
 /// Smoke test for `get_or_init_detached`: the detached permit blocks `get_or_init`
 /// until `set` is called, and a permit obtained from `take_and_deinit` can re-initialize the cell.
 #[tokio::test(start_paused = true)]
 async fn detached_init_smoke() {
     let target = OnceCell::default();
     let Err(permit) = target.get_or_init_detached().await else {
         unreachable!("it is not initialized")
     };
     // While we hold the permit, a concurrent get_or_init cannot make progress.
     tokio::time::timeout(
         std::time::Duration::from_secs(3600 * 24 * 7 * 365),
         target.get_or_init(|permit2| async { Ok::<_, Infallible>((11, permit2)) }),
     )
     .await
     .expect_err("should timeout since we are already holding the permit");
     target.set(42, permit);
     // The value set through the detached permit wins over the initializer closure.
     let (_answer, permit) = {
         let guard = target
             .get_or_init(|permit| async { Ok::<_, Infallible>((11, permit)) })
             .await
             .unwrap();
         assert_eq!(*guard, 42);
         guard.take_and_deinit()
     };
     assert!(target.get().is_none());
     target.set(11, permit);
     assert_eq!(*target.get().unwrap(), 11);
 }
 /// `OnceCell::take_and_deinit` returns `None` before initialization, the value after
 /// initialization, and `None` again once the value has been taken.
 #[tokio::test]
 async fn take_and_deinit_on_mut() {
     use std::convert::Infallible;
     let mut target = OnceCell::<u32>::default();
     assert!(target.take_and_deinit().is_none());
     target
         .get_or_init(|permit| async move { Ok::<_, Infallible>((42, permit)) })
         .await
         .unwrap();
     let again = target.take_and_deinit();
     assert!(matches!(again, Some((42, _))), "{again:?}");
     assert!(target.take_and_deinit().is_none());
 }
 }
--- a/libs/utils/src/vec_map.rs
+++ b/libs/utils/src/vec_map.rs
@@ -1,60 +1,27 @@
 use std::{alloc::Layout, cmp::Ordering, ops::RangeBounds};
 /// Ordering constraint a `VecMap` checks for each new key against its current last key.
 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
 pub enum VecMapOrdering {
     /// A new key must be strictly greater than the last key.
     Greater,
     /// A new key must be greater than or equal to the last key.
     GreaterOrEqual,
 }
 /// Ordered map datastructure implemented in a Vec.
 /// Append only - can only add keys that are larger than the
 /// current max key.
 /// Ordering can be adjusted using [`VecMapOrdering`]
 /// during `VecMap` construction.
 #[derive(Clone, Debug)]
-pub struct VecMap<K, V> {
+pub struct VecMap<K, V>(Vec<(K, V)>);
    data: Vec<(K, V)>,
    ordering: VecMapOrdering,
 }
 impl<K, V> Default for VecMap<K, V> {
    fn default() -> Self {
-        VecMap {
+        VecMap(Default::default())
            data: Default::default(),
            ordering: VecMapOrdering::Greater,
        }
    }
 }
-#[derive(thiserror::Error, Debug)]
+#[derive(Debug)]
-pub enum VecMapError {
+pub struct InvalidKey;
    #[error("Key violates ordering constraint")]
    InvalidKey,
    #[error("Mismatched ordering constraints")]
    ExtendOrderingError,
 }
 impl<K: Ord, V> VecMap<K, V> {
    pub fn new(ordering: VecMapOrdering) -> Self {
        Self {
            data: Vec::new(),
            ordering,
        }
    }
    pub fn with_capacity(capacity: usize, ordering: VecMapOrdering) -> Self {
        Self {
            data: Vec::with_capacity(capacity),
            ordering,
        }
    }
    pub fn is_empty(&self) -> bool {
-        self.data.is_empty()
+        self.0.is_empty()
    }
    pub fn as_slice(&self) -> &[(K, V)] {
-        self.data.as_slice()
+        self.0.as_slice()
    }
    /// This function may panic if given a range where the lower bound is
@@ -62,7 +29,7 @@ impl<K: Ord, V> VecMap<K, V> {
    pub fn slice_range<R: RangeBounds<K>>(&self, range: R) -> &[(K, V)] {
        use std::ops::Bound::*;
-        let binary_search = |k: &K| self.data.binary_search_by_key(&k, extract_key);
+        let binary_search = |k: &K| self.0.binary_search_by_key(&k, extract_key);
        let start_idx = match range.start_bound() {
            Unbounded => 0,
@@ -74,7 +41,7 @@ impl<K: Ord, V> VecMap<K, V> {
        };
        let end_idx = match range.end_bound() {
-            Unbounded => self.data.len(),
+            Unbounded => self.0.len(),
            Included(k) => match binary_search(k) {
                Ok(idx) => idx + 1,
                Err(idx) => idx,
@@ -82,30 +49,34 @@ impl<K: Ord, V> VecMap<K, V> {
            Excluded(k) => binary_search(k).unwrap_or_else(std::convert::identity),
        };
-        &self.data[start_idx..end_idx]
+        &self.0[start_idx..end_idx]
    }
    /// Add a key value pair to the map.
-    /// If `key` is not respective of the `self` ordering the
+    /// If `key` is less than or equal to the current maximum key
-    /// pair will not be added and `InvalidKey` error will be returned.
+    /// the pair will not be added and InvalidKey error will be returned.
-    pub fn append(&mut self, key: K, value: V) -> Result<usize, VecMapError> {
+    pub fn append(&mut self, key: K, value: V) -> Result<usize, InvalidKey> {
-        self.validate_key_order(&key)?;
+        if let Some((last_key, _last_value)) = self.0.last() {
            if &key <= last_key {
                return Err(InvalidKey);
            }
        }
        let delta_size = self.instrument_vec_op(|vec| vec.push((key, value)));
        Ok(delta_size)
    }
    /// Update the maximum key value pair or add a new key value pair to the map.
-    /// If `key` is not respective of the `self` ordering no updates or additions
+    /// If `key` is less than the current maximum key no updates or additions
-    /// will occur and `InvalidKey` error will be returned.
+    /// will occur and InvalidKey error will be returned.
    pub fn append_or_update_last(
        &mut self,
        key: K,
        mut value: V,
-    ) -> Result<(Option<V>, usize), VecMapError> {
+    ) -> Result<(Option<V>, usize), InvalidKey> {
-        if let Some((last_key, last_value)) = self.data.last_mut() {
+        if let Some((last_key, last_value)) = self.0.last_mut() {
            match key.cmp(last_key) {
-                Ordering::Less => return Err(VecMapError::InvalidKey),
+                Ordering::Less => return Err(InvalidKey),
                Ordering::Equal => {
                    std::mem::swap(last_value, &mut value);
                    const DELTA_SIZE: usize = 0;
@@ -129,67 +100,40 @@ impl<K: Ord, V> VecMap<K, V> {
        V: Clone,
    {
        let split_idx = self
-            .data
+            .0
            .binary_search_by_key(&cutoff, extract_key)
            .unwrap_or_else(std::convert::identity);
        (
-            VecMap {
+            VecMap(self.0[..split_idx].to_vec()),
-                data: self.data[..split_idx].to_vec(),
+            VecMap(self.0[split_idx..].to_vec()),
                ordering: self.ordering,
            },
            VecMap {
                data: self.data[split_idx..].to_vec(),
                ordering: self.ordering,
            },
        )
    }
    /// Move items from `other` to the end of `self`, leaving `other` empty.
-    /// If the `other` ordering is different from `self` ordering
+    /// If any keys in `other` is less than or equal to any key in `self`,
-    /// `ExtendOrderingError` error will be returned.
+    /// `InvalidKey` error will be returned and no mutation will occur.
-    /// If any keys in `other` is not respective of the ordering defined in
+    pub fn extend(&mut self, other: &mut Self) -> Result<usize, InvalidKey> {
-    /// `self`, `InvalidKey` error will be returned and no mutation will occur.
+        let self_last_opt = self.0.last().map(extract_key);
-    pub fn extend(&mut self, other: &mut Self) -> Result<usize, VecMapError> {
+        let other_first_opt = other.0.last().map(extract_key);
        if self.ordering != other.ordering {
            return Err(VecMapError::ExtendOrderingError);
        }
-        let other_first_opt = other.data.last().map(extract_key);
+        if let (Some(self_last), Some(other_first)) = (self_last_opt, other_first_opt) {
-        if let Some(other_first) = other_first_opt {
+            if self_last >= other_first {
-            self.validate_key_order(other_first)?;
+                return Err(InvalidKey);
        }
        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.data));
        Ok(delta_size)
    }
    /// Validate the current last key in `self` and key being
    /// inserted against the order defined in `self`.
    fn validate_key_order(&self, key: &K) -> Result<(), VecMapError> {
        if let Some(last_key) = self.data.last().map(extract_key) {
            match (&self.ordering, &key.cmp(last_key)) {
                (VecMapOrdering::Greater, Ordering::Less | Ordering::Equal) => {
                    return Err(VecMapError::InvalidKey);
                }
                (VecMapOrdering::Greater, Ordering::Greater) => {}
                (VecMapOrdering::GreaterOrEqual, Ordering::Less) => {
                    return Err(VecMapError::InvalidKey);
                }
                (VecMapOrdering::GreaterOrEqual, Ordering::Equal | Ordering::Greater) => {}
            }
        }
-        Ok(())
+        let delta_size = self.instrument_vec_op(|vec| vec.append(&mut other.0));
        Ok(delta_size)
    }
    /// Instrument an operation on the underlying [`Vec`].
    /// Will panic if the operation decreases capacity.
    /// Returns the increase in memory usage caused by the op.
    fn instrument_vec_op(&mut self, op: impl FnOnce(&mut Vec<(K, V)>)) -> usize {
-        let old_cap = self.data.capacity();
+        let old_cap = self.0.capacity();
-        op(&mut self.data);
+        op(&mut self.0);
-        let new_cap = self.data.capacity();
+        let new_cap = self.0.capacity();
        match old_cap.cmp(&new_cap) {
            Ordering::Less => {
@@ -201,36 +145,6 @@ impl<K: Ord, V> VecMap<K, V> {
            Ordering::Greater => panic!("VecMap capacity shouldn't ever decrease"),
        }
    }
    /// Similar to `from_iter` defined in `FromIter` trait except
    /// that it accepts an [`VecMapOrdering`]
    pub fn from_iter<I: IntoIterator<Item = (K, V)>>(iter: I, ordering: VecMapOrdering) -> Self {
        let iter = iter.into_iter();
        let initial_capacity = {
            match iter.size_hint() {
                (lower_bound, None) => lower_bound,
                (_, Some(upper_bound)) => upper_bound,
            }
        };
        let mut vec_map = VecMap::with_capacity(initial_capacity, ordering);
        for (key, value) in iter {
            vec_map
                .append(key, value)
                .expect("The passed collection needs to be sorted!");
        }
        vec_map
    }
 }
 /// Consuming iteration over the entries, in the order they are stored in the underlying `Vec`.
 impl<K: Ord, V> IntoIterator for VecMap<K, V> {
     type Item = (K, V);
     type IntoIter = std::vec::IntoIter<(K, V)>;
     fn into_iter(self) -> Self::IntoIter {
         // Hand out the backing Vec's own iterator; the ordering tag is discarded.
         self.data.into_iter()
     }
 }
 fn extract_key<K, V>(entry: &(K, V)) -> &K {
@@ -241,7 +155,7 @@ fn extract_key<K, V>(entry: &(K, V)) -> &K {
 mod tests {
    use std::{collections::BTreeMap, ops::Bound};
-    use super::{VecMap, VecMapOrdering};
+    use super::VecMap;
    #[test]
    fn unbounded_range() {
@@ -396,59 +310,5 @@ mod tests {
        left.extend(&mut one_map).unwrap_err();
        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
        assert_eq!(one_map.as_slice(), &[(1, ())]);
        let mut map_greater_or_equal = VecMap::new(VecMapOrdering::GreaterOrEqual);
        map_greater_or_equal.append(2, ()).unwrap();
        map_greater_or_equal.append(2, ()).unwrap();
        left.extend(&mut map_greater_or_equal).unwrap_err();
        assert_eq!(left.as_slice(), &[(0, ()), (1, ())]);
        assert_eq!(map_greater_or_equal.as_slice(), &[(2, ()), (2, ())]);
    }
    #[test]
    fn extend_with_ordering() {
        // The map being extended is created with the `GreaterOrEqual` ordering.
        let mut target = VecMap::new(VecMapOrdering::GreaterOrEqual);
        target.append(0, ()).unwrap();
        assert_eq!(target.as_slice(), &[(0, ())]);
        // Extending from a `Greater` map holding key 0 must fail and leave
        // the target untouched.
        let mut strict_source = VecMap::new(VecMapOrdering::Greater);
        strict_source.append(0, ()).unwrap();
        target.extend(&mut strict_source).unwrap_err();
        assert_eq!(target.as_slice(), &[(0, ())]);
        // Extending from a `GreaterOrEqual` map holding key 2 twice must
        // succeed and append both entries.
        let mut lenient_source = VecMap::new(VecMapOrdering::GreaterOrEqual);
        lenient_source.append(2, ()).unwrap();
        lenient_source.append(2, ()).unwrap();
        target.extend(&mut lenient_source).unwrap();
        assert_eq!(target.as_slice(), &[(0, ()), (2, ()), (2, ())]);
    }
    #[test]
    fn vec_map_from_sorted() {
        // Strictly increasing keys are accepted under `Greater`.
        let strictly_sorted = vec![(1, ()), (2, ()), (3, ()), (6, ())];
        let strict_map = VecMap::from_iter(strictly_sorted.clone(), VecMapOrdering::Greater);
        assert_eq!(strict_map.as_slice(), strictly_sorted.as_slice());
        // Repeated keys are accepted under `GreaterOrEqual`.
        let sorted_with_repeats = vec![(1, ()), (2, ()), (3, ()), (3, ()), (6, ()), (6, ())];
        let lenient_map =
            VecMap::from_iter(sorted_with_repeats.clone(), VecMapOrdering::GreaterOrEqual);
        assert_eq!(lenient_map.as_slice(), sorted_with_repeats.as_slice());
    }
    #[test]
    // Pin the panic to the message used by `VecMap::from_iter`, so that an
    // unrelated panic cannot make this test pass by accident.
    #[should_panic(expected = "The passed collection needs to be sorted!")]
    fn vec_map_from_unsorted_greater() {
        // The input repeats key 2 while the `Greater` ordering is requested;
        // construction is expected to panic.
        let vec = vec![(1, ()), (2, ()), (2, ()), (3, ()), (6, ())];
        let _ = VecMap::from_iter(vec, VecMapOrdering::Greater);
    }
    #[test]
    // Pin the panic to the message used by `VecMap::from_iter`, so that an
    // unrelated panic cannot make this test pass by accident.
    #[should_panic(expected = "The passed collection needs to be sorted!")]
    fn vec_map_from_unsorted_greater_or_equal() {
        // The last key (5) is smaller than its predecessor (6);
        // construction is expected to panic.
        let vec = vec![(1, ()), (2, ()), (3, ()), (6, ()), (5, ())];
        let _ = VecMap::from_iter(vec, VecMapOrdering::GreaterOrEqual);
    }
 }
--- a/libs/utils/src/zstd.rs
+++ b/libs/utils/src/zstd.rs
@@ -1,78 +0,0 @@
 use std::io::SeekFrom;
 use anyhow::{Context, Result};
 use async_compression::{
    tokio::{bufread::ZstdDecoder, write::ZstdEncoder},
    zstd::CParameter,
    Level,
 };
 use camino::Utf8Path;
 use nix::NixPath;
 use tokio::{
    fs::{File, OpenOptions},
    io::AsyncBufRead,
    io::AsyncSeekExt,
    io::AsyncWriteExt,
 };
 use tokio_tar::{Archive, Builder, HeaderMode};
 use walkdir::WalkDir;
 /// Creates a Zstandard-compressed tarball of the directory tree rooted at `path`.
 ///
 /// The archive is written to `tarball`, which is created if missing and
 /// truncated otherwise. Regular files and directories (so that empty
 /// directories are kept) are added under their path relative to `path`, in
 /// sorted order and with deterministic tar headers; other entries are skipped.
 ///
 /// Returns the open archive file, rewound to offset 0, together with its
 /// length in bytes.
 ///
 /// # Errors
 ///
 /// Fails if the tarball cannot be opened, if walking the tree or reading an
 /// entry's metadata fails, or if writing or finishing the archive fails.
 pub async fn create_zst_tarball(path: &Utf8Path, tarball: &Utf8Path) -> Result<(File, u64)> {
    let file = OpenOptions::new()
        .create(true)
        .truncate(true)
        .read(true)
        .write(true)
        .open(tarball)
        .await
        .with_context(|| format!("tempfile creation {tarball}"))?;
    let mut paths = Vec::new();
    for entry in WalkDir::new(path) {
        let entry = entry?;
        // Report metadata failures to the caller instead of panicking: this
        // function already returns a `Result`.
        let metadata = entry
            .metadata()
            .with_context(|| format!("getting metadata of {}", entry.path().display()))?;
        // Also allow directories so that we also get empty directories
        if !(metadata.is_file() || metadata.is_dir()) {
            continue;
        }
        paths.push(entry.into_path());
    }
    // Do a sort to get a more consistent listing
    paths.sort_unstable();
    let zstd = ZstdEncoder::with_quality_and_params(
        file,
        Level::Default,
        &[CParameter::enable_long_distance_matching(true)],
    );
    let mut builder = Builder::new(zstd);
    // Use reproducible header mode
    builder.mode(HeaderMode::Deterministic);
    for p in paths {
        let rel_path = p.strip_prefix(path)?;
        if rel_path.is_empty() {
            // The top directory should not be compressed,
            // the tar crate doesn't like that
            continue;
        }
        builder.append_path_with_name(&p, rel_path).await?;
    }
    // Finish the tar stream first, then shut the encoder down, before
    // measuring the resulting file.
    let mut zstd = builder.into_inner().await?;
    zstd.shutdown().await?;
    let mut compressed = zstd.into_inner();
    let compressed_len = compressed.metadata().await?.len();
    // Rewind so the caller can read the archive from its beginning.
    compressed.seek(SeekFrom::Start(0)).await?;
    Ok((compressed, compressed_len))
 }
 /// Extracts a Zstandard-compressed tarball.
 ///
 /// Reads the compressed stream from `tarball`, decodes it and unpacks the
 /// contained tar archive into `path`. Any unpacking error is returned.
 pub async fn extract_zst_tarball(
    path: &Utf8Path,
    tarball: impl AsyncBufRead + Unpin,
 ) -> Result<()> {
    let decompressed = Box::pin(ZstdDecoder::new(tarball));
    let mut tar = Archive::new(decompressed);
    tar.unpack(path).await?;
    Ok(())
 }
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -69,7 +69,7 @@ pub struct Config {
    /// should be removed once we have a better solution there.
    sys_buffer_bytes: u64,
-    /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
+    /// Minimum fraction of total system memory reserved *before* the cgroup threshold; in
    /// other words, providing a ceiling for the highest value of the threshold by enforcing that
    /// there's at least `cgroup_min_overhead_fraction` of the total memory remaining beyond the
    /// threshold.
--- a/Show More
+++ b/Show More
Author	SHA1	Message	Date
Konstantin Knizhnik	70d1086e0f	Prepare for first stage of deployment: do not bump format version and do not write data in new format but recognoze new format	2024-03-15 10:02:51 +02:00
Konstantin Knizhnik	5a8e8baf9f	Make ruff happy	2024-03-14 18:05:30 +02:00
Konstantin Knizhnik	57a4119a7b	Add test for compression	2024-03-14 16:45:45 +02:00
Konstantin Knizhnik	aaef3789b0	Ignore format version when comparing summary for delta_layer	2024-03-14 14:21:35 +02:00
Konstantin Knizhnik	0b57e0b8f2	Fix image layer format version matching	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	485ecbaf8f	Fix test_attach_tenant_config.py test	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	0bcbce197a	Fix test_attach_tenent_config.py test	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	19d59e58d2	Use CompressionAlgorithm enum	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	ce65d13dbd	Add compress_image_layer to openapi spec	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	18fefff026	Fix compressed blob writer	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	2a69861896	Fix parse_tenant_config test	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	98375b3896	Support vectored comp[ressed blobs read	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	8c60359ae5	Emable iomage layer compression by default	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	8c7136b057	Add compress_image_layer property to TenantConfig	2024-03-14 08:33:37 +02:00
Konstantin Knizhnik	0df6c41eaa	Compress image layer	2024-03-14 08:33:37 +02:00