Merge pull request #7284 from neondatabase/rc/2024-04-01

Release 2024-04-01
Merge pull request #7263 from neondatabase/rc/2024-03-27
2026-01-22 21:02:56 +00:00 · 2024-04-02 18:15:28 +03:00 · 2024-03-27 14:52:38 -04:00 · 2024-03-26 15:17:00 +00:00 · 2024-03-25 12:28:09 +00:00 · 2024-03-19 12:07:14 +01:00
323 changed files with 10040 additions and 21376 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -22,7 +22,6 @@
 !s3_scrubber/
 !safekeeper/
 !storage_broker/
-!storage_controller/
 !trace/
 !vendor/postgres-*/
 !workspace_hack/
--- a/.github/actions/allure-report-generate/action.yml
+++ b/.github/actions/allure-report-generate/action.yml
@@ -150,7 +150,7 @@ runs:

        # Use aws s3 cp (instead of aws s3 sync) to keep files from previous runs to make old URLs work,
        # and to keep files on the host to upload them to the database
-        time s5cmd --log error cp "${WORKDIR}/report/*" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}/"
+        time aws s3 cp --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"

        # Generate redirect
        cat <<EOF > ${WORKDIR}/index.html
--- a/.github/actions/neon-branch-create/action.yml
+++ b/.github/actions/neon-branch-create/action.yml
@@ -10,7 +10,7 @@ inputs:
    required: true
  api_host:
    desctiption: 'Neon API host'
-    default: console-stage.neon.build
+    default: console.stage.neon.tech
 outputs:
  dsn:
    description: 'Created Branch DSN (for main database)'
--- a/.github/actions/neon-branch-delete/action.yml
+++ b/.github/actions/neon-branch-delete/action.yml
@@ -13,7 +13,7 @@ inputs:
    required: true
  api_host:
    desctiption: 'Neon API host'
-    default: console-stage.neon.build
+    default: console.stage.neon.tech

 runs:
  using: "composite"
--- a/.github/actions/neon-project-create/action.yml
+++ b/.github/actions/neon-project-create/action.yml
@@ -13,7 +13,7 @@ inputs:
    default: 15
  api_host:
    desctiption: 'Neon API host'
-    default: console-stage.neon.build
+    default: console.stage.neon.tech
  provisioner:
    desctiption: 'k8s-pod or k8s-neonvm'
    default: 'k8s-pod'
--- a/.github/actions/neon-project-delete/action.yml
+++ b/.github/actions/neon-project-delete/action.yml
@@ -10,7 +10,7 @@ inputs:
    required: true
  api_host:
    desctiption: 'Neon API host'
-    default: console-stage.neon.build
+    default: console.stage.neon.tech

 runs:
  using: "composite"
--- a/.github/workflows/approved-for-ci-run.yml
+++ b/.github/workflows/approved-for-ci-run.yml
@@ -18,7 +18,6 @@ on:

 concurrency:
  group: ${{ github.workflow }}-${{ github.event.pull_request.number }}
-  cancel-in-progress: false

 env:
  GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -147,16 +147,15 @@ jobs:
            "neonvm-captest-new"
          ],
          "db_size": [ "10gb" ],
-          "include": [{ "platform": "neon-captest-freetier",         "db_size": "3gb"  },
-                      { "platform": "neon-captest-new",              "db_size": "50gb" },
-                      { "platform": "neonvm-captest-freetier",       "db_size": "3gb"  },
-                      { "platform": "neonvm-captest-new",            "db_size": "50gb" },
-                      { "platform": "neonvm-captest-sharding-reuse", "db_size": "50gb" }]
+          "include": [{ "platform": "neon-captest-freetier",   "db_size": "3gb"  },
+                      { "platform": "neon-captest-new",        "db_size": "50gb" },
+                      { "platform": "neonvm-captest-freetier", "db_size": "3gb"  },
+                      { "platform": "neonvm-captest-new",      "db_size": "50gb" }]
        }'

        if [ "$(date +%A)" = "Saturday" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "db_size": "10gb"},
-                                                     { "platform": "rds-aurora",   "db_size": "50gb"}]')
+                                                   { "platform": "rds-aurora",   "db_size": "50gb"}]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -172,7 +171,7 @@ jobs:

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres" },
-                                                     { "platform": "rds-aurora"   }]')
+                                                   { "platform": "rds-aurora"   }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -191,7 +190,7 @@ jobs:

        if [ "$(date +%A)" = "Saturday" ] || [ ${RUN_AWS_RDS_AND_AURORA} = "true" ]; then
          matrix=$(echo "$matrix" | jq '.include += [{ "platform": "rds-postgres", "scale": "10" },
-                                                     { "platform": "rds-aurora",   "scale": "10" }]')
+                                                    { "platform": "rds-aurora",   "scale": "10" }]')
        fi

        echo "matrix=$(echo "$matrix" | jq --compact-output '.')" >> $GITHUB_OUTPUT
@@ -254,9 +253,6 @@ jobs:
          neon-captest-reuse)
            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_CONNSTR }}
            ;;
-          neonvm-captest-sharding-reuse)
-            CONNSTR=${{ secrets.BENCHMARK_CAPTEST_SHARDING_CONNSTR }}
-            ;;
          neon-captest-new | neon-captest-freetier | neonvm-captest-new | neonvm-captest-freetier)
            CONNSTR=${{ steps.create-neon-project.outputs.dsn }}
            ;;
@@ -274,15 +270,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Benchmark init
      uses: ./.github/actions/run-python-test-set
@@ -409,15 +401,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: ClickBench benchmark
      uses: ./.github/actions/run-python-test-set
@@ -519,15 +507,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run TPC-H benchmark
      uses: ./.github/actions/run-python-test-set
@@ -613,15 +597,11 @@ jobs:

        echo "connstr=${CONNSTR}" >> $GITHUB_OUTPUT

-        QUERIES=("SELECT version()")
+        QUERY="SELECT version();"
        if [[ "${PLATFORM}" = "neon"* ]]; then
-          QUERIES+=("SHOW neon.tenant_id")
-          QUERIES+=("SHOW neon.timeline_id")
+          QUERY="${QUERY} SHOW neon.tenant_id; SHOW neon.timeline_id;"
        fi
-
-        for q in "${QUERIES[@]}"; do
-          psql ${CONNSTR} -c "${q}"
-        done
+        psql ${CONNSTR} -c "${QUERY}"

    - name: Run user examples
      uses: ./.github/actions/run-python-test-set
--- a/.github/workflows/build-build-tools-image.yml
+++ b/.github/workflows/build-build-tools-image.yml
@@ -21,7 +21,6 @@ defaults:

 concurrency:
  group: build-build-tools-image-${{ inputs.image-tag }}
-  cancel-in-progress: false

 # No permission for GITHUB_TOKEN by default; the **minimal required** set of permissions should be granted in each job.
 permissions: {}
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -477,8 +477,6 @@ jobs:
          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
          PAGESERVER_GET_VECTORED_IMPL: vectored
-          PAGESERVER_GET_IMPL: vectored
-          PAGESERVER_VALIDATE_VEC_GET: true

      # Temporary disable this step until we figure out why it's so flaky
      # Ref https://github.com/neondatabase/neon/issues/4540
@@ -558,9 +556,6 @@ jobs:
          PERF_TEST_RESULT_CONNSTR: "${{ secrets.PERF_TEST_RESULT_CONNSTR }}"
          TEST_RESULT_CONNSTR: "${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}"
          PAGESERVER_VIRTUAL_FILE_IO_ENGINE: tokio-epoll-uring
-          PAGESERVER_GET_VECTORED_IMPL: vectored
-          PAGESERVER_GET_IMPL: vectored
-          PAGESERVER_VALIDATE_VEC_GET: false
      # XXX: no coverage data handling here, since benchmarks are run on release builds,
      # while coverage is currently collected for the debug ones

@@ -740,7 +735,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v2
+      - uses: docker/setup-buildx-action@v3

      - uses: docker/login-action@v3
        with:
@@ -797,7 +792,7 @@ jobs:
        run: |
          mkdir -p .docker-custom
          echo DOCKER_CONFIG=$(pwd)/.docker-custom >> $GITHUB_ENV
-      - uses: docker/setup-buildx-action@v2
+      - uses: docker/setup-buildx-action@v3
        with:
          # Disable parallelism for docker buildkit.
          # As we already build everything with `make -j$(nproc)`, running it in additional level of parallelisam blows up the Runner.
@@ -870,7 +865,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.28.1
+      VM_BUILDER_VERSION: v0.23.2

    steps:
      - name: Checkout
@@ -1138,6 +1133,8 @@ jobs:
              -f deployPreprodRegion=true

            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main \
+              -f deployPgSniRouter=false \
+              -f deployProxy=false \
              -f deployStorage=true \
              -f deployStorageBroker=true \
              -f deployStorageController=true \
--- a/.github/workflows/check-build-tools-image.yml
+++ b/.github/workflows/check-build-tools-image.yml
@@ -28,9 +28,7 @@ jobs:
      - name: Get build-tools image tag for the current commit
        id: get-build-tools-tag
        env:
-          # Usually, for COMMIT_SHA, we use `github.event.pull_request.head.sha || github.sha`, but here, even for PRs,
-          # we want to use `github.sha` i.e. point to a phantom merge commit to determine the image tag correctly.
-          COMMIT_SHA: ${{ github.sha }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          LAST_BUILD_TOOLS_SHA=$(
--- a/.github/workflows/pin-build-tools-image.yml
+++ b/.github/workflows/pin-build-tools-image.yml
@@ -20,7 +20,6 @@ defaults:

 concurrency:
  group: pin-build-tools-image-${{ inputs.from-tag }}
-  cancel-in-progress: false

 permissions: {}

--- a/2
+++ b/2
@@ -1,5 +1,5 @@
 /compute_tools/ @neondatabase/control-plane @neondatabase/compute
-/storage_controller @neondatabase/storage
+/control_plane/attachment_service @neondatabase/storage
 /libs/pageserver_api/ @neondatabase/storage
 /libs/postgres_ffi/ @neondatabase/compute @neondatabase/safekeepers
 /libs/remote_storage/ @neondatabase/storage
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,7 +3,7 @@ resolver = "2"
 members = [
    "compute_tools",
    "control_plane",
-    "control_plane/storcon_cli",
+    "control_plane/attachment_service",
    "pageserver",
    "pageserver/compaction",
    "pageserver/ctl",
@@ -12,7 +12,6 @@ members = [
    "proxy",
    "safekeeper",
    "storage_broker",
-    "storage_controller",
    "s3_scrubber",
    "workspace_hack",
    "trace",
@@ -44,11 +43,10 @@ license = "Apache-2.0"
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
-atomic-take = "1.1.0"
-azure_core = "0.19"
-azure_identity = "0.19"
-azure_storage = "0.19"
-azure_storage_blobs = "0.19"
+azure_core = "0.18"
+azure_identity = "0.18"
+azure_storage = "0.18"
+azure_storage_blobs = "0.18"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
@@ -98,7 +96,7 @@ http-types = { version = "2", default-features = false }
 humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
-hyper-tungstenite = "0.13.0"
+hyper-tungstenite = "0.11"
 inotify = "0.10.2"
 ipnet = "2.9.0"
 itertools = "0.10"
@@ -107,8 +105,7 @@ lasso = "0.7"
 leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
-measured = { version = "0.0.21", features=["lasso"] }
-measured-process = { version = "0.0.21" }
+measured = { version = "0.0.13", features=["default", "lasso"] }
 memoffset = "0.8"
 native-tls = "0.2"
 nix = { version = "0.27", features = ["fs", "process", "socket", "signal", "poll"] }
@@ -130,10 +127,10 @@ prost = "0.11"
 rand = "0.8"
 redis = { version = "0.25.2", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
-reqwest = { version = "0.12", default-features = false, features = ["rustls-tls"] }
-reqwest-tracing = { version = "0.5", features = ["opentelemetry_0_20"] }
-reqwest-middleware = "0.3.0"
-reqwest-retry = "0.5"
+reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
+reqwest-tracing = { version = "0.4.7", features = ["opentelemetry_0_20"] }
+reqwest-middleware = "0.2.0"
+reqwest-retry = "0.2.2"
 routerify = "3"
 rpds = "0.13"
 rustc-hash = "1.1.0"
@@ -143,7 +140,7 @@ rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
 sd-notify = "0.4.1"
-sentry = { version = "0.32", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
+sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
 serde_path_to_error = "0.1"
@@ -157,12 +154,11 @@ socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
 "subtle"  = "2.5.0"
-# https://github.com/nical/rust_debug/pull/4
-svg_fmt = { git = "https://github.com/neondatabase/fork--nical--rust_debug", branch = "neon" }
+svg_fmt = "0.4.1"
 sync_wrapper = "0.1.2"
 tar = "0.4"
 task-local-extensions = "0.1.4"
-test-context = "0.3"
+test-context = "0.1"
 thiserror = "1.0"
 tikv-jemallocator = "0.5"
 tikv-jemalloc-ctl = "0.5"
@@ -177,11 +173,10 @@ tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
-tower-service = "0.3.2"
 tracing = "0.1"
 tracing-error = "0.2.0"
-tracing-opentelemetry = "0.21.0"
-tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json", "ansi"] }
+tracing-opentelemetry = "0.20.0"
+tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 twox-hash = { version = "1.6.3", default-features = false }
 url = "2.2"
 urlencoding = "2.1"
--- a/Dockerfile.build-tools
+++ b/Dockerfile.build-tools
@@ -58,14 +58,8 @@ RUN curl -fsSL "https://github.com/protocolbuffers/protobuf/releases/download/v$
    && mv protoc/include/google /usr/local/include/google \
    && rm -rf protoc.zip protoc

-# s5cmd
-ENV S5CMD_VERSION=2.2.2
-RUN curl -sL "https://github.com/peak/s5cmd/releases/download/v${S5CMD_VERSION}/s5cmd_${S5CMD_VERSION}_Linux-$(uname -m | sed 's/x86_64/64bit/g' | sed 's/aarch64/arm64/g').tar.gz" | tar zxvf - s5cmd \
-    && chmod +x s5cmd \
-    && mv s5cmd /usr/local/bin/s5cmd
-
 # LLVM
-ENV LLVM_VERSION=18
+ENV LLVM_VERSION=17
 RUN curl -fsSL 'https://apt.llvm.org/llvm-snapshot.gpg.key' | apt-key add - \
    && echo "deb http://apt.llvm.org/bullseye/ llvm-toolchain-bullseye-${LLVM_VERSION} main" > /etc/apt/sources.list.d/llvm.stable.list \
    && apt update \
@@ -141,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.78.0
+ENV RUSTC_VERSION=1.77.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/18
+++ b/18
@@ -25,16 +25,14 @@ ifeq ($(UNAME_S),Linux)
 	# Seccomp BPF is only available for Linux
 	PG_CONFIGURE_OPTS += --with-libseccomp
 else ifeq ($(UNAME_S),Darwin)
-	ifndef DISABLE_HOMEBREW
-		# macOS with brew-installed openssl requires explicit paths
-		# It can be configured with OPENSSL_PREFIX variable
-		OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
-		PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
-		PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
-		# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
-		# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
-		EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
-	endif
+	# macOS with brew-installed openssl requires explicit paths
+	# It can be configured with OPENSSL_PREFIX variable
+	OPENSSL_PREFIX ?= $(shell brew --prefix openssl@3)
+	PG_CONFIGURE_OPTS += --with-includes=$(OPENSSL_PREFIX)/include --with-libraries=$(OPENSSL_PREFIX)/lib
+	PG_CONFIGURE_OPTS += PKG_CONFIG_PATH=$(shell brew --prefix icu4c)/lib/pkgconfig
+	# macOS already has bison and flex in the system, but they are old and result in postgres-v14 target failure
+	# brew formulae are keg-only and not symlinked into HOMEBREW_PREFIX, force their usage
+	EXTRA_PATH_OVERRIDES += $(shell brew --prefix bison)/bin/:$(shell brew --prefix flex)/bin/:
 endif

 # Use -C option so that when PostgreSQL "make install" installs the
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -47,7 +47,7 @@ use chrono::Utc;
 use clap::Arg;
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
-use tracing::{error, info, warn};
+use tracing::{error, info};
 use url::Url;

 use compute_api::responses::ComputeStatus;
@@ -62,7 +62,6 @@ use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
 use compute_tools::spec::*;
-use compute_tools::swap::resize_swap;

 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
@@ -111,7 +110,6 @@ fn main() -> Result<()> {
        .expect("Postgres connection string is required");
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");
-    let resize_swap_on_bind = matches.get_flag("resize-swap-on-bind");

    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
@@ -228,14 +226,14 @@ fn main() -> Result<()> {

    // If this is a pooled VM, prewarm before starting HTTP server and becoming
    // available for binding. Prewarming helps Postgres start quicker later,
-    // because QEMU will already have its memory allocated from the host, and
+    // because QEMU will already have its memory allocated from the host, and
    // the necessary binaries will already be cached.
    if !spec_set {
        compute.prewarm_postgres()?;
    }

-    // Launch http service first, so that we can serve control-plane requests
-    // while configuration is still in progress.
+    // Launch http service first, so that we can serve control-plane
+    // requests while configuration is still in progress.
    let _http_handle =
        launch_http_server(http_port, &compute).expect("cannot launch http endpoint thread");

@@ -255,22 +253,21 @@ fn main() -> Result<()> {
                break;
            }
        }
-
-        // Record for how long we slept waiting for the spec.
-        let now = Utc::now();
-        state.metrics.wait_for_spec_ms = now
-            .signed_duration_since(state.start_time)
-            .to_std()
-            .unwrap()
-            .as_millis() as u64;
-
-        // Reset start time, so that the total startup time that is calculated later will
-        // not include the time that we waited for the spec.
-        state.start_time = now;
    }

    // We got all we need, update the state.
    let mut state = compute.state.lock().unwrap();
+
+    // Record for how long we slept waiting for the spec.
+    state.metrics.wait_for_spec_ms = Utc::now()
+        .signed_duration_since(state.start_time)
+        .to_std()
+        .unwrap()
+        .as_millis() as u64;
+    // Reset start time to the actual start of the configuration, so that
+    // the total startup time is properly measured at the end.
+    state.start_time = Utc::now();
+
    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();

@@ -278,72 +275,33 @@ fn main() -> Result<()> {
        "running compute with features: {:?}",
        state.pspec.as_ref().unwrap().spec.features
    );
-    // before we release the mutex, fetch the swap size (if any) for later.
-    let swap_size_bytes = state.pspec.as_ref().unwrap().spec.swap_size_bytes;
    drop(state);

    // Launch remaining service threads
    let _monitor_handle = launch_monitor(&compute);
    let _configurator_handle = launch_configurator(&compute);

-    let mut prestartup_failed = false;
-    let mut delay_exit = false;
-
-    // Resize swap to the desired size if the compute spec says so
-    if let (Some(size_bytes), true) = (swap_size_bytes, resize_swap_on_bind) {
-        // To avoid 'swapoff' hitting postgres startup, we need to run resize-swap to completion
-        // *before* starting postgres.
-        //
-        // In theory, we could do this asynchronously if SkipSwapon was enabled for VMs, but this
-        // carries a risk of introducing hard-to-debug issues - e.g. if postgres sometimes gets
-        // OOM-killed during startup because swap wasn't available yet.
-        match resize_swap(size_bytes) {
-            Ok(()) => {
-                let size_gib = size_bytes as f32 / (1 << 20) as f32; // just for more coherent display.
-                info!(%size_bytes, %size_gib, "resized swap");
-            }
-            Err(err) => {
-                let err = err.context("failed to resize swap");
-                error!("{err:#}");
-
-                // Mark compute startup as failed; don't try to start postgres, and report this
-                // error to the control plane when it next asks.
-                prestartup_failed = true;
-                let mut state = compute.state.lock().unwrap();
-                state.error = Some(format!("{err:?}"));
-                state.status = ComputeStatus::Failed;
-                compute.state_changed.notify_all();
-                delay_exit = true;
-            }
-        }
-    }
-
    // Start Postgres
-    let mut pg = None;
+    let mut delay_exit = false;
    let mut exit_code = None;
-
-    if !prestartup_failed {
-        pg = match compute.start_compute(extension_server_port) {
-            Ok(pg) => Some(pg),
-            Err(err) => {
-                error!("could not start the compute node: {:#}", err);
-                let mut state = compute.state.lock().unwrap();
-                state.error = Some(format!("{:?}", err));
-                state.status = ComputeStatus::Failed;
-                // Notify others that Postgres failed to start. In case of configuring the
-                // empty compute, it's likely that API handler is still waiting for compute
-                // state change. With this we will notify it that compute is in Failed state,
-                // so control plane will know about it earlier and record proper error instead
-                // of timeout.
-                compute.state_changed.notify_all();
-                drop(state); // unlock
-                delay_exit = true;
-                None
-            }
-        };
-    } else {
-        warn!("skipping postgres startup because pre-startup step failed");
-    }
+    let pg = match compute.start_compute(extension_server_port) {
+        Ok(pg) => Some(pg),
+        Err(err) => {
+            error!("could not start the compute node: {:#}", err);
+            let mut state = compute.state.lock().unwrap();
+            state.error = Some(format!("{:?}", err));
+            state.status = ComputeStatus::Failed;
+            // Notify others that Postgres failed to start. In case of configuring the
+            // empty compute, it's likely that API handler is still waiting for compute
+            // state change. With this we will notify it that compute is in Failed state,
+            // so control plane will know about it earlier and record proper error instead
+            // of timeout.
+            compute.state_changed.notify_all();
+            drop(state); // unlock
+            delay_exit = true;
+            None
+        }
+    };

    // Start the vm-monitor if directed to. The vm-monitor only runs on linux
    // because it requires cgroups.
@@ -568,11 +526,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            Arg::new("resize-swap-on-bind")
-                .long("resize-swap-on-bind")
-                .action(clap::ArgAction::SetTrue),
-        )
 }

 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -818,15 +818,9 @@ impl ComputeNode {
                        Client::connect(zenith_admin_connstr.as_str(), NoTls)
                            .context("broken cloud_admin credential: tried connecting with cloud_admin but could not authenticate, and zenith_admin does not work either")?;
                    // Disable forwarding so that users don't get a cloud_admin role
-
-                    let mut func = || {
-                        client.simple_query("SET neon.forward_ddl = false")?;
-                        client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
-                        client.simple_query("GRANT zenith_admin TO cloud_admin")?;
-                        Ok::<_, anyhow::Error>(())
-                    };
-                    func().context("apply_config setup cloud_admin")?;
-
+                    client.simple_query("SET neon.forward_ddl = false")?;
+                    client.simple_query("CREATE USER cloud_admin WITH SUPERUSER")?;
+                    client.simple_query("GRANT zenith_admin TO cloud_admin")?;
                    drop(client);

                    // reconnect with connstring with expected name
@@ -838,29 +832,24 @@ impl ComputeNode {
        };

        // Disable DDL forwarding because control plane already knows about these roles/databases.
-        client
-            .simple_query("SET neon.forward_ddl = false")
-            .context("apply_config SET neon.forward_ddl = false")?;
+        client.simple_query("SET neon.forward_ddl = false")?;

        // Proceed with post-startup configuration. Note, that order of operations is important.
        let spec = &compute_state.pspec.as_ref().expect("spec must be set").spec;
-        create_neon_superuser(spec, &mut client).context("apply_config create_neon_superuser")?;
-        cleanup_instance(&mut client).context("apply_config cleanup_instance")?;
-        handle_roles(spec, &mut client).context("apply_config handle_roles")?;
-        handle_databases(spec, &mut client).context("apply_config handle_databases")?;
-        handle_role_deletions(spec, connstr.as_str(), &mut client)
-            .context("apply_config handle_role_deletions")?;
+        create_neon_superuser(spec, &mut client)?;
+        cleanup_instance(&mut client)?;
+        handle_roles(spec, &mut client)?;
+        handle_databases(spec, &mut client)?;
+        handle_role_deletions(spec, connstr.as_str(), &mut client)?;
        handle_grants(
            spec,
            &mut client,
            connstr.as_str(),
            self.has_feature(ComputeFeature::AnonExtension),
-        )
-        .context("apply_config handle_grants")?;
-        handle_extensions(spec, &mut client).context("apply_config handle_extensions")?;
-        handle_extension_neon(&mut client).context("apply_config handle_extension_neon")?;
-        create_availability_check_data(&mut client)
-            .context("apply_config create_availability_check_data")?;
+        )?;
+        handle_extensions(spec, &mut client)?;
+        handle_extension_neon(&mut client)?;
+        create_availability_check_data(&mut client)?;

        // 'Close' connection
        drop(client);
@@ -868,7 +857,7 @@ impl ComputeNode {
        // Run migrations separately to not hold up cold starts
        thread::spawn(move || {
            let mut client = Client::connect(connstr.as_str(), NoTls)?;
-            handle_migrations(&mut client).context("apply_config handle_migrations")
+            handle_migrations(&mut client)
        });
        Ok(())
    }
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -6,8 +6,8 @@ use std::path::Path;
 use anyhow::Result;

 use crate::pg_helpers::escape_conf_value;
-use crate::pg_helpers::{GenericOptionExt, PgOptionsSerialize};
-use compute_api::spec::{ComputeMode, ComputeSpec, GenericOption};
+use crate::pg_helpers::PgOptionsSerialize;
+use compute_api::spec::{ComputeMode, ComputeSpec};

 /// Check that `line` is inside a text file and put it there if it is not.
 /// Create file if it doesn't exist.
@@ -92,27 +92,6 @@ pub fn write_postgres_conf(
        }
    }

-    if cfg!(target_os = "linux") {
-        // Check /proc/sys/vm/overcommit_memory -- if it equals 2 (i.e. linux memory overcommit is
-        // disabled), then the control plane has enabled swap and we should set
-        // dynamic_shared_memory_type = 'mmap'.
-        //
-        // This is (maybe?) temporary - for more, see https://github.com/neondatabase/cloud/issues/12047.
-        let overcommit_memory_contents = std::fs::read_to_string("/proc/sys/vm/overcommit_memory")
-            // ignore any errors - they may be expected to occur under certain situations (e.g. when
-            // not running in Linux).
-            .unwrap_or_else(|_| String::new());
-        if overcommit_memory_contents.trim() == "2" {
-            let opt = GenericOption {
-                name: "dynamic_shared_memory_type".to_owned(),
-                value: Some("mmap".to_owned()),
-                vartype: "enum".to_owned(),
-            };
-
-            write!(file, "{}", opt.to_pg_setting())?;
-        }
-    }
-
    // If there are any extra options in the 'settings' field, append those
    if spec.cluster.settings.is_some() {
        writeln!(file, "# Managed by compute_ctl: begin")?;
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -14,5 +14,4 @@ pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
 pub mod spec;
-pub mod swap;
 pub mod sync_sk;
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -44,7 +44,7 @@ pub fn escape_conf_value(s: &str) -> String {
    format!("'{}'", res)
 }

-pub trait GenericOptionExt {
+trait GenericOptionExt {
    fn to_pg_option(&self) -> String;
    fn to_pg_setting(&self) -> String;
 }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -2,7 +2,7 @@ use std::fs::File;
 use std::path::Path;
 use std::str::FromStr;

-use anyhow::{anyhow, bail, Context, Result};
+use anyhow::{anyhow, bail, Result};
 use postgres::config::Config;
 use postgres::{Client, NoTls};
 use reqwest::StatusCode;
@@ -490,7 +490,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                "rename_db" => {
                    let new_name = op.new_name.as_ref().unwrap();

-                    if existing_dbs.contains_key(&op.name) {
+                    if existing_dbs.get(&op.name).is_some() {
                        let query: String = format!(
                            "ALTER DATABASE {} RENAME TO {}",
                            op.name.pg_quote(),
@@ -698,8 +698,7 @@ pub fn handle_grants(

        // it is important to run this after all grants
        if enable_anon_extension {
-            handle_extension_anon(spec, &db.owner, &mut db_client, false)
-                .context("handle_grants handle_extension_anon")?;
+            handle_extension_anon(spec, &db.owner, &mut db_client, false)?;
        }
    }

@@ -814,36 +813,28 @@ $$;"#,
        // Add new migrations below.
    ];

-    let mut func = || {
-        let query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
-        client.simple_query(query)?;
+    let mut query = "CREATE SCHEMA IF NOT EXISTS neon_migration";
+    client.simple_query(query)?;

-        let query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
-        client.simple_query(query)?;
+    query = "CREATE TABLE IF NOT EXISTS neon_migration.migration_id (key INT NOT NULL PRIMARY KEY, id bigint NOT NULL DEFAULT 0)";
+    client.simple_query(query)?;

-        let query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
-        client.simple_query(query)?;
+    query = "INSERT INTO neon_migration.migration_id VALUES (0, 0) ON CONFLICT DO NOTHING";
+    client.simple_query(query)?;

-        let query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
-        client.simple_query(query)?;
+    query = "ALTER SCHEMA neon_migration OWNER TO cloud_admin";
+    client.simple_query(query)?;

-        let query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
-        client.simple_query(query)?;
-        Ok::<_, anyhow::Error>(())
-    };
-    func().context("handle_migrations prepare")?;
+    query = "REVOKE ALL ON SCHEMA neon_migration FROM PUBLIC";
+    client.simple_query(query)?;

-    let query = "SELECT id FROM neon_migration.migration_id";
-    let row = client
-        .query_one(query, &[])
-        .context("handle_migrations get migration_id")?;
+    query = "SELECT id FROM neon_migration.migration_id";
+    let row = client.query_one(query, &[])?;
    let mut current_migration: usize = row.get::<&str, i64>("id") as usize;
    let starting_migration_id = current_migration;

-    let query = "BEGIN";
-    client
-        .simple_query(query)
-        .context("handle_migrations begin")?;
+    query = "BEGIN";
+    client.simple_query(query)?;

    while current_migration < migrations.len() {
        let migration = &migrations[current_migration];
@@ -851,9 +842,7 @@ $$;"#,
            info!("Skip migration id={}", current_migration);
        } else {
            info!("Running migration:\n{}\n", migration);
-            client.simple_query(migration).with_context(|| {
-                format!("handle_migrations current_migration={}", current_migration)
-            })?;
+            client.simple_query(migration)?;
        }
        current_migration += 1;
    }
@@ -861,14 +850,10 @@ $$;"#,
        "UPDATE neon_migration.migration_id SET id={}",
        migrations.len()
    );
-    client
-        .simple_query(&setval)
-        .context("handle_migrations update id")?;
+    client.simple_query(&setval)?;

-    let query = "COMMIT";
-    client
-        .simple_query(query)
-        .context("handle_migrations commit")?;
+    query = "COMMIT";
+    client.simple_query(query)?;

    info!(
        "Ran {} migrations",
--- a/compute_tools/src/swap.rs
+++ b/compute_tools/src/swap.rs
@@ -1,36 +0,0 @@
-use anyhow::{anyhow, Context};
-use tracing::warn;
-
-pub const RESIZE_SWAP_BIN: &str = "/neonvm/bin/resize-swap";
-
-pub fn resize_swap(size_bytes: u64) -> anyhow::Result<()> {
-    // run `/neonvm/bin/resize-swap --once {size_bytes}`
-    //
-    // Passing '--once' causes resize-swap to delete itself after successful completion, which
-    // means that if compute_ctl restarts later, we won't end up calling 'swapoff' while
-    // postgres is running.
-    //
-    // NOTE: resize-swap is not very clever. If present, --once MUST be the first arg.
-    let child_result = std::process::Command::new("/usr/bin/sudo")
-        .arg(RESIZE_SWAP_BIN)
-        .arg("--once")
-        .arg(size_bytes.to_string())
-        .spawn();
-
-    if matches!(&child_result, Err(e) if e.kind() == std::io::ErrorKind::NotFound) {
-        warn!("ignoring \"not found\" error from resize-swap to avoid swapoff while compute is running");
-        return Ok(());
-    }
-
-    child_result
-        .context("spawn() failed")
-        .and_then(|mut child| child.wait().context("wait() failed"))
-        .and_then(|status| match status.success() {
-            true => Ok(()),
-            false => Err(anyhow!("process exited with {status}")),
-        })
-        // wrap any prior error with the overall context that we couldn't run the command
-        .with_context(|| {
-            format!("could not run `/usr/bin/sudo {RESIZE_SWAP_BIN} --once {size_bytes}`")
-        })
-}
--- a/control_plane/Cargo.toml
+++ b/control_plane/Cargo.toml
@@ -17,7 +17,6 @@ nix.workspace = true
 once_cell.workspace = true
 postgres.workspace = true
 hex.workspace = true
-humantime-serde.workspace = true
 hyper.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["blocking", "json"] }
@@ -28,7 +27,6 @@ serde_with.workspace = true
 tar.workspace = true
 thiserror.workspace = true
 toml.workspace = true
-toml_edit.workspace = true
 tokio.workspace = true
 tokio-postgres.workspace = true
 tokio-util.workspace = true
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "storage_controller"
+name = "attachment_service"
 version = "0.1.0"
 edition.workspace = true
 license.workspace = true
@@ -25,13 +25,12 @@ git-version.workspace = true
 hex.workspace = true
 hyper.workspace = true
 humantime.workspace = true
-itertools.workspace = true
 lasso.workspace = true
 once_cell.workspace = true
 pageserver_api.workspace = true
 pageserver_client.workspace = true
 postgres_connection.workspace = true
-reqwest = { workspace = true, features = ["stream"] }
+reqwest.workspace = true
 routerify.workspace = true
 serde.workspace = true
 serde_json.workspace = true
@@ -40,15 +39,13 @@ tokio.workspace = true
 tokio-util.workspace = true
 tracing.workspace = true
 measured.workspace = true
-strum.workspace = true
-strum_macros.workspace = true

 diesel = { version = "2.1.4", features = ["serde_json", "postgres", "r2d2"] }
 diesel_migrations = { version = "2.1.0" }
 r2d2 = { version = "0.8.10" }

-utils = { path = "../libs/utils/" }
-metrics = { path = "../libs/metrics/" }
-control_plane = { path = "../control_plane" }
-workspace_hack = { version = "0.1", path = "../workspace_hack" }
+utils = { path = "../../libs/utils/" }
+metrics = { path = "../../libs/metrics/" }
+control_plane = { path = ".." }
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }

--- a/control_plane/attachment_service/migrations/.keep
+++ b/control_plane/attachment_service/migrations/.keep
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/down.sql
--- a/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
+++ b/control_plane/attachment_service/migrations/00000000000000_diesel_initial_setup/up.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/down.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-211257_create_tenant_shards/up.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/down.sql
--- a/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
+++ b/control_plane/attachment_service/migrations/2024-01-07-212945_create_nodes/up.sql
--- a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/down.sql
--- a/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
+++ b/control_plane/attachment_service/migrations/2024-02-29-094122_generations_null/up.sql
--- a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/down.sql
--- a/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
+++ b/control_plane/attachment_service/migrations/2024-03-18-184429_rename_policy/up.sql
--- a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql
+++ b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/down.sql
--- a/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql
+++ b/control_plane/attachment_service/migrations/2024-03-27-133204_tenant_policies/up.sql
--- a/control_plane/attachment_service/src/auth.rs
+++ b/control_plane/attachment_service/src/auth.rs
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -0,0 +1,465 @@
+use std::{collections::HashMap, time::Duration};
+
+use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
+use control_plane::local_env::LocalEnv;
+use hyper::{Method, StatusCode};
+use pageserver_api::shard::{ShardCount, ShardNumber, ShardStripeSize, TenantShardId};
+use postgres_connection::parse_host_port;
+use serde::{Deserialize, Serialize};
+use tokio_util::sync::CancellationToken;
+use utils::{
+    backoff::{self},
+    id::{NodeId, TenantId},
+};
+
+use crate::service::Config;
+
+const SLOWDOWN_DELAY: Duration = Duration::from_secs(5);
+
+pub(crate) const API_CONCURRENCY: usize = 32;
+
+struct ShardedComputeHookTenant {
+    stripe_size: ShardStripeSize,
+    shard_count: ShardCount,
+    shards: Vec<(ShardNumber, NodeId)>,
+}
+
+enum ComputeHookTenant {
+    Unsharded(NodeId),
+    Sharded(ShardedComputeHookTenant),
+}
+
+impl ComputeHookTenant {
+    /// Construct with at least one shard's information
+    fn new(tenant_shard_id: TenantShardId, stripe_size: ShardStripeSize, node_id: NodeId) -> Self {
+        if tenant_shard_id.shard_count.count() > 1 {
+            Self::Sharded(ShardedComputeHookTenant {
+                shards: vec![(tenant_shard_id.shard_number, node_id)],
+                stripe_size,
+                shard_count: tenant_shard_id.shard_count,
+            })
+        } else {
+            Self::Unsharded(node_id)
+        }
+    }
+
+    /// Set one shard's location.  If stripe size or shard count have changed, Self is reset
+    /// and drops existing content.
+    fn update(
+        &mut self,
+        tenant_shard_id: TenantShardId,
+        stripe_size: ShardStripeSize,
+        node_id: NodeId,
+    ) {
+        match self {
+            Self::Unsharded(existing_node_id) if tenant_shard_id.shard_count.count() == 1 => {
+                *existing_node_id = node_id
+            }
+            Self::Sharded(sharded_tenant)
+                if sharded_tenant.stripe_size == stripe_size
+                    && sharded_tenant.shard_count == tenant_shard_id.shard_count =>
+            {
+                if let Some(existing) = sharded_tenant
+                    .shards
+                    .iter()
+                    .position(|s| s.0 == tenant_shard_id.shard_number)
+                {
+                    sharded_tenant.shards.get_mut(existing).unwrap().1 = node_id;
+                } else {
+                    sharded_tenant
+                        .shards
+                        .push((tenant_shard_id.shard_number, node_id));
+                    sharded_tenant.shards.sort_by_key(|s| s.0)
+                }
+            }
+            _ => {
+                // Shard count or stripe size changed: reset struct.
+                *self = Self::new(tenant_shard_id, stripe_size, node_id);
+            }
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
+struct ComputeHookNotifyRequestShard {
+    node_id: NodeId,
+    shard_number: ShardNumber,
+}
+
+/// Request body that we send to the control plane to notify it of where a tenant is attached
+#[derive(Serialize, Deserialize, Debug)]
+struct ComputeHookNotifyRequest {
+    tenant_id: TenantId,
+    stripe_size: Option<ShardStripeSize>,
+    shards: Vec<ComputeHookNotifyRequestShard>,
+}
+
+/// Error type for attempts to call into the control plane compute notification hook
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum NotifyError {
+    // Request was not sent successfully, e.g. transport error
+    #[error("Sending request: {0}")]
+    Request(#[from] reqwest::Error),
+    // Request could not be serviced right now due to ongoing Operation in control plane, but should be possible soon.
+    #[error("Control plane tenant busy")]
+    Busy,
+    // Explicit 429 response asking us to retry less frequently
+    #[error("Control plane overloaded")]
+    SlowDown,
+    // A 502, 503 or 504 response indicates the control plane can't handle the request right now
+    #[error("Control plane unavailable (status {0})")]
+    Unavailable(StatusCode),
+    // API returned unexpected non-success status.  We will retry, but log a warning.
+    #[error("Control plane returned unexpected status {0}")]
+    Unexpected(StatusCode),
+    // We shutdown while sending
+    #[error("Shutting down")]
+    ShuttingDown,
+    // A response indicates we will never succeed, such as 400, 401 or 403
+    #[error("Non-retryable error {0}")]
+    Fatal(StatusCode),
+}
+
+impl ComputeHookTenant {
+    fn maybe_reconfigure(&self, tenant_id: TenantId) -> Option<ComputeHookNotifyRequest> {
+        match self {
+            Self::Unsharded(node_id) => Some(ComputeHookNotifyRequest {
+                tenant_id,
+                shards: vec![ComputeHookNotifyRequestShard {
+                    shard_number: ShardNumber(0),
+                    node_id: *node_id,
+                }],
+                stripe_size: None,
+            }),
+            Self::Sharded(sharded_tenant)
+                if sharded_tenant.shards.len() == sharded_tenant.shard_count.count() as usize =>
+            {
+                Some(ComputeHookNotifyRequest {
+                    tenant_id,
+                    shards: sharded_tenant
+                        .shards
+                        .iter()
+                        .map(|(shard_number, node_id)| ComputeHookNotifyRequestShard {
+                            shard_number: *shard_number,
+                            node_id: *node_id,
+                        })
+                        .collect(),
+                    stripe_size: Some(sharded_tenant.stripe_size),
+                })
+            }
+            Self::Sharded(sharded_tenant) => {
+                // Sharded tenant doesn't yet have information for all its shards
+
+                tracing::info!(
+                    "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
+                    sharded_tenant.shards.len(),
+                    sharded_tenant.shard_count.count()
+                );
+                None
+            }
+        }
+    }
+}
+
+/// The compute hook is a destination for notifications about changes to tenant:pageserver
+/// mapping.  It aggregates updates for the shards in a tenant, and when appropriate reconfigures
+/// the compute connection string.
+pub(super) struct ComputeHook {
+    config: Config,
+    state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
+    authorization_header: Option<String>,
+}
+
+impl ComputeHook {
+    pub(super) fn new(config: Config) -> Self {
+        let authorization_header = config
+            .control_plane_jwt_token
+            .clone()
+            .map(|jwt| format!("Bearer {}", jwt));
+
+        Self {
+            state: Default::default(),
+            config,
+            authorization_header,
+        }
+    }
+
+    /// For test environments: use neon_local's LocalEnv to update compute
+    async fn do_notify_local(
+        &self,
+        reconfigure_request: ComputeHookNotifyRequest,
+    ) -> anyhow::Result<()> {
+        let env = match LocalEnv::load_config() {
+            Ok(e) => e,
+            Err(e) => {
+                tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
+                return Ok(());
+            }
+        };
+        let cplane =
+            ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
+        let ComputeHookNotifyRequest {
+            tenant_id,
+            shards,
+            stripe_size,
+        } = reconfigure_request;
+
+        let compute_pageservers = shards
+            .into_iter()
+            .map(|shard| {
+                let ps_conf = env
+                    .get_pageserver_conf(shard.node_id)
+                    .expect("Unknown pageserver");
+                let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
+                    .expect("Unable to parse listen_pg_addr");
+                (pg_host, pg_port.unwrap_or(5432))
+            })
+            .collect::<Vec<_>>();
+
+        for (endpoint_name, endpoint) in &cplane.endpoints {
+            if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
+                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
+                endpoint
+                    .reconfigure(compute_pageservers.clone(), stripe_size)
+                    .await?;
+            }
+        }
+
+        Ok(())
+    }
+
+    async fn do_notify_iteration(
+        &self,
+        client: &reqwest::Client,
+        url: &String,
+        reconfigure_request: &ComputeHookNotifyRequest,
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
+        let req = client.request(Method::PUT, url);
+        let req = if let Some(value) = &self.authorization_header {
+            req.header(reqwest::header::AUTHORIZATION, value)
+        } else {
+            req
+        };
+
+        tracing::info!(
+            "Sending notify request to {} ({:?})",
+            url,
+            reconfigure_request
+        );
+        let send_result = req.json(&reconfigure_request).send().await;
+        let response = match send_result {
+            Ok(r) => r,
+            Err(e) => return Err(e.into()),
+        };
+
+        // Treat all 2xx responses as success
+        if response.status() >= StatusCode::OK && response.status() < StatusCode::MULTIPLE_CHOICES {
+            if response.status() != StatusCode::OK {
+                // Non-200 2xx response: it doesn't make sense to retry, but this is unexpected, so
+                // log a warning.
+                tracing::warn!(
+                    "Unexpected 2xx response code {} from control plane",
+                    response.status()
+                );
+            }
+
+            return Ok(());
+        }
+
+        // Error response codes
+        match response.status() {
+            StatusCode::TOO_MANY_REQUESTS => {
+                // TODO: 429 handling should be global: set some state visible to other requests
+                // so that they will delay before starting, rather than all notifications trying
+                // once before backing off.
+                tokio::time::timeout(SLOWDOWN_DELAY, cancel.cancelled())
+                    .await
+                    .ok();
+                Err(NotifyError::SlowDown)
+            }
+            StatusCode::LOCKED => {
+                // We consider this fatal, because it's possible that the control plane operation
+                // blocking us is also the one that is waiting for this reconcile.  We should let the
+                // reconciler calling this hook fail, to give control plane a chance to un-lock.
+                tracing::info!("Control plane reports tenant is locked, dropping out of notify");
+                Err(NotifyError::Busy)
+            }
+            StatusCode::SERVICE_UNAVAILABLE
+            | StatusCode::GATEWAY_TIMEOUT
+            | StatusCode::BAD_GATEWAY => Err(NotifyError::Unavailable(response.status())),
+            StatusCode::BAD_REQUEST | StatusCode::UNAUTHORIZED | StatusCode::FORBIDDEN => {
+                Err(NotifyError::Fatal(response.status()))
+            }
+            _ => Err(NotifyError::Unexpected(response.status())),
+        }
+    }
+
+    async fn do_notify(
+        &self,
+        url: &String,
+        reconfigure_request: ComputeHookNotifyRequest,
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
+        let client = reqwest::Client::new();
+        backoff::retry(
+            || self.do_notify_iteration(&client, url, &reconfigure_request, cancel),
+            |e| {
+                matches!(
+                    e,
+                    NotifyError::Fatal(_) | NotifyError::Unexpected(_) | NotifyError::Busy
+                )
+            },
+            3,
+            10,
+            "Send compute notification",
+            cancel,
+        )
+        .await
+        .ok_or_else(|| NotifyError::ShuttingDown)
+        .and_then(|x| x)
+    }
+
+    /// Call this to notify the compute (postgres) tier of new pageservers to use
+    /// for a tenant.  notify() is called by each shard individually, and this function
+    /// will decide whether an update to the tenant is sent.  An update is sent on the
+    /// condition that:
+    /// - We know a pageserver for every shard.
+    /// - All the shards have the same shard_count (i.e. we are not mid-split)
+    ///
+    /// Cancellation token enables callers to drop out, e.g. if calling from a Reconciler
+    /// that is cancelled.
+    ///
+    /// This function is fallible, including in the case that the control plane is transiently
+    /// unavailable.  A limited number of retries are done internally to efficiently hide short unavailability
+    /// periods, but we don't retry forever.  The **caller** is responsible for handling failures and
+    /// ensuring that they eventually call again to ensure that the compute is eventually notified of
+    /// the proper pageserver nodes for a tenant.
+    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
+    pub(super) async fn notify(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node_id: NodeId,
+        stripe_size: ShardStripeSize,
+        cancel: &CancellationToken,
+    ) -> Result<(), NotifyError> {
+        let mut locked = self.state.lock().await;
+
+        use std::collections::hash_map::Entry;
+        let tenant = match locked.entry(tenant_shard_id.tenant_id) {
+            Entry::Vacant(e) => e.insert(ComputeHookTenant::new(
+                tenant_shard_id,
+                stripe_size,
+                node_id,
+            )),
+            Entry::Occupied(e) => {
+                let tenant = e.into_mut();
+                tenant.update(tenant_shard_id, stripe_size, node_id);
+                tenant
+            }
+        };
+
+        let reconfigure_request = tenant.maybe_reconfigure(tenant_shard_id.tenant_id);
+        let Some(reconfigure_request) = reconfigure_request else {
+            // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
+            // until it does.
+            tracing::info!("Tenant isn't yet ready to emit a notification");
+            return Ok(());
+        };
+
+        if let Some(notify_url) = &self.config.compute_hook_url {
+            self.do_notify(notify_url, reconfigure_request, cancel)
+                .await
+        } else {
+            self.do_notify_local(reconfigure_request)
+                .await
+                .map_err(|e| {
+                    // This path is for testing only, so munge the error into our prod-style error type.
+                    tracing::error!("Local notification hook failed: {e}");
+                    NotifyError::Fatal(StatusCode::INTERNAL_SERVER_ERROR)
+                })
+        }
+    }
+}
+
+#[cfg(test)]
+pub(crate) mod tests {
+    use pageserver_api::shard::{ShardCount, ShardNumber};
+    use utils::id::TenantId;
+
+    use super::*;
+
+    #[test]
+    fn tenant_updates() -> anyhow::Result<()> {
+        let tenant_id = TenantId::generate();
+        let mut tenant_state = ComputeHookTenant::new(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(0),
+                shard_number: ShardNumber(0),
+            },
+            ShardStripeSize(12345),
+            NodeId(1),
+        );
+
+        // An unsharded tenant is always ready to emit a notification
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .shards
+                .len(),
+            1
+        );
+        assert!(tenant_state
+            .maybe_reconfigure(tenant_id)
+            .unwrap()
+            .stripe_size
+            .is_none());
+
+        // Writing the first shard of a multi-sharded situation (i.e. in a split)
+        // resets the tenant state and puts it in a non-notifying state (need to
+        // see all shards)
+        tenant_state.update(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(2),
+                shard_number: ShardNumber(1),
+            },
+            ShardStripeSize(32768),
+            NodeId(1),
+        );
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_none());
+
+        // Writing the second shard makes it ready to notify
+        tenant_state.update(
+            TenantShardId {
+                tenant_id,
+                shard_count: ShardCount::new(2),
+                shard_number: ShardNumber(0),
+            },
+            ShardStripeSize(32768),
+            NodeId(1),
+        );
+
+        assert!(tenant_state.maybe_reconfigure(tenant_id).is_some());
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .shards
+                .len(),
+            2
+        );
+        assert_eq!(
+            tenant_state
+                .maybe_reconfigure(tenant_id)
+                .unwrap()
+                .stripe_size,
+            Some(ShardStripeSize(32768))
+        );
+
+        Ok(())
+    }
+}
--- a/control_plane/attachment_service/src/heartbeater.rs
+++ b/control_plane/attachment_service/src/heartbeater.rs
@@ -184,19 +184,6 @@ impl HeartbeaterTask {
                }
            }
        }
-        tracing::info!(
-            "Heartbeat round complete for {} nodes, {} offline",
-            new_state.len(),
-            new_state
-                .values()
-                .filter(|s| match s {
-                    PageserverState::Available { .. } => {
-                        false
-                    }
-                    PageserverState::Offline => true,
-                })
-                .count()
-        );

        let mut deltas = Vec::new();
        let now = Instant::now();
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -4,12 +4,10 @@ use crate::metrics::{
 };
 use crate::reconciler::ReconcileError;
 use crate::service::{Service, STARTUP_RECONCILE_TIMEOUT};
-use anyhow::Context;
 use futures::Future;
 use hyper::header::CONTENT_TYPE;
 use hyper::{Body, Request, Response};
 use hyper::{StatusCode, Uri};
-use metrics::{BuildInfo, NeonMetrics};
 use pageserver_api::models::{
    TenantConfigRequest, TenantCreateRequest, TenantLocationConfigRequest, TenantShardSplitRequest,
    TenantTimeTravelRequest, TimelineCreateRequest,
@@ -46,19 +44,15 @@ use control_plane::storage_controller::{AttachHookRequest, InspectRequest};
 use routerify::Middleware;

 /// State available to HTTP request handlers
+#[derive(Clone)]
 pub struct HttpState {
    service: Arc<crate::service::Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
-    neon_metrics: NeonMetrics,
    allowlist_routes: Vec<Uri>,
 }

 impl HttpState {
-    pub fn new(
-        service: Arc<crate::service::Service>,
-        auth: Option<Arc<SwappableJwtAuth>>,
-        build_info: BuildInfo,
-    ) -> Self {
+    pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
        let allowlist_routes = ["/status", "/ready", "/metrics"]
            .iter()
            .map(|v| v.parse().unwrap())
@@ -66,7 +60,6 @@ impl HttpState {
        Self {
            service,
            auth,
-            neon_metrics: NeonMetrics::new(build_info),
            allowlist_routes,
        }
    }
@@ -259,12 +252,6 @@ async fn handle_tenant_time_travel_remote_storage(
    json_response(StatusCode::OK, ())
 }

-fn map_reqwest_hyper_status(status: reqwest::StatusCode) -> Result<hyper::StatusCode, ApiError> {
-    hyper::StatusCode::from_u16(status.as_u16())
-        .context("invalid status code")
-        .map_err(ApiError::InternalServerError)
-}
-
 async fn handle_tenant_secondary_download(
    service: Arc<Service>,
    req: Request<Body>,
@@ -273,7 +260,7 @@ async fn handle_tenant_secondary_download(
    let wait = parse_query_param(&req, "wait_ms")?.map(Duration::from_millis);

    let (status, progress) = service.tenant_secondary_download(tenant_id, wait).await?;
-    json_response(map_reqwest_hyper_status(status)?, progress)
+    json_response(status, progress)
 }

 async fn handle_tenant_delete(
@@ -284,10 +271,7 @@ async fn handle_tenant_delete(
    check_permissions(&req, Scope::PageServerApi)?;

    deletion_wrapper(service, move |service| async move {
-        service
-            .tenant_delete(tenant_id)
-            .await
-            .and_then(map_reqwest_hyper_status)
+        service.tenant_delete(tenant_id).await
    })
    .await
 }
@@ -318,10 +302,7 @@ async fn handle_tenant_timeline_delete(
    let timeline_id: TimelineId = parse_request_param(&req, "timeline_id")?;

    deletion_wrapper(service, move |service| async move {
-        service
-            .tenant_timeline_delete(tenant_id, timeline_id)
-            .await
-            .and_then(map_reqwest_hyper_status)
+        service.tenant_timeline_delete(tenant_id, timeline_id).await
    })
    .await
 }
@@ -384,9 +365,11 @@ async fn handle_tenant_timeline_passthrough(
    }

    // We have a reqest::Response, would like a http::Response
-    let mut builder = hyper::Response::builder().status(map_reqwest_hyper_status(resp.status())?);
+    let mut builder = hyper::Response::builder()
+        .status(resp.status())
+        .version(resp.version());
    for (k, v) in resp.headers() {
-        builder = builder.header(k.as_str(), v.as_bytes());
+        builder = builder.header(k, v);
    }

    let response = builder
@@ -416,15 +399,6 @@ async fn handle_tenant_describe(
    json_response(StatusCode::OK, service.tenant_describe(tenant_id)?)
 }

-async fn handle_tenant_list(
-    service: Arc<Service>,
-    req: Request<Body>,
-) -> Result<Response<Body>, ApiError> {
-    check_permissions(&req, Scope::Admin)?;
-
-    json_response(StatusCode::OK, service.tenant_list())
-}
-
 async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -438,10 +412,7 @@ async fn handle_node_list(req: Request<Body>) -> Result<Response<Body>, ApiError
    check_permissions(&req, Scope::Admin)?;

    let state = get_state(&req);
-    let nodes = state.service.node_list().await?;
-    let api_nodes = nodes.into_iter().map(|n| n.describe()).collect::<Vec<_>>();
-
-    json_response(StatusCode::OK, api_nodes)
+    json_response(StatusCode::OK, state.service.node_list().await?)
 }

 async fn handle_node_drop(req: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -533,18 +504,6 @@ async fn handle_tenant_drop(req: Request<Body>) -> Result<Response<Body>, ApiErr
    json_response(StatusCode::OK, state.service.tenant_drop(tenant_id).await?)
 }

-async fn handle_tenant_import(req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
-    check_permissions(&req, Scope::PageServerApi)?;
-
-    let state = get_state(&req);
-
-    json_response(
-        StatusCode::OK,
-        state.service.tenant_import(tenant_id).await?,
-    )
-}
-
 async fn handle_tenants_dump(req: Request<Body>) -> Result<Response<Body>, ApiError> {
    check_permissions(&req, Scope::Admin)?;

@@ -631,17 +590,9 @@ where
    .await
 }

-/// Check if the required scope is held in the request's token, or if the request has
-/// a token with 'admin' scope then always permit it.
 fn check_permissions(request: &Request<Body>, required_scope: Scope) -> Result<(), ApiError> {
    check_permission_with(request, |claims| {
-        match crate::auth::check_permission(claims, required_scope) {
-            Err(e) => match crate::auth::check_permission(claims, Scope::Admin) {
-                Ok(()) => Ok(()),
-                Err(_) => Err(e),
-            },
-            Ok(()) => Ok(()),
-        }
+        crate::auth::check_permission(claims, required_scope)
    })
 }

@@ -701,11 +652,10 @@ fn epilogue_metrics_middleware<B: hyper::body::HttpBody + Send + Sync + 'static>
    })
 }

-pub async fn measured_metrics_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+pub async fn measured_metrics_handler(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
    pub const TEXT_FORMAT: &str = "text/plain; version=0.0.4";

-    let state = get_state(&req);
-    let payload = crate::metrics::METRICS_REGISTRY.encode(&state.neon_metrics);
+    let payload = crate::metrics::METRICS_REGISTRY.encode();
    let response = Response::builder()
        .status(200)
        .header(CONTENT_TYPE, TEXT_FORMAT)
@@ -734,7 +684,6 @@ where
 pub fn make_router(
    service: Arc<Service>,
    auth: Option<Arc<SwappableJwtAuth>>,
-    build_info: BuildInfo,
 ) -> RouterBuilder<hyper::Body, ApiError> {
    let mut router = endpoint::make_router()
        .middleware(prologue_metrics_middleware())
@@ -751,7 +700,7 @@ pub fn make_router(
    }

    router
-        .data(Arc::new(HttpState::new(service, auth, build_info)))
+        .data(Arc::new(HttpState::new(service, auth)))
        .get("/metrics", |r| {
            named_request_span(r, measured_metrics_handler, RequestName("metrics"))
        })
@@ -782,13 +731,6 @@ pub fn make_router(
        .post("/debug/v1/node/:node_id/drop", |r| {
            named_request_span(r, handle_node_drop, RequestName("debug_v1_node_drop"))
        })
-        .post("/debug/v1/tenant/:tenant_id/import", |r| {
-            named_request_span(
-                r,
-                handle_tenant_import,
-                RequestName("debug_v1_tenant_import"),
-            )
-        })
        .get("/debug/v1/tenant", |r| {
            named_request_span(r, handle_tenants_dump, RequestName("debug_v1_tenant"))
        })
@@ -851,9 +793,6 @@ pub fn make_router(
                RequestName("control_v1_tenant_describe"),
            )
        })
-        .get("/control/v1/tenant", |r| {
-            tenant_service_handler(r, handle_tenant_list, RequestName("control_v1_tenant_list"))
-        })
        .put("/control/v1/tenant/:tenant_id/policy", |r| {
            named_request_span(
                r,
@@ -912,7 +851,7 @@ pub fn make_router(
                RequestName("v1_tenant_timeline"),
            )
        })
-        // Tenant detail GET passthrough to shard zero:
+        // Tenant detail GET passthrough to shard zero
        .get("/v1/tenant/:tenant_id", |r| {
            tenant_service_handler(
                r,
@@ -920,14 +859,13 @@ pub fn make_router(
                RequestName("v1_tenant_passthrough"),
            )
        })
-        // The `*` in the  URL is a wildcard: any tenant/timeline GET APIs on the pageserver
-        // are implicitly exposed here.  This must be last in the list to avoid
-        // taking precedence over other GET methods we might implement by hand.
-        .get("/v1/tenant/:tenant_id/*", |r| {
+        // Timeline GET passthrough to shard zero.  Note that the `*` in the URL is a wildcard: any future
+        // timeline GET APIs will be implicitly included.
+        .get("/v1/tenant/:tenant_id/timeline*", |r| {
            tenant_service_handler(
                r,
                handle_tenant_timeline_passthrough,
-                RequestName("v1_tenant_passthrough"),
+                RequestName("v1_tenant_timeline_passthrough"),
            )
        })
 }
--- a/control_plane/attachment_service/src/id_lock_map.rs
+++ b/control_plane/attachment_service/src/id_lock_map.rs
@@ -0,0 +1,54 @@
+use std::{collections::HashMap, sync::Arc};
+
+/// A map of locks covering some arbitrary identifiers. Useful if you have a collection of objects but don't
+/// want to embed a lock in each one, or if your locking granularity is different to your object granularity.
+/// For example, used in the storage controller where the objects are tenant shards, but sometimes locking
+/// is needed at a tenant-wide granularity.
+pub(crate) struct IdLockMap<T>
+where
+    T: Eq + PartialEq + std::hash::Hash,
+{
+    /// Sync mutex guarding the map of per-key async locks (inserted on first use) that callers await.
+    entities: std::sync::Mutex<std::collections::HashMap<T, Arc<tokio::sync::RwLock<()>>>>,
+}
+
+impl<T> IdLockMap<T>
+where
+    T: Eq + PartialEq + std::hash::Hash,
+{
+    /// Future resolving to an owned read guard on the lock for `key`, created on first use.
+    pub(crate) fn shared(
+        &self,
+        key: T,
+    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockReadGuard<()>> {
+        let mut locked = self.entities.lock().unwrap();
+        locked.entry(key).or_default().clone().read_owned()
+    }
+
+    /// Future resolving to an owned write guard on the lock for `key`, created on first use.
+    pub(crate) fn exclusive(
+        &self,
+        key: T,
+    ) -> impl std::future::Future<Output = tokio::sync::OwnedRwLockWriteGuard<()>> {
+        let mut locked = self.entities.lock().unwrap();
+        locked.entry(key).or_default().clone().write_owned()
+    }
+
+    /// Periodic housekeeping so the map does not grow indefinitely: drops entries nobody references.
+    /// A strong count above one means a guard or a pending lock future still holds the `Arc`.
+    pub(crate) fn housekeeping(&self) {
+        let mut locked = self.entities.lock().unwrap();
+        locked.retain(|_k, lock| Arc::strong_count(lock) > 1)
+    }
+}
+
+impl<T> Default for IdLockMap<T>
+where
+    T: Eq + PartialEq + std::hash::Hash,
+{
+    fn default() -> Self {
+        // Start with an empty map; `shared`/`exclusive` insert a lock per key on demand.
+        let entities = std::sync::Mutex::new(HashMap::new());
+        Self { entities }
+    }
+}
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -14,7 +14,7 @@ mod reconciler;
 mod scheduler;
 mod schema;
 pub mod service;
-mod tenant_shard;
+mod tenant_state;

 #[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone, Serialize)]
 struct Sequence(u64);
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -1,22 +1,18 @@
 use anyhow::{anyhow, Context};
+use attachment_service::http::make_router;
+use attachment_service::metrics::preinitialize_metrics;
+use attachment_service::persistence::Persistence;
+use attachment_service::service::{Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT};
 use camino::Utf8PathBuf;
 use clap::Parser;
 use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
-use metrics::BuildInfo;
 use std::sync::Arc;
-use storage_controller::http::make_router;
-use storage_controller::metrics::preinitialize_metrics;
-use storage_controller::persistence::Persistence;
-use storage_controller::service::{
-    Config, Service, MAX_UNAVAILABLE_INTERVAL_DEFAULT, RECONCILER_CONCURRENCY_DEFAULT,
-};
 use tokio::signal::unix::SignalKind;
 use tokio_util::sync::CancellationToken;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};

-use utils::sentry_init::init_sentry;
 use utils::{project_build_tag, project_git_version, tcp_listener};

 project_git_version!(GIT_VERSION);
@@ -54,7 +50,7 @@ struct Cli {
    #[arg(short, long)]
    path: Option<Utf8PathBuf>,

-    /// URL to connect to postgres, like postgresql://localhost:1234/storage_controller
+    /// URL to connect to postgres, like postgresql://localhost:1234/attachment_service
    #[arg(long)]
    database_url: Option<String>,

@@ -65,14 +61,6 @@ struct Cli {
    /// Grace period before marking unresponsive pageserver offline
    #[arg(long)]
    max_unavailable_interval: Option<humantime::Duration>,
-
-    /// Maximum number of reconcilers that may run in parallel
-    #[arg(long)]
-    reconciler_concurrency: Option<usize>,
-
-    /// How long to wait for the initial database connection to be available.
-    #[arg(long, default_value = "5s")]
-    db_connect_timeout: humantime::Duration,
 }

 enum StrictMode {
@@ -170,8 +158,6 @@ fn main() -> anyhow::Result<()> {
        std::process::exit(1);
    }));

-    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
-
    tokio::runtime::Builder::new_current_thread()
        // We use spawn_blocking for database operations, so require approximately
        // as many blocking threads as we will open database connections.
@@ -203,11 +189,6 @@ async fn async_main() -> anyhow::Result<()> {
        args.listen
    );

-    let build_info = BuildInfo {
-        revision: GIT_VERSION,
-        build_tag: BUILD_TAG,
-    };
-
    let strict_mode = if args.dev {
        StrictMode::Dev
    } else {
@@ -252,14 +233,9 @@ async fn async_main() -> anyhow::Result<()> {
            .max_unavailable_interval
            .map(humantime::Duration::into)
            .unwrap_or(MAX_UNAVAILABLE_INTERVAL_DEFAULT),
-        reconciler_concurrency: args
-            .reconciler_concurrency
-            .unwrap_or(RECONCILER_CONCURRENCY_DEFAULT),
    };

    // After loading secrets & config, but before starting anything else, apply database migrations
-    Persistence::await_connection(&secrets.database_url, args.db_connect_timeout.into()).await?;
-
    migration_run(&secrets.database_url)
        .await
        .context("Running database migrations")?;
@@ -274,7 +250,7 @@ async fn async_main() -> anyhow::Result<()> {
    let auth = secrets
        .public_key
        .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
-    let router = make_router(service.clone(), auth, build_info)
+    let router = make_router(service.clone(), auth)
        .build()
        .map_err(|err| anyhow!(err))?;
    let router_service = utils::http::RouterService::new(router).unwrap();
--- a/control_plane/attachment_service/src/metrics.rs
+++ b/control_plane/attachment_service/src/metrics.rs
@@ -8,8 +8,10 @@
 //! The rest of the code defines label group types and deals with converting outer types to labels.
 //!
 use bytes::Bytes;
-use measured::{label::LabelValue, metric::histogram, FixedCardinalityLabel, MetricGroup};
-use metrics::NeonMetrics;
+use measured::{
+    label::{LabelValue, StaticLabelSet},
+    FixedCardinalityLabel, MetricGroup,
+};
 use once_cell::sync::Lazy;
 use std::sync::Mutex;

@@ -24,15 +26,13 @@ pub fn preinitialize_metrics() {

 pub(crate) struct StorageControllerMetrics {
    pub(crate) metrics_group: StorageControllerMetricGroup,
-    encoder: Mutex<measured::text::BufferedTextEncoder>,
+    encoder: Mutex<measured::text::TextEncoder>,
 }

 #[derive(measured::MetricGroup)]
-#[metric(new())]
 pub(crate) struct StorageControllerMetricGroup {
    /// Count of how many times we spawn a reconcile task
    pub(crate) storage_controller_reconcile_spawn: measured::Counter,
-
    /// Reconciler tasks completed, broken down by success/failure/cancelled
    pub(crate) storage_controller_reconcile_complete:
        measured::CounterVec<ReconcileCompleteLabelGroupSet>,
@@ -43,9 +43,7 @@ pub(crate) struct StorageControllerMetricGroup {
    /// HTTP request status counters for handled requests
    pub(crate) storage_controller_http_request_status:
        measured::CounterVec<HttpRequestStatusLabelGroupSet>,
-
    /// HTTP request handler latency across all status codes
-    #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
    pub(crate) storage_controller_http_request_latency:
        measured::HistogramVec<HttpRequestLatencyLabelGroupSet, 5>,

@@ -57,7 +55,6 @@ pub(crate) struct StorageControllerMetricGroup {
    /// Latency of HTTP requests to the pageserver, broken down by pageserver
    /// node id, request name and method. This include both successful and unsuccessful
    /// requests.
-    #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
    pub(crate) storage_controller_pageserver_request_latency:
        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,

@@ -69,7 +66,6 @@ pub(crate) struct StorageControllerMetricGroup {
    /// Latency of pass-through HTTP requests to the pageserver, broken down by pageserver
    /// node id, request name and method. This include both successful and unsuccessful
    /// requests.
-    #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
    pub(crate) storage_controller_passthrough_request_latency:
        measured::HistogramVec<PageserverRequestLabelGroupSet, 5>,

@@ -78,34 +74,76 @@ pub(crate) struct StorageControllerMetricGroup {
        measured::CounterVec<DatabaseQueryErrorLabelGroupSet>,

    /// Latency of database queries, broken down by operation.
-    #[metric(metadata = histogram::Thresholds::exponential_buckets(0.1, 2.0))]
    pub(crate) storage_controller_database_query_latency:
        measured::HistogramVec<DatabaseQueryLatencyLabelGroupSet, 5>,
 }

 impl StorageControllerMetrics {
-    pub(crate) fn encode(&self, neon_metrics: &NeonMetrics) -> Bytes {
+    pub(crate) fn encode(&self) -> Bytes {
        let mut encoder = self.encoder.lock().unwrap();
-        neon_metrics
-            .collect_group_into(&mut *encoder)
-            .unwrap_or_else(|infallible| match infallible {});
-        self.metrics_group
-            .collect_group_into(&mut *encoder)
-            .unwrap_or_else(|infallible| match infallible {});
+        self.metrics_group.collect_into(&mut *encoder);
        encoder.finish()
    }
 }

 impl Default for StorageControllerMetrics {
    fn default() -> Self {
-        let mut metrics_group = StorageControllerMetricGroup::new();
-        metrics_group
-            .storage_controller_reconcile_complete
-            .init_all_dense();
-
        Self {
-            metrics_group,
-            encoder: Mutex::new(measured::text::BufferedTextEncoder::new()),
+            metrics_group: StorageControllerMetricGroup::new(),
+            encoder: Mutex::new(measured::text::TextEncoder::new()),
+        }
+    }
+}
+
+impl StorageControllerMetricGroup {
+    pub(crate) fn new() -> Self {
+        Self {
+            storage_controller_reconcile_spawn: measured::Counter::new(),
+            storage_controller_reconcile_complete: measured::CounterVec::new(
+                ReconcileCompleteLabelGroupSet {
+                    status: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_schedule_optimization: measured::Counter::new(),
+            storage_controller_http_request_status: measured::CounterVec::new(
+                HttpRequestStatusLabelGroupSet {
+                    path: lasso::ThreadedRodeo::new(),
+                    method: StaticLabelSet::new(),
+                    status: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_http_request_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+            storage_controller_pageserver_request_error: measured::CounterVec::new(
+                PageserverRequestLabelGroupSet {
+                    pageserver_id: lasso::ThreadedRodeo::new(),
+                    path: lasso::ThreadedRodeo::new(),
+                    method: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_pageserver_request_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+            storage_controller_passthrough_request_error: measured::CounterVec::new(
+                PageserverRequestLabelGroupSet {
+                    pageserver_id: lasso::ThreadedRodeo::new(),
+                    path: lasso::ThreadedRodeo::new(),
+                    method: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_passthrough_request_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
+            storage_controller_database_query_error: measured::CounterVec::new(
+                DatabaseQueryErrorLabelGroupSet {
+                    operation: StaticLabelSet::new(),
+                    error_type: StaticLabelSet::new(),
+                },
+            ),
+            storage_controller_database_query_latency: measured::HistogramVec::new(
+                measured::metric::histogram::Thresholds::exponential_buckets(0.1, 2.0),
+            ),
        }
    }
 }
@@ -119,7 +157,7 @@ pub(crate) struct ReconcileCompleteLabelGroup {
 #[derive(measured::LabelGroup)]
 #[label(set = HttpRequestStatusLabelGroupSet)]
 pub(crate) struct HttpRequestStatusLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
    pub(crate) path: &'a str,
    pub(crate) method: Method,
    pub(crate) status: StatusCode,
@@ -128,21 +166,40 @@ pub(crate) struct HttpRequestStatusLabelGroup<'a> {
 #[derive(measured::LabelGroup)]
 #[label(set = HttpRequestLatencyLabelGroupSet)]
 pub(crate) struct HttpRequestLatencyLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
    pub(crate) path: &'a str,
    pub(crate) method: Method,
 }

+/// Builds the set with a new `ThreadedRodeo` for `path` and a new `StaticLabelSet` for `method`.
+impl Default for HttpRequestLatencyLabelGroupSet {
+    fn default() -> Self {
+        let path = lasso::ThreadedRodeo::new();
+        let method = StaticLabelSet::new();
+        Self { path, method }
+    }
+}
+
 #[derive(measured::LabelGroup, Clone)]
 #[label(set = PageserverRequestLabelGroupSet)]
 pub(crate) struct PageserverRequestLabelGroup<'a> {
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
    pub(crate) pageserver_id: &'a str,
-    #[label(dynamic_with = lasso::ThreadedRodeo, default)]
+    #[label(dynamic_with = lasso::ThreadedRodeo)]
    pub(crate) path: &'a str,
    pub(crate) method: Method,
 }

+/// Builds the set with new `ThreadedRodeo`s for the dynamic labels and a new `StaticLabelSet`.
+impl Default for PageserverRequestLabelGroupSet {
+    fn default() -> Self {
+        let pageserver_id = lasso::ThreadedRodeo::new();
+        let path = lasso::ThreadedRodeo::new();
+        let method = StaticLabelSet::new();
+        Self { pageserver_id, path, method }
+    }
+}
+
 #[derive(measured::LabelGroup)]
 #[label(set = DatabaseQueryErrorLabelGroupSet)]
 pub(crate) struct DatabaseQueryErrorLabelGroup {
@@ -156,7 +213,7 @@ pub(crate) struct DatabaseQueryLatencyLabelGroup {
    pub(crate) operation: DatabaseOperation,
 }

-#[derive(FixedCardinalityLabel, Clone, Copy)]
+#[derive(FixedCardinalityLabel)]
 pub(crate) enum ReconcileOutcome {
    #[label(rename = "ok")]
    Success,
@@ -164,7 +221,7 @@ pub(crate) enum ReconcileOutcome {
    Cancel,
 }

-#[derive(FixedCardinalityLabel, Copy, Clone)]
+#[derive(FixedCardinalityLabel, Clone)]
 pub(crate) enum Method {
    Get,
    Put,
@@ -189,12 +246,11 @@ impl From<hyper::Method> for Method {
    }
 }

-#[derive(Clone, Copy)]
 pub(crate) struct StatusCode(pub(crate) hyper::http::StatusCode);

 impl LabelValue for StatusCode {
    fn visit<V: measured::label::LabelVisitor>(&self, v: V) -> V::Output {
-        v.write_int(self.0.as_u16() as i64)
+        v.write_int(self.0.as_u16() as u64)
    }
 }

@@ -212,7 +268,7 @@ impl FixedCardinalityLabel for StatusCode {
    }
 }

-#[derive(FixedCardinalityLabel, Clone, Copy)]
+#[derive(FixedCardinalityLabel)]
 pub(crate) enum DatabaseErrorLabel {
    Query,
    Connection,
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -1,14 +1,13 @@
 use std::{str::FromStr, time::Duration};

+use hyper::StatusCode;
 use pageserver_api::{
    controller_api::{
-        NodeAvailability, NodeDescribeResponse, NodeRegisterRequest, NodeSchedulingPolicy,
-        TenantLocateResponseShard,
+        NodeAvailability, NodeRegisterRequest, NodeSchedulingPolicy, TenantLocateResponseShard,
    },
    shard::TenantShardId,
 };
 use pageserver_client::mgmt_api;
-use reqwest::StatusCode;
 use serde::Serialize;
 use tokio_util::sync::CancellationToken;
 use utils::{backoff, id::NodeId};
@@ -257,19 +256,6 @@ impl Node {
        )
        .await
    }
-
-    /// Generate the simplified API-friendly description of a node's state
-    pub(crate) fn describe(&self) -> NodeDescribeResponse {
-        NodeDescribeResponse {
-            id: self.id,
-            availability: self.availability.into(),
-            scheduling: self.scheduling,
-            listen_http_addr: self.listen_http_addr.clone(),
-            listen_http_port: self.listen_http_port,
-            listen_pg_addr: self.listen_pg_addr.clone(),
-            listen_pg_port: self.listen_pg_port,
-        }
-    }
 }

 impl std::fmt::Display for Node {
--- a/control_plane/attachment_service/src/pageserver_client.rs
+++ b/control_plane/attachment_service/src/pageserver_client.rs
@@ -1,14 +1,13 @@
 use pageserver_api::{
    models::{
        LocationConfig, LocationConfigListResponse, PageserverUtilization, SecondaryProgress,
-        TenantScanRemoteStorageResponse, TenantShardSplitRequest, TenantShardSplitResponse,
-        TimelineCreateRequest, TimelineInfo,
+        TenantShardSplitRequest, TenantShardSplitResponse, TimelineCreateRequest, TimelineInfo,
    },
    shard::TenantShardId,
 };
 use pageserver_client::mgmt_api::{Client, Result};
 use reqwest::StatusCode;
-use utils::id::{NodeId, TenantId, TimelineId};
+use utils::id::{NodeId, TimelineId};

 /// Thin wrapper around [`pageserver_client::mgmt_api::Client`]. It allows the storage
 /// controller to collect metrics in a non-intrusive manner.
@@ -89,18 +88,6 @@ impl PageserverClient {
        )
    }

-    pub(crate) async fn tenant_scan_remote_storage(
-        &self,
-        tenant_id: TenantId,
-    ) -> Result<TenantScanRemoteStorageResponse> {
-        measured_request!(
-            "tenant_scan_remote_storage",
-            crate::metrics::Method::Get,
-            &self.node_id_label,
-            self.inner.tenant_scan_remote_storage(tenant_id).await
-        )
-    }
-
    pub(crate) async fn tenant_secondary_download(
        &self,
        tenant_id: TenantShardId,
@@ -114,27 +101,6 @@ impl PageserverClient {
        )
    }

-    pub(crate) async fn tenant_secondary_status(
-        &self,
-        tenant_shard_id: TenantShardId,
-    ) -> Result<SecondaryProgress> {
-        measured_request!(
-            "tenant_secondary_status",
-            crate::metrics::Method::Get,
-            &self.node_id_label,
-            self.inner.tenant_secondary_status(tenant_shard_id).await
-        )
-    }
-
-    pub(crate) async fn tenant_heatmap_upload(&self, tenant_id: TenantShardId) -> Result<()> {
-        measured_request!(
-            "tenant_heatmap_upload",
-            crate::metrics::Method::Post,
-            &self.node_id_label,
-            self.inner.tenant_heatmap_upload(tenant_id).await
-        )
-    }
-
    pub(crate) async fn location_config(
        &self,
        tenant_shard_id: TenantShardId,
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -2,7 +2,6 @@ pub(crate) mod split_state;
 use std::collections::HashMap;
 use std::str::FromStr;
 use std::time::Duration;
-use std::time::Instant;

 use self::split_state::SplitState;
 use camino::Utf8Path;
@@ -80,7 +79,7 @@ pub(crate) enum DatabaseError {
    Logical(String),
 }

-#[derive(measured::FixedCardinalityLabel, Copy, Clone)]
+#[derive(measured::FixedCardinalityLabel, Clone)]
 pub(crate) enum DatabaseOperation {
    InsertNode,
    UpdateNode,
@@ -145,31 +144,6 @@ impl Persistence {
        }
    }

-    /// A helper for use during startup, where we would like to tolerate concurrent restarts of the
-    /// database and the storage controller, therefore the database might not be available right away
-    pub async fn await_connection(
-        database_url: &str,
-        timeout: Duration,
-    ) -> Result<(), diesel::ConnectionError> {
-        let started_at = Instant::now();
-        loop {
-            match PgConnection::establish(database_url) {
-                Ok(_) => {
-                    tracing::info!("Connected to database.");
-                    return Ok(());
-                }
-                Err(e) => {
-                    if started_at.elapsed() > timeout {
-                        return Err(e);
-                    } else {
-                        tracing::info!("Database not yet available, waiting... ({e})");
-                        tokio::time::sleep(Duration::from_millis(100)).await;
-                    }
-                }
-            }
-        }
-    }
-
    /// Wraps `with_conn` in order to collect latency and error metrics
    async fn with_measured_conn<F, R>(&self, op: DatabaseOperation, func: F) -> DatabaseResult<R>
    where
@@ -179,7 +153,9 @@ impl Persistence {
        let latency = &METRICS_REGISTRY
            .metrics_group
            .storage_controller_database_query_latency;
-        let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup { operation: op });
+        let _timer = latency.start_timer(DatabaseQueryLatencyLabelGroup {
+            operation: op.clone(),
+        });

        let res = self.with_conn(func).await;

@@ -720,7 +696,7 @@ impl Persistence {
    }
 }

-/// Parts of [`crate::tenant_shard::TenantShard`] that are stored durably
+/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
 #[derive(Queryable, Selectable, Insertable, Serialize, Deserialize, Clone, Eq, PartialEq)]
 #[diesel(table_name = crate::schema::tenant_shards)]
 pub(crate) struct TenantShardPersistence {
--- a/control_plane/attachment_service/src/persistence/split_state.rs
+++ b/control_plane/attachment_service/src/persistence/split_state.rs
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -1,12 +1,12 @@
 use crate::pageserver_client::PageserverClient;
 use crate::persistence::Persistence;
 use crate::service;
+use hyper::StatusCode;
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
 use pageserver_api::shard::{ShardIdentity, TenantShardId};
 use pageserver_client::mgmt_api;
-use reqwest::StatusCode;
 use std::collections::HashMap;
 use std::sync::Arc;
 use std::time::{Duration, Instant};
@@ -18,14 +18,14 @@ use utils::sync::gate::GateGuard;

 use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
-use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation};
+use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};

 const DEFAULT_HEATMAP_PERIOD: &str = "60s";

 /// Object with the lifetime of the background reconcile task that is created
 /// for tenants which have a difference between their intent and observed states.
 pub(super) struct Reconciler {
-    /// See [`crate::tenant_shard::TenantShard`] for the meanings of these fields: they are a snapshot
+    /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
    /// of a tenant's state from when we spawned a reconcile task.
    pub(super) tenant_shard_id: TenantShardId,
    pub(crate) shard: ShardIdentity,
@@ -48,15 +48,11 @@ pub(super) struct Reconciler {

    /// To avoid stalling if the cloud control plane is unavailable, we may proceed
    /// past failures in [`ComputeHook::notify`], but we _must_ remember that we failed
-    /// so that we can set [`crate::tenant_shard::TenantShard::pending_compute_notification`] to ensure a later retry.
+    /// so that we can set [`crate::tenant_state::TenantState::pending_compute_notification`] to ensure a later retry.
    pub(crate) compute_notify_failure: bool,

-    /// Reconciler is responsible for keeping alive semaphore units that limit concurrency on how many
-    /// we will spawn.
-    pub(crate) _resource_units: ReconcileUnits,
-
    /// A means to abort background reconciliation: it is essential to
-    /// call this when something changes in the original TenantShard that
+    /// call this when something changes in the original TenantState that
    /// will make this reconciliation impossible or unnecessary, for
    /// example when a pageserver node goes offline, or the PlacementPolicy for
    /// the tenant is changed.
@@ -70,20 +66,7 @@ pub(super) struct Reconciler {
    pub(crate) persistence: Arc<Persistence>,
 }

-/// RAII resource units granted to a Reconciler, which it should keep alive until it finishes doing I/O
-pub(crate) struct ReconcileUnits {
-    _sem_units: tokio::sync::OwnedSemaphorePermit,
-}
-
-impl ReconcileUnits {
-    pub(crate) fn new(sem_units: tokio::sync::OwnedSemaphorePermit) -> Self {
-        Self {
-            _sem_units: sem_units,
-        }
-    }
-}
-
-/// This is a snapshot of [`crate::tenant_shard::IntentState`], but it does not do any
+/// This is a snapshot of [`crate::tenant_state::IntentState`], but it does not do any
 /// reference counting for Scheduler.  The IntentState is what the scheduler works with,
 /// and the TargetState is just the instruction for a particular Reconciler run.
 #[derive(Debug)]
@@ -767,10 +750,7 @@ impl Reconciler {
                // It is up to the caller whether they want to drop out on this error, but they don't have to:
                // in general we should avoid letting unavailability of the cloud control plane stop us from
                // making progress.
-                if !matches!(e, NotifyError::ShuttingDown) {
-                    tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
-                }
-
+                tracing::warn!("Failed to notify compute of attached pageserver {node}: {e}");
                // Set this flag so that in our ReconcileResult we will set the flag on the shard that it
                // needs to retry at some point.
                self.compute_notify_failure = true;
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -1,4 +1,4 @@
-use crate::{node::Node, tenant_shard::TenantShard};
+use crate::{node::Node, tenant_state::TenantState};
 use pageserver_api::controller_api::UtilizationScore;
 use serde::Serialize;
 use std::collections::HashMap;
@@ -27,7 +27,7 @@ pub enum MaySchedule {

 #[derive(Serialize)]
 struct SchedulerNode {
-    /// How many shards are currently scheduled on this node, via their [`crate::tenant_shard::IntentState`].
+    /// How many shards are currently scheduled on this node, via their [`crate::tenant_state::IntentState`].
    shard_count: usize,

    /// Whether this node is currently elegible to have new shards scheduled (this is derived
@@ -84,21 +84,7 @@ impl std::ops::Add for AffinityScore {
    }
 }

-/// Hint for whether this is a sincere attempt to schedule, or a speculative
-/// check for where we _would_ schedule (done during optimization)
-#[derive(Debug)]
-pub(crate) enum ScheduleMode {
-    Normal,
-    Speculative,
-}
-
-impl Default for ScheduleMode {
-    fn default() -> Self {
-        Self::Normal
-    }
-}
-
-// For carrying state between multiple calls to [`TenantShard::schedule`], e.g. when calling
+// For carrying state between multiple calls to [`TenantState::schedule`], e.g. when calling
 // it for many shards in the same tenant.
 #[derive(Debug, Default)]
 pub(crate) struct ScheduleContext {
@@ -107,8 +93,6 @@ pub(crate) struct ScheduleContext {

    /// Specifically how many _attached_ locations are on each node
    pub(crate) attached_nodes: HashMap<NodeId, usize>,
-
-    pub(crate) mode: ScheduleMode,
 }

 impl ScheduleContext {
@@ -163,7 +147,7 @@ impl Scheduler {
    pub(crate) fn consistency_check<'a>(
        &self,
        nodes: impl Iterator<Item = &'a Node>,
-        shards: impl Iterator<Item = &'a TenantShard>,
+        shards: impl Iterator<Item = &'a TenantState>,
    ) -> anyhow::Result<()> {
        let mut expect_nodes: HashMap<NodeId, SchedulerNode> = HashMap::new();
        for node in nodes {
@@ -345,34 +329,27 @@ impl Scheduler {
        scores.sort_by_key(|i| (i.1, i.2, i.0));

        if scores.is_empty() {
-            // After applying constraints, no pageservers were left.
-            if !matches!(context.mode, ScheduleMode::Speculative) {
-                // If this was not a speculative attempt, log details to understand why we couldn't
-                // schedule: this may help an engineer understand if some nodes are marked offline
-                // in a way that's preventing progress.
+            // After applying constraints, no pageservers were left.  We log some detail about
+            // the state of nodes to help understand why this happened.  This is not logged as an error because
+            // it is legitimately possible for enough nodes to be Offline to prevent scheduling a shard.
+            tracing::info!("Scheduling failure, while excluding {hard_exclude:?}, node states:");
+            for (node_id, node) in &self.nodes {
                tracing::info!(
-                    "Scheduling failure, while excluding {hard_exclude:?}, node states:"
+                    "Node {node_id}: may_schedule={} shards={}",
+                    node.may_schedule != MaySchedule::No,
+                    node.shard_count
                );
-                for (node_id, node) in &self.nodes {
-                    tracing::info!(
-                        "Node {node_id}: may_schedule={} shards={}",
-                        node.may_schedule != MaySchedule::No,
-                        node.shard_count
-                    );
-                }
            }
+
            return Err(ScheduleError::ImpossibleConstraint);
        }

        // Lowest score wins
        let node_id = scores.first().unwrap().0;
-
-        if !matches!(context.mode, ScheduleMode::Speculative) {
-            tracing::info!(
+        tracing::info!(
            "scheduler selected node {node_id} (elegible nodes {:?}, hard exclude: {hard_exclude:?}, soft exclude: {context:?})",
            scores.iter().map(|i| i.0 .0).collect::<Vec<_>>()
        );
-        }

        // Note that we do not update shard count here to reflect the scheduling: that
        // is IntentState's job when the scheduled location is used.
@@ -421,7 +398,7 @@ pub(crate) mod test_utils {
 mod tests {
    use super::*;

-    use crate::tenant_shard::IntentState;
+    use crate::tenant_state::IntentState;
    #[test]
    fn scheduler_basic() -> anyhow::Result<()> {
        let nodes = test_utils::make_test_nodes(2);
--- a/control_plane/attachment_service/src/schema.rs
+++ b/control_plane/attachment_service/src/schema.rs
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -7,7 +7,6 @@ use std::{
 use crate::{
    metrics::{self, ReconcileCompleteLabelGroup, ReconcileOutcome},
    persistence::TenantShardPersistence,
-    reconciler::ReconcileUnits,
    scheduler::{AffinityScore, MaySchedule, ScheduleContext},
 };
 use pageserver_api::controller_api::{PlacementPolicy, ShardSchedulingPolicy};
@@ -23,7 +22,7 @@ use utils::{
    generation::Generation,
    id::NodeId,
    seqwait::{SeqWait, SeqWaitError},
-    sync::gate::GateGuard,
+    sync::gate::Gate,
 };

 use crate::{
@@ -38,18 +37,12 @@ use crate::{
 };

 /// Serialization helper
-fn read_last_error<S, T>(v: &std::sync::Mutex<Option<T>>, serializer: S) -> Result<S::Ok, S::Error>
+fn read_mutex_content<S, T>(v: &std::sync::Mutex<T>, serializer: S) -> Result<S::Ok, S::Error>
 where
    S: serde::ser::Serializer,
-    T: std::fmt::Display,
+    T: Clone + std::fmt::Display,
 {
-    serializer.collect_str(
-        &v.lock()
-            .unwrap()
-            .as_ref()
-            .map(|e| format!("{e}"))
-            .unwrap_or("".to_string()),
-    )
+    serializer.collect_str(&v.lock().unwrap())
 }

 /// In-memory state for a particular tenant shard.
@@ -57,7 +50,7 @@ where
 /// This struct implement Serialize for debugging purposes, but is _not_ persisted
 /// itself: see [`crate::persistence`] for the subset of tenant shard state that is persisted.
 #[derive(Serialize)]
-pub(crate) struct TenantShard {
+pub(crate) struct TenantState {
    pub(crate) tenant_shard_id: TenantShardId,

    pub(crate) shard: ShardIdentity,
@@ -102,10 +95,6 @@ pub(crate) struct TenantShard {
    /// reconciliation, and timeline creation.
    pub(crate) splitting: SplitState,

-    /// If a tenant was enqueued for later reconcile due to hitting concurrency limit, this flag
-    /// is set. This flag is cleared when the tenant is popped off the delay queue.
-    pub(crate) delayed_reconcile: bool,
-
    /// Optionally wait for reconciliation to complete up to a particular
    /// sequence number.
    #[serde(skip)]
@@ -117,19 +106,15 @@ pub(crate) struct TenantShard {
    #[serde(skip)]
    pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,

-    /// The most recent error from a reconcile on this tenant.  This is a nested Arc
-    /// because:
-    ///  - ReconcileWaiters need to Arc-clone the overall object to read it later
-    ///  - ReconcileWaitError needs to use an `Arc<ReconcileError>` because we can construct
-    ///    many waiters for one shard, and the underlying error types are not Clone.
+    /// The most recent error from a reconcile on this tenant
    /// TODO: generalize to an array of recent events
    /// TOOD: use a ArcSwap instead of mutex for faster reads?
-    #[serde(serialize_with = "read_last_error")]
-    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
+    #[serde(serialize_with = "read_mutex_content")]
+    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,

    /// If we have a pending compute notification that for some reason we weren't able to send,
-    /// set this to true. If this is set, calls to [`Self::get_reconcile_needed`] will return Yes
-    /// and trigger a Reconciler run.  This is the mechanism by which compute notifications are included in the scope
+    /// set this to true. If this is set, calls to [`Self::maybe_reconcile`] will run a task to retry
+    /// sending it.  This is the mechanism by which compute notifications are included in the scope
    /// of state that we publish externally in an eventually consistent way.
    pub(crate) pending_compute_notification: bool,

@@ -303,18 +288,18 @@ pub(crate) struct ReconcilerWaiter {

    seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
    error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
-    error: std::sync::Arc<std::sync::Mutex<Option<Arc<ReconcileError>>>>,
+    error: std::sync::Arc<std::sync::Mutex<String>>,
    seq: Sequence,
 }

 #[derive(thiserror::Error, Debug)]
-pub(crate) enum ReconcileWaitError {
+pub enum ReconcileWaitError {
    #[error("Timeout waiting for shard {0}")]
    Timeout(TenantShardId),
    #[error("shutting down")]
    Shutdown,
    #[error("Reconcile error on shard {0}: {1}")]
-    Failed(TenantShardId, Arc<ReconcileError>),
+    Failed(TenantShardId, String),
 }

 #[derive(Eq, PartialEq, Debug)]
@@ -325,28 +310,18 @@ pub(crate) struct ReplaceSecondary {

 #[derive(Eq, PartialEq, Debug)]
 pub(crate) struct MigrateAttachment {
-    pub(crate) old_attached_node_id: NodeId,
-    pub(crate) new_attached_node_id: NodeId,
+    old_attached_node_id: NodeId,
+    new_attached_node_id: NodeId,
 }

 #[derive(Eq, PartialEq, Debug)]
-pub(crate) enum ScheduleOptimizationAction {
+pub(crate) enum ScheduleOptimization {
    // Replace one of our secondary locations with a different node
    ReplaceSecondary(ReplaceSecondary),
    // Migrate attachment to an existing secondary location
    MigrateAttachment(MigrateAttachment),
 }

-#[derive(Eq, PartialEq, Debug)]
-pub(crate) struct ScheduleOptimization {
-    // What was the reconcile sequence when we generated this optimization?  The optimization
-    // should only be applied if the shard's sequence is still at this value, in case other changes
-    // happened between planning the optimization and applying it.
-    sequence: Sequence,
-
-    pub(crate) action: ScheduleOptimizationAction,
-}
-
 impl ReconcilerWaiter {
    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
        tokio::select! {
@@ -362,8 +337,7 @@ impl ReconcilerWaiter {
                    SeqWaitError::Timeout => unreachable!()
                })?;

-                return Err(ReconcileWaitError::Failed(self.tenant_shard_id,
-                    self.error.lock().unwrap().clone().expect("If error_seq_wait was advanced error was set").clone()))
+                return Err(ReconcileWaitError::Failed(self.tenant_shard_id, self.error.lock().unwrap().clone()))
            }
        }

@@ -379,19 +353,8 @@ pub(crate) struct ReconcilerHandle {
    cancel: CancellationToken,
 }

-pub(crate) enum ReconcileNeeded {
-    /// shard either doesn't need reconciliation, or is forbidden from spawning a reconciler
-    /// in its current state (e.g. shard split in progress, or ShardSchedulingPolicy forbids it)
-    No,
-    /// shard has a reconciler running, and its intent hasn't changed since that one was
-    /// spawned: wait for the existing reconciler rather than spawning a new one.
-    WaitExisting(ReconcilerWaiter),
-    /// shard needs reconciliation: call into [`TenantShard::spawn_reconciler`]
-    Yes,
-}
-
 /// When a reconcile task completes, it sends this result object
-/// to be applied to the primary TenantShard.
+/// to be applied to the primary TenantState.
 pub(crate) struct ReconcileResult {
    pub(crate) sequence: Sequence,
    /// On errors, `observed` should be treated as an incompleted description
@@ -404,7 +367,7 @@ pub(crate) struct ReconcileResult {
    pub(crate) generation: Option<Generation>,
    pub(crate) observed: ObservedState,

-    /// Set [`TenantShard::pending_compute_notification`] from this flag
+    /// Set [`TenantState::pending_compute_notification`] from this flag
    pub(crate) pending_compute_notification: bool,
 }

@@ -416,7 +379,7 @@ impl ObservedState {
    }
 }

-impl TenantShard {
+impl TenantState {
    pub(crate) fn new(
        tenant_shard_id: TenantShardId,
        shard: ShardIdentity,
@@ -433,7 +396,6 @@ impl TenantShard {
            reconciler: None,
            splitting: SplitState::Idle,
            sequence: Sequence(1),
-            delayed_reconcile: false,
            waiter: Arc::new(SeqWait::new(Sequence(0))),
            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
            last_error: Arc::default(),
@@ -685,13 +647,10 @@ impl TenantShard {
                        "Identified optimization: migrate attachment {attached}->{preferred_node} (secondaries {:?})",
                        self.intent.get_secondary()
                    );
-                    return Some(ScheduleOptimization {
-                        sequence: self.sequence,
-                        action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
-                            old_attached_node_id: attached,
-                            new_attached_node_id: *preferred_node,
-                        }),
-                    });
+                    return Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
+                        old_attached_node_id: attached,
+                        new_attached_node_id: *preferred_node,
+                    }));
                }
            } else {
                tracing::debug!(
@@ -749,37 +708,28 @@ impl TenantShard {
                    "Identified optimization: replace secondary {secondary}->{candidate_node} (current secondaries {:?})",
                    self.intent.get_secondary()
                );
-                return Some(ScheduleOptimization {
-                    sequence: self.sequence,
-                    action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
-                        old_node_id: *secondary,
-                        new_node_id: candidate_node,
-                    }),
-                });
+                return Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
+                    old_node_id: *secondary,
+                    new_node_id: candidate_node,
+                }));
            }
        }

        None
    }

-    /// Return true if the optimization was really applied: it will not be applied if the optimization's
-    /// sequence is behind this tenant shard's
    pub(crate) fn apply_optimization(
        &mut self,
        scheduler: &mut Scheduler,
        optimization: ScheduleOptimization,
-    ) -> bool {
-        if optimization.sequence != self.sequence {
-            return false;
-        }
-
+    ) {
        metrics::METRICS_REGISTRY
            .metrics_group
            .storage_controller_schedule_optimization
            .inc();

-        match optimization.action {
-            ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
+        match optimization {
+            ScheduleOptimization::MigrateAttachment(MigrateAttachment {
                old_attached_node_id,
                new_attached_node_id,
            }) => {
@@ -787,7 +737,7 @@ impl TenantShard {
                self.intent
                    .promote_attached(scheduler, new_attached_node_id);
            }
-            ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
+            ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
                old_node_id,
                new_node_id,
            }) => {
@@ -795,8 +745,6 @@ impl TenantShard {
                self.intent.push_secondary(scheduler, new_node_id);
            }
        }
-
-        true
    }

    /// Query whether the tenant's observed state for attached node matches its intent state, and if so,
@@ -883,10 +831,16 @@ impl TenantShard {

    #[allow(clippy::too_many_arguments)]
    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
-    pub(crate) fn get_reconcile_needed(
+    pub(crate) fn maybe_reconcile(
        &mut self,
+        result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
        pageservers: &Arc<HashMap<NodeId, Node>>,
-    ) -> ReconcileNeeded {
+        compute_hook: &Arc<ComputeHook>,
+        service_config: &service::Config,
+        persistence: &Arc<Persistence>,
+        gate: &Gate,
+        cancel: &CancellationToken,
+    ) -> Option<ReconcilerWaiter> {
        // If there are any ambiguous observed states, and the nodes they refer to are available,
        // we should reconcile to clean them up.
        let mut dirty_observed = false;
@@ -908,8 +862,8 @@ impl TenantShard {
            active_nodes_dirty || dirty_observed || self.pending_compute_notification;

        if !do_reconcile {
-            tracing::debug!("Not dirty, no reconciliation needed.");
-            return ReconcileNeeded::No;
+            tracing::info!("Not dirty, no reconciliation needed.");
+            return None;
        }

        // If we are currently splitting, then never start a reconciler task: the splitting logic
@@ -917,7 +871,7 @@ impl TenantShard {
        // up top, so that we only log this message if we would otherwise have done a reconciliation.
        if !matches!(self.splitting, SplitState::Idle) {
            tracing::info!("Refusing to reconcile, splitting in progress");
-            return ReconcileNeeded::No;
+            return None;
        }

        // Reconcile already in flight for the current sequence?
@@ -927,7 +881,7 @@ impl TenantShard {
                    "Reconciliation already in progress for sequence {:?}",
                    self.sequence,
                );
-                return ReconcileNeeded::WaitExisting(ReconcilerWaiter {
+                return Some(ReconcilerWaiter {
                    tenant_shard_id: self.tenant_shard_id,
                    seq_wait: self.waiter.clone(),
                    error_seq_wait: self.error_waiter.clone(),
@@ -946,67 +900,10 @@ impl TenantShard {
                // We only reach this point if there is work to do and we're going to skip
                // doing it: warn it obvious why this tenant isn't doing what it ought to.
                tracing::warn!("Skipping reconcile for policy {:?}", self.scheduling_policy);
-                return ReconcileNeeded::No;
+                return None;
            }
        }

-        ReconcileNeeded::Yes
-    }
-
-    /// Ensure the sequence number is set to a value where waiting for this value will make us wait
-    /// for the next reconcile: i.e. it is ahead of all completed or running reconcilers.
-    ///
-    /// Constructing a ReconcilerWaiter with the resulting sequence number gives the property
-    /// that the waiter will not complete until some future Reconciler is constructed and run.
-    fn ensure_sequence_ahead(&mut self) {
-        // Find the highest sequence for which a Reconciler has previously run or is currently
-        // running
-        let max_seen = std::cmp::max(
-            self.reconciler
-                .as_ref()
-                .map(|r| r.sequence)
-                .unwrap_or(Sequence(0)),
-            std::cmp::max(self.waiter.load(), self.error_waiter.load()),
-        );
-
-        if self.sequence <= max_seen {
-            self.sequence = max_seen.next();
-        }
-    }
-
-    /// Create a waiter that will wait for some future Reconciler that hasn't been spawned yet.
-    ///
-    /// This is appropriate when you can't spawn a reconciler (e.g. due to resource limits), but
-    /// you would like to wait on the next reconciler that gets spawned in the background.
-    pub(crate) fn future_reconcile_waiter(&mut self) -> ReconcilerWaiter {
-        self.ensure_sequence_ahead();
-
-        ReconcilerWaiter {
-            tenant_shard_id: self.tenant_shard_id,
-            seq_wait: self.waiter.clone(),
-            error_seq_wait: self.error_waiter.clone(),
-            error: self.last_error.clone(),
-            seq: self.sequence,
-        }
-    }
-
-    #[allow(clippy::too_many_arguments)]
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
-    pub(crate) fn spawn_reconciler(
-        &mut self,
-        result_tx: &tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
-        pageservers: &Arc<HashMap<NodeId, Node>>,
-        compute_hook: &Arc<ComputeHook>,
-        service_config: &service::Config,
-        persistence: &Arc<Persistence>,
-        units: ReconcileUnits,
-        gate_guard: GateGuard,
-        cancel: &CancellationToken,
-    ) -> Option<ReconcilerWaiter> {
-        // Reconcile in flight for a stale sequence?  Our sequence's task will wait for it before
-        // doing our sequence's work.
-        let old_handle = self.reconciler.take();
-
        // Build list of nodes from which the reconciler should detach
        let mut detach = Vec::new();
        for node_id in self.observed.locations.keys() {
@@ -1022,9 +919,18 @@ impl TenantShard {
            }
        }

+        // Reconcile in flight for a stale sequence?  Our sequence's task will wait for it before
+        // doing our sequence's work.
+        let old_handle = self.reconciler.take();
+
+        let Ok(gate_guard) = gate.enter() else {
+            // Shutting down, don't start a reconciler
+            return None;
+        };
+
        // Advance the sequence before spawning a reconciler, so that sequence waiters
        // can distinguish between before+after the reconcile completes.
-        self.ensure_sequence_ahead();
+        self.sequence = self.sequence.next();

        let reconciler_cancel = cancel.child_token();
        let reconciler_intent = TargetState::from_intent(pageservers, &self.intent);
@@ -1039,7 +945,6 @@ impl TenantShard {
            compute_hook: compute_hook.clone(),
            service_config: service_config.clone(),
            _gate_guard: gate_guard,
-            _resource_units: units,
            cancel: reconciler_cancel.clone(),
            persistence: persistence.clone(),
            compute_notify_failure: false,
@@ -1106,18 +1011,16 @@ impl TenantShard {
                        status: outcome_label,
                    });

-                // Constructing result implicitly drops Reconciler, freeing any ReconcileUnits before the Service might
-                // try and schedule more work in response to our result.
-                let result = ReconcileResult {
-                    sequence: reconcile_seq,
-                    result,
-                    tenant_shard_id: reconciler.tenant_shard_id,
-                    generation: reconciler.generation,
-                    observed: reconciler.observed,
-                    pending_compute_notification: reconciler.compute_notify_failure,
-                };
-
-                result_tx.send(result).ok();
+                result_tx
+                    .send(ReconcileResult {
+                        sequence: reconcile_seq,
+                        result,
+                        tenant_shard_id: reconciler.tenant_shard_id,
+                        generation: reconciler.generation,
+                        observed: reconciler.observed,
+                        pending_compute_notification: reconciler.compute_notify_failure,
+                    })
+                    .ok();
            }
            .instrument(reconciler_span),
        );
@@ -1186,13 +1089,6 @@ impl TenantShard {
        &self.scheduling_policy
    }

-    pub(crate) fn set_last_error(&mut self, sequence: Sequence, error: ReconcileError) {
-        // Ordering: always set last_error before advancing sequence, so that sequence
-        // waiters are guaranteed to see a Some value when they see an error.
-        *(self.last_error.lock().unwrap()) = Some(Arc::new(error));
-        self.error_waiter.advance(sequence);
-    }
-
    pub(crate) fn from_persistent(
        tsp: TenantShardPersistence,
        intent: IntentState,
@@ -1215,7 +1111,6 @@ impl TenantShard {
            error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
            last_error: Arc::default(),
            pending_compute_notification: false,
-            delayed_reconcile: false,
            scheduling_policy: serde_json::from_str(&tsp.scheduling_policy).unwrap(),
        })
    }
@@ -1248,7 +1143,7 @@ pub(crate) mod tests {

    use super::*;

-    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantShard {
+    fn make_test_tenant_shard(policy: PlacementPolicy) -> TenantState {
        let tenant_id = TenantId::generate();
        let shard_number = ShardNumber(0);
        let shard_count = ShardCount::new(1);
@@ -1258,7 +1153,7 @@ pub(crate) mod tests {
            shard_number,
            shard_count,
        };
-        TenantShard::new(
+        TenantState::new(
            tenant_shard_id,
            ShardIdentity::new(
                shard_number,
@@ -1270,7 +1165,7 @@ pub(crate) mod tests {
        )
    }

-    fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantShard> {
+    fn make_test_tenant(policy: PlacementPolicy, shard_count: ShardCount) -> Vec<TenantState> {
        let tenant_id = TenantId::generate();

        (0..shard_count.count())
@@ -1282,7 +1177,7 @@ pub(crate) mod tests {
                    shard_number,
                    shard_count,
                };
-                TenantShard::new(
+                TenantState::new(
                    tenant_shard_id,
                    ShardIdentity::new(
                        shard_number,
@@ -1307,24 +1202,24 @@ pub(crate) mod tests {
        let mut scheduler = Scheduler::new(nodes.values());
        let mut context = ScheduleContext::default();

-        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
-        tenant_shard
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        tenant_state
            .schedule(&mut scheduler, &mut context)
            .expect("we have enough nodes, scheduling should work");

        // Expect to initially be schedule on to different nodes
-        assert_eq!(tenant_shard.intent.secondary.len(), 1);
-        assert!(tenant_shard.intent.attached.is_some());
+        assert_eq!(tenant_state.intent.secondary.len(), 1);
+        assert!(tenant_state.intent.attached.is_some());

-        let attached_node_id = tenant_shard.intent.attached.unwrap();
-        let secondary_node_id = *tenant_shard.intent.secondary.iter().last().unwrap();
+        let attached_node_id = tenant_state.intent.attached.unwrap();
+        let secondary_node_id = *tenant_state.intent.secondary.iter().last().unwrap();
        assert_ne!(attached_node_id, secondary_node_id);

        // Notifying the attached node is offline should demote it to a secondary
-        let changed = tenant_shard.intent.demote_attached(attached_node_id);
+        let changed = tenant_state.intent.demote_attached(attached_node_id);
        assert!(changed);
-        assert!(tenant_shard.intent.attached.is_none());
-        assert_eq!(tenant_shard.intent.secondary.len(), 2);
+        assert!(tenant_state.intent.attached.is_none());
+        assert_eq!(tenant_state.intent.secondary.len(), 2);

        // Update the scheduler state to indicate the node is offline
        nodes
@@ -1334,18 +1229,18 @@ pub(crate) mod tests {
        scheduler.node_upsert(nodes.get(&attached_node_id).unwrap());

        // Scheduling the node should promote the still-available secondary node to attached
-        tenant_shard
+        tenant_state
            .schedule(&mut scheduler, &mut context)
            .expect("active nodes are available");
-        assert_eq!(tenant_shard.intent.attached.unwrap(), secondary_node_id);
+        assert_eq!(tenant_state.intent.attached.unwrap(), secondary_node_id);

        // The original attached node should have been retained as a secondary
        assert_eq!(
-            *tenant_shard.intent.secondary.iter().last().unwrap(),
+            *tenant_state.intent.secondary.iter().last().unwrap(),
            attached_node_id
        );

-        tenant_shard.intent.clear(&mut scheduler);
+        tenant_state.intent.clear(&mut scheduler);

        Ok(())
    }
@@ -1355,48 +1250,48 @@ pub(crate) mod tests {
        let nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());

-        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));

-        tenant_shard.observed.locations.insert(
+        tenant_state.observed.locations.insert(
            NodeId(3),
            ObservedStateLocation {
                conf: Some(LocationConfig {
                    mode: LocationConfigMode::AttachedMulti,
                    generation: Some(2),
                    secondary_conf: None,
-                    shard_number: tenant_shard.shard.number.0,
-                    shard_count: tenant_shard.shard.count.literal(),
-                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
+                    shard_number: tenant_state.shard.number.0,
+                    shard_count: tenant_state.shard.count.literal(),
+                    shard_stripe_size: tenant_state.shard.stripe_size.0,
                    tenant_conf: TenantConfig::default(),
                }),
            },
        );

-        tenant_shard.observed.locations.insert(
+        tenant_state.observed.locations.insert(
            NodeId(2),
            ObservedStateLocation {
                conf: Some(LocationConfig {
                    mode: LocationConfigMode::AttachedStale,
                    generation: Some(1),
                    secondary_conf: None,
-                    shard_number: tenant_shard.shard.number.0,
-                    shard_count: tenant_shard.shard.count.literal(),
-                    shard_stripe_size: tenant_shard.shard.stripe_size.0,
+                    shard_number: tenant_state.shard.number.0,
+                    shard_count: tenant_state.shard.count.literal(),
+                    shard_stripe_size: tenant_state.shard.stripe_size.0,
                    tenant_conf: TenantConfig::default(),
                }),
            },
        );

-        tenant_shard.intent_from_observed(&mut scheduler);
+        tenant_state.intent_from_observed(&mut scheduler);

        // The highest generationed attached location gets used as attached
-        assert_eq!(tenant_shard.intent.attached, Some(NodeId(3)));
+        assert_eq!(tenant_state.intent.attached, Some(NodeId(3)));
        // Other locations get used as secondary
-        assert_eq!(tenant_shard.intent.secondary, vec![NodeId(2)]);
+        assert_eq!(tenant_state.intent.secondary, vec![NodeId(2)]);

-        scheduler.consistency_check(nodes.values(), [&tenant_shard].into_iter())?;
+        scheduler.consistency_check(nodes.values(), [&tenant_state].into_iter())?;

-        tenant_shard.intent.clear(&mut scheduler);
+        tenant_state.intent.clear(&mut scheduler);
        Ok(())
    }

@@ -1405,23 +1300,23 @@ pub(crate) mod tests {
        let nodes = make_test_nodes(3);
        let mut scheduler = Scheduler::new(nodes.values());

-        let mut tenant_shard = make_test_tenant_shard(PlacementPolicy::Attached(1));
+        let mut tenant_state = make_test_tenant_shard(PlacementPolicy::Attached(1));

        // In pause mode, schedule() shouldn't do anything
-        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Pause;
-        assert!(tenant_shard
+        tenant_state.scheduling_policy = ShardSchedulingPolicy::Pause;
+        assert!(tenant_state
            .schedule(&mut scheduler, &mut ScheduleContext::default())
            .is_ok());
-        assert!(tenant_shard.intent.all_pageservers().is_empty());
+        assert!(tenant_state.intent.all_pageservers().is_empty());

        // In active mode, schedule() works
-        tenant_shard.scheduling_policy = ShardSchedulingPolicy::Active;
-        assert!(tenant_shard
+        tenant_state.scheduling_policy = ShardSchedulingPolicy::Active;
+        assert!(tenant_state
            .schedule(&mut scheduler, &mut ScheduleContext::default())
            .is_ok());
-        assert!(!tenant_shard.intent.all_pageservers().is_empty());
+        assert!(!tenant_state.intent.all_pageservers().is_empty());

-        tenant_shard.intent.clear(&mut scheduler);
+        tenant_state.intent.clear(&mut scheduler);
        Ok(())
    }

@@ -1452,13 +1347,10 @@ pub(crate) mod tests {
        // would be no other shards from the same tenant, and request to do so.
        assert_eq!(
            optimization_a,
-            Some(ScheduleOptimization {
-                sequence: shard_a.sequence,
-                action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
-                    old_attached_node_id: NodeId(1),
-                    new_attached_node_id: NodeId(2)
-                })
-            })
+            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
+                old_attached_node_id: NodeId(1),
+                new_attached_node_id: NodeId(2)
+            }))
        );

        // Note that these optimizing two shards in the same tenant with the same ScheduleContext is
@@ -1469,13 +1361,10 @@ pub(crate) mod tests {
        let optimization_b = shard_b.optimize_attachment(&nodes, &schedule_context);
        assert_eq!(
            optimization_b,
-            Some(ScheduleOptimization {
-                sequence: shard_b.sequence,
-                action: ScheduleOptimizationAction::MigrateAttachment(MigrateAttachment {
-                    old_attached_node_id: NodeId(1),
-                    new_attached_node_id: NodeId(3)
-                })
-            })
+            Some(ScheduleOptimization::MigrateAttachment(MigrateAttachment {
+                old_attached_node_id: NodeId(1),
+                new_attached_node_id: NodeId(3)
+            }))
        );

        // Applying these optimizations should result in the end state proposed
@@ -1519,13 +1408,10 @@ pub(crate) mod tests {
        // same tenant should generate an optimization to move one away
        assert_eq!(
            optimization_a,
-            Some(ScheduleOptimization {
-                sequence: shard_a.sequence,
-                action: ScheduleOptimizationAction::ReplaceSecondary(ReplaceSecondary {
-                    old_node_id: NodeId(3),
-                    new_node_id: NodeId(4)
-                })
-            })
+            Some(ScheduleOptimization::ReplaceSecondary(ReplaceSecondary {
+                old_node_id: NodeId(3),
+                new_node_id: NodeId(4)
+            }))
        );

        shard_a.apply_optimization(&mut scheduler, optimization_a.unwrap());
@@ -1543,7 +1429,7 @@ pub(crate) mod tests {
    fn optimize_til_idle(
        nodes: &HashMap<NodeId, Node>,
        scheduler: &mut Scheduler,
-        shards: &mut [TenantShard],
+        shards: &mut [TenantState],
    ) {
        let mut loop_n = 0;
        loop {
--- a/control_plane/src/background_process.rs
+++ b/control_plane/src/background_process.rs
@@ -86,10 +86,7 @@ where
        .stdout(process_log_file)
        .stderr(same_file_for_stderr)
        .args(args);
-
-    let filled_cmd = fill_env_vars_prefixed_neon(fill_remote_storage_secrets_vars(
-        fill_rust_env_vars(background_command),
-    ));
+    let filled_cmd = fill_remote_storage_secrets_vars(fill_rust_env_vars(background_command));
    filled_cmd.envs(envs);

    let pid_file_to_check = match &initial_pid_file {
@@ -271,15 +268,6 @@ fn fill_remote_storage_secrets_vars(mut cmd: &mut Command) -> &mut Command {
    cmd
 }

-fn fill_env_vars_prefixed_neon(mut cmd: &mut Command) -> &mut Command {
-    for (var, val) in std::env::vars() {
-        if var.starts_with("NEON_PAGESERVER_") {
-            cmd = cmd.env(var, val);
-        }
-    }
-    cmd
-}
-
 /// Add a `pre_exec` to the cmd that, inbetween fork() and exec(),
 /// 1. Claims a pidfile with a fcntl lock on it and
 /// 2. Sets up the pidfile's file descriptor so that it (and the lock)
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -14,15 +14,17 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::storage_controller::StorageController;
 use control_plane::{broker, local_env};
-use pageserver_api::config::{
-    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
-    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
+use pageserver_api::controller_api::{
+    NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy, PlacementPolicy,
 };
-use pageserver_api::controller_api::PlacementPolicy;
 use pageserver_api::models::{
    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
 };
 use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
+use pageserver_api::{
+    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
+    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
+};
 use postgres_backend::AuthType;
 use postgres_connection::parse_host_port;
 use safekeeper_api::{
@@ -133,7 +135,7 @@ fn main() -> Result<()> {
        let subcommand_result = match sub_name {
            "tenant" => rt.block_on(handle_tenant(sub_args, &mut env)),
            "timeline" => rt.block_on(handle_timeline(sub_args, &mut env)),
-            "start" => rt.block_on(handle_start_all(&env)),
+            "start" => rt.block_on(handle_start_all(sub_args, &env)),
            "stop" => rt.block_on(handle_stop_all(sub_args, &env)),
            "pageserver" => rt.block_on(handle_pageserver(sub_args, &env)),
            "storage_controller" => rt.block_on(handle_storage_controller(sub_args, &env)),
@@ -358,13 +360,6 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
        default_conf(*num_pageservers)
    };

-    let pageserver_config: toml_edit::Document =
-        if let Some(path) = init_match.get_one::<PathBuf>("pageserver-config") {
-            std::fs::read_to_string(path)?.parse()?
-        } else {
-            toml_edit::Document::new()
-        };
-
    let pg_version = init_match
        .get_one::<u32>("pg-version")
        .copied()
@@ -382,7 +377,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {
    // Initialize pageserver, create initial tenant and timeline.
    for ps_conf in &env.pageservers {
        PageServerNode::from_env(&env, ps_conf)
-            .initialize(&pageserver_config)
+            .initialize(&pageserver_config_overrides(init_match))
            .unwrap_or_else(|e| {
                eprintln!("pageserver init failed: {e:?}");
                exit(1);
@@ -404,6 +399,15 @@ fn get_default_pageserver(env: &local_env::LocalEnv) -> PageServerNode {
    PageServerNode::from_env(env, ps_conf)
 }

+fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
+    init_match
+        .get_many::<String>("pageserver-config-override")
+        .into_iter()
+        .flatten()
+        .map(String::as_str)
+        .collect()
+}
+
 async fn handle_tenant(
    tenant_match: &ArgMatches,
    env: &mut local_env::LocalEnv,
@@ -415,54 +419,6 @@ async fn handle_tenant(
                println!("{} {:?}", t.id, t.state);
            }
        }
-        Some(("import", import_match)) => {
-            let tenant_id = parse_tenant_id(import_match)?.unwrap_or_else(TenantId::generate);
-
-            let storage_controller = StorageController::from_env(env);
-            let create_response = storage_controller.tenant_import(tenant_id).await?;
-
-            let shard_zero = create_response
-                .shards
-                .first()
-                .expect("Import response omitted shards");
-
-            let attached_pageserver_id = shard_zero.node_id;
-            let pageserver =
-                PageServerNode::from_env(env, env.get_pageserver_conf(attached_pageserver_id)?);
-
-            println!(
-                "Imported tenant {tenant_id}, attached to pageserver {attached_pageserver_id}"
-            );
-
-            let timelines = pageserver
-                .http_client
-                .list_timelines(shard_zero.shard_id)
-                .await?;
-
-            // Pick a 'main' timeline that has no ancestors, the rest will get arbitrary names
-            let main_timeline = timelines
-                .iter()
-                .find(|t| t.ancestor_timeline_id.is_none())
-                .expect("No timelines found")
-                .timeline_id;
-
-            let mut branch_i = 0;
-            for timeline in timelines.iter() {
-                let branch_name = if timeline.timeline_id == main_timeline {
-                    "main".to_string()
-                } else {
-                    branch_i += 1;
-                    format!("branch_{branch_i}")
-                };
-
-                println!(
-                    "Importing timeline {tenant_id}/{} as branch {branch_name}",
-                    timeline.timeline_id
-                );
-
-                env.register_branch_mapping(branch_name, tenant_id, timeline.timeline_id)?;
-            }
-        }
        Some(("create", create_match)) => {
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
@@ -1066,7 +1022,10 @@ fn get_pageserver(env: &local_env::LocalEnv, args: &ArgMatches) -> Result<PageSe
 async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<()> {
    match sub_match.subcommand() {
        Some(("start", subcommand_args)) => {
-            if let Err(e) = get_pageserver(env, subcommand_args)?.start().await {
+            if let Err(e) = get_pageserver(env, subcommand_args)?
+                .start(&pageserver_config_overrides(subcommand_args))
+                .await
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
@@ -1092,12 +1051,30 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
                exit(1);
            }

-            if let Err(e) = pageserver.start().await {
+            if let Err(e) = pageserver
+                .start(&pageserver_config_overrides(subcommand_args))
+                .await
+            {
                eprintln!("pageserver start failed: {e}");
                exit(1);
            }
        }

+        Some(("set-state", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            let scheduling = subcommand_args.get_one("scheduling");
+            let availability = subcommand_args.get_one("availability");
+
+            let storage_controller = StorageController::from_env(env);
+            storage_controller
+                .node_configure(NodeConfigureRequest {
+                    node_id: pageserver.conf.id,
+                    scheduling: scheduling.cloned(),
+                    availability: availability.cloned(),
+                })
+                .await?;
+        }
+
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status().await {
                Ok(_) => println!("Page server is up and running"),
@@ -1219,7 +1196,7 @@ async fn handle_safekeeper(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
    Ok(())
 }

-async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {
+async fn handle_start_all(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::Result<()> {
    // Endpoints are not started automatically

    broker::start_broker_process(env).await?;
@@ -1236,7 +1213,10 @@ async fn handle_start_all(env: &local_env::LocalEnv) -> anyhow::Result<()> {

    for ps_conf in &env.pageservers {
        let pageserver = PageServerNode::from_env(env, ps_conf);
-        if let Err(e) = pageserver.start().await {
+        if let Err(e) = pageserver
+            .start(&pageserver_config_overrides(sub_match))
+            .await
+        {
            eprintln!("pageserver {} start failed: {:#}", ps_conf.id, e);
            try_stop_all(env, true).await;
            exit(1);
@@ -1268,7 +1248,9 @@ async fn try_stop_all(env: &local_env::LocalEnv, immediate: bool) {
     match ComputeControlPlane::load(env.clone()) {
         Ok(cplane) => {
+            // Same stop mode for every endpoint; the literal must not contain stray whitespace.
+            let stop_mode = if immediate { "immediate" } else { "fast" };
             for (_k, node) in cplane.endpoints {
-                if let Err(e) = node.stop(if immediate { "immediate" } else { "fast" }, false) {
+                if let Err(e) = node.stop(stop_mode, false) {
                     eprintln!("postgres stop failed: {e:#}");
                 }
             }
@@ -1377,6 +1357,13 @@ fn cli() -> Command {
        .required(false)
        .value_name("stop-mode");

+    let pageserver_config_args = Arg::new("pageserver-config-override")
+        .long("pageserver-config-override")
+        .num_args(1)
+        .action(ArgAction::Append)
+        .help("Additional pageserver's configuration options or overrides, refer to pageserver's 'config-override' CLI parameter docs for more")
+        .required(false);
+
    let remote_ext_config_args = Arg::new("remote-ext-config")
        .long("remote-ext-config")
        .num_args(1)
@@ -1432,21 +1419,14 @@ fn cli() -> Command {
        .subcommand(
            Command::new("init")
                .about("Initialize a new Neon repository, preparing configs for services to start with")
+                .arg(pageserver_config_args.clone())
                .arg(num_pageservers_arg.clone())
                .arg(
                    Arg::new("config")
                        .long("config")
                        .required(false)
                        .value_parser(value_parser!(PathBuf))
-                        .value_name("config")
-                )
-                .arg(
-                    Arg::new("pageserver-config")
-                        .long("pageserver-config")
-                        .required(false)
-                        .value_parser(value_parser!(PathBuf))
-                        .value_name("pageserver-config")
-                        .help("Merge the provided pageserver config into the one generated by neon_local."),
+                        .value_name("config"),
                )
                .arg(pg_version_arg.clone())
                .arg(force_arg)
@@ -1454,7 +1434,6 @@ fn cli() -> Command {
        .subcommand(
            Command::new("timeline")
            .about("Manage timelines")
-            .arg_required_else_help(true)
            .subcommand(Command::new("list")
                .about("List all timelines, available to this pageserver")
                .arg(tenant_id_arg.clone()))
@@ -1517,8 +1496,6 @@ fn cli() -> Command {
            .subcommand(Command::new("config")
                .arg(tenant_id_arg.clone())
                .arg(Arg::new("config").short('c').num_args(1).action(ArgAction::Append).required(false)))
-            .subcommand(Command::new("import").arg(tenant_id_arg.clone().required(true))
-                .about("Import a tenant that is present in remote storage, and create branches for its timelines"))
        )
        .subcommand(
            Command::new("pageserver")
@@ -1528,6 +1505,7 @@ fn cli() -> Command {
                .subcommand(Command::new("status"))
                .subcommand(Command::new("start")
                    .about("Start local pageserver")
+                    .arg(pageserver_config_args.clone())
                )
                .subcommand(Command::new("stop")
                    .about("Stop local pageserver")
@@ -1535,14 +1513,21 @@ fn cli() -> Command {
                 )
                 .subcommand(Command::new("restart")
                     .about("Restart local pageserver")
+                    .arg(pageserver_config_args.clone())
+                )
+                .subcommand(Command::new("set-state")
+                    .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
+                    .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
+                    .about("Set scheduling or availability state of pageserver node")
+                    .arg(pageserver_config_args.clone())
                 )
         )
         .subcommand(
             Command::new("storage_controller")
                 .arg_required_else_help(true)
                 .about("Manage storage_controller")
-                .subcommand(Command::new("start").about("Start storage controller"))
+                .subcommand(Command::new("start").about("Start storage controller").arg(pageserver_config_args.clone()))
                 .subcommand(Command::new("stop").about("Stop storage controller")
                             .arg(stop_mode_arg.clone()))
         )
         .subcommand(
@@ -1647,6 +1632,7 @@ fn cli() -> Command {
        .subcommand(
            Command::new("start")
                .about("Start page server and safekeepers")
+                .arg(pageserver_config_args)
        )
        .subcommand(
            Command::new("stop")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -554,7 +554,6 @@ impl Endpoint {
            format_version: 1.0,
            operation_uuid: None,
            features: self.features.clone(),
-            swap_size_bytes: None,
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -17,7 +17,6 @@ use std::net::Ipv4Addr;
 use std::net::SocketAddr;
 use std::path::{Path, PathBuf};
 use std::process::{Command, Stdio};
-use std::time::Duration;
 use utils::{
    auth::{encode_from_key_file, Claims},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -67,10 +66,6 @@ pub struct LocalEnv {

    pub broker: NeonBroker,

-    // Configuration for the storage controller (1 per neon_local environment)
-    #[serde(default)]
-    pub storage_controller: NeonStorageControllerConf,
-
    /// This Vec must always contain at least one pageserver
    pub pageservers: Vec<PageServerConf>,

@@ -103,29 +98,6 @@ pub struct NeonBroker {
    pub listen_addr: SocketAddr,
 }

-/// Broker config for cluster internal communication.
-#[derive(Serialize, Deserialize, PartialEq, Eq, Clone, Debug)]
-#[serde(default)]
-pub struct NeonStorageControllerConf {
-    /// Heartbeat timeout before marking a node offline
-    #[serde(with = "humantime_serde")]
-    pub max_unavailable: Duration,
-}
-
-impl NeonStorageControllerConf {
-    // Use a shorter pageserver unavailability interval than the default to speed up tests.
-    const DEFAULT_MAX_UNAVAILABLE_INTERVAL: std::time::Duration =
-        std::time::Duration::from_secs(10);
-}
-
-impl Default for NeonStorageControllerConf {
-    fn default() -> Self {
-        Self {
-            max_unavailable: Self::DEFAULT_MAX_UNAVAILABLE_INTERVAL,
-        }
-    }
-}
-
 // Dummy Default impl to satisfy Deserialize derive.
 impl Default for NeonBroker {
    fn default() -> Self {
@@ -157,8 +129,6 @@ pub struct PageServerConf {

    pub(crate) virtual_file_io_engine: Option<String>,
    pub(crate) get_vectored_impl: Option<String>,
-    pub(crate) get_impl: Option<String>,
-    pub(crate) validate_vectored_get: Option<bool>,
 }

 impl Default for PageServerConf {
@@ -171,8 +141,6 @@ impl Default for PageServerConf {
            http_auth_type: AuthType::Trust,
            virtual_file_io_engine: None,
            get_vectored_impl: None,
-            get_impl: None,
-            validate_vectored_get: None,
        }
    }
 }
@@ -188,7 +156,6 @@ pub struct SafekeeperConf {
    pub remote_storage: Option<String>,
    pub backup_threads: Option<u32>,
    pub auth_enabled: bool,
-    pub listen_addr: Option<String>,
 }

 impl Default for SafekeeperConf {
@@ -202,7 +169,6 @@ impl Default for SafekeeperConf {
            remote_storage: None,
            backup_threads: None,
            auth_enabled: false,
-            listen_addr: None,
        }
    }
 }
@@ -382,10 +348,7 @@ impl LocalEnv {

        // Find neon binaries.
        if env.neon_distrib_dir == Path::new("") {
-            env::current_exe()?
-                .parent()
-                .unwrap()
-                .clone_into(&mut env.neon_distrib_dir);
+            env.neon_distrib_dir = env::current_exe()?.parent().unwrap().to_owned();
        }

        if env.pageservers.is_empty() {
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -4,6 +4,7 @@
 //!
 //!   .neon/
 //!
+use std::borrow::Cow;
 use std::collections::HashMap;

 use std::io;
@@ -76,7 +77,7 @@ impl PageServerNode {
    /// Merge overrides provided by the user on the command line with our default overides derived from neon_local configuration.
    ///
    /// These all end up on the command line of the `pageserver` binary.
-    fn neon_local_overrides(&self, cli_overrides: &toml_edit::Document) -> Vec<String> {
+    fn neon_local_overrides(&self, cli_overrides: &[&str]) -> Vec<String> {
        // FIXME: the paths should be shell-escaped to handle paths with spaces, quotas etc.
        let pg_distrib_dir_param = format!(
            "pg_distrib_dir='{}'",
@@ -91,8 +92,6 @@ impl PageServerNode {
            http_auth_type,
            virtual_file_io_engine,
            get_vectored_impl,
-            get_impl,
-            validate_vectored_get,
        } = &self.conf;

        let id = format!("id={}", id);
@@ -112,16 +111,6 @@ impl PageServerNode {
        } else {
            String::new()
        };
-        let get_impl = if let Some(get_impl) = get_impl {
-            format!("get_impl='{get_impl}'")
-        } else {
-            String::new()
-        };
-        let validate_vectored_get = if let Some(validate_vectored_get) = validate_vectored_get {
-            format!("validate_vectored_get={validate_vectored_get}")
-        } else {
-            String::new()
-        };

        let broker_endpoint_param = format!("broker_endpoint='{}'", self.env.broker.client_url());

@@ -135,8 +124,6 @@ impl PageServerNode {
            broker_endpoint_param,
            virtual_file_io_engine,
            get_vectored_impl,
-            get_impl,
-            validate_vectored_get,
        ];

        if let Some(control_plane_api) = &self.env.control_plane_api {
@@ -156,7 +143,10 @@ impl PageServerNode {
            }
        }

-        if !cli_overrides.contains_key("remote_storage") {
+        if !cli_overrides
+            .iter()
+            .any(|c| c.starts_with("remote_storage"))
+        {
            overrides.push(format!(
                "remote_storage={{local_path='../{PAGESERVER_REMOTE_STORAGE_DIR}'}}"
            ));
@@ -169,13 +159,13 @@ impl PageServerNode {
        }

        // Apply the user-provided overrides
-        overrides.push(cli_overrides.to_string());
+        overrides.extend(cli_overrides.iter().map(|&c| c.to_owned()));

        overrides
    }

    /// Initializes a pageserver node by creating its config with the overrides provided.
-    pub fn initialize(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> {
+    pub fn initialize(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        // First, run `pageserver --init` and wait for it to write a config into FS and exit.
        self.pageserver_init(config_overrides)
            .with_context(|| format!("Failed to run init for pageserver node {}", self.conf.id))
@@ -193,11 +183,11 @@ impl PageServerNode {
            .expect("non-Unicode path")
    }

-    pub async fn start(&self) -> anyhow::Result<()> {
-        self.start_node().await
+    pub async fn start(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
+        self.start_node(config_overrides, false).await
    }

-    fn pageserver_init(&self, config_overrides: &toml_edit::Document) -> anyhow::Result<()> {
+    fn pageserver_init(&self, config_overrides: &[&str]) -> anyhow::Result<()> {
        let datadir = self.repo_path();
        let node_id = self.conf.id;
        println!(
@@ -215,18 +205,11 @@ impl PageServerNode {
        let datadir_path_str = datadir.to_str().with_context(|| {
            format!("Cannot start pageserver node {node_id} in path that has no string representation: {datadir:?}")
        })?;
+        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
+        args.push(Cow::Borrowed("--init"));

-        // `pageserver --init` merges the `--config-override`s into a built-in default config,
-        // then writes out the merged product to `pageserver.toml`.
-        // TODO: just write the full `pageserver.toml` and get rid of `--config-override`.
-        let mut args = vec!["--init", "--workdir", datadir_path_str];
-        let overrides = self.neon_local_overrides(config_overrides);
-        for piece in &overrides {
-            args.push("--config-override");
-            args.push(piece);
-        }
        let init_output = Command::new(self.env.pageserver_bin())
-            .args(args)
+            .args(args.iter().map(Cow::as_ref))
            .envs(self.pageserver_env_variables()?)
            .output()
            .with_context(|| format!("Failed to run pageserver init for node {node_id}"))?;
@@ -251,13 +234,12 @@ impl PageServerNode {
        // situation: the metadata is written by some other script.
        std::fs::write(
            metadata_path,
-            serde_json::to_vec(&pageserver_api::config::NodeMetadata {
-                postgres_host: "localhost".to_string(),
-                postgres_port: self.pg_connection_config.port(),
-                http_host: "localhost".to_string(),
-                http_port,
-                other: HashMap::new(),
-            })
+            serde_json::to_vec(&serde_json::json!({
+                "host": "localhost",
+                "port": self.pg_connection_config.port(),
+                "http_host": "localhost",
+                "http_port": http_port,
+            }))
            .unwrap(),
        )
        .expect("Failed to write metadata file");
@@ -265,7 +247,11 @@ impl PageServerNode {
        Ok(())
    }

-    async fn start_node(&self) -> anyhow::Result<()> {
+    async fn start_node(
+        &self,
+        config_overrides: &[&str],
+        update_config: bool,
+    ) -> anyhow::Result<()> {
        // TODO: using a thread here because start_process() is not async but we need to call check_status()
        let datadir = self.repo_path();
        print!(
@@ -282,12 +268,15 @@ impl PageServerNode {
                self.conf.id, datadir,
            )
        })?;
-        let args = vec!["-D", datadir_path_str];
+        let mut args = self.pageserver_basic_args(config_overrides, datadir_path_str);
+        if update_config {
+            args.push(Cow::Borrowed("--update-config"));
+        }
        background_process::start_process(
            "pageserver",
            &datadir,
            &self.env.pageserver_bin(),
-            args,
+            args.iter().map(Cow::as_ref),
            self.pageserver_env_variables()?,
            background_process::InitialPidFile::Expect(self.pid_file()),
            || async {
@@ -304,6 +293,22 @@ impl PageServerNode {
        Ok(())
    }

+    fn pageserver_basic_args<'a>(
+        &self,
+        config_overrides: &'a [&'a str],
+        datadir_path_str: &'a str,
+    ) -> Vec<Cow<'a, str>> {
+        let mut args = vec![Cow::Borrowed("-D"), Cow::Borrowed(datadir_path_str)];
+
+        let overrides = self.neon_local_overrides(config_overrides);
+        for config_override in overrides {
+            args.push(Cow::Borrowed("-c"));
+            args.push(Cow::Owned(config_override));
+        }
+
+        args
+    }
+
    fn pageserver_env_variables(&self) -> anyhow::Result<Vec<(String, String)>> {
        // FIXME: why is this tied to pageserver's auth type? Whether or not the safekeeper
        // needs a token, and how to generate that token, seems independent to whether
@@ -429,11 +434,6 @@ impl PageServerNode {
                .map(serde_json::from_str)
                .transpose()
                .context("parse `timeline_get_throttle` from json")?,
-            switch_to_aux_file_v2: settings
-                .remove("switch_to_aux_file_v2")
-                .map(|x| x.parse::<bool>())
-                .transpose()
-                .context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -552,11 +552,6 @@ impl PageServerNode {
                    .map(serde_json::from_str)
                    .transpose()
                    .context("parse `timeline_get_throttle` from json")?,
-                switch_to_aux_file_v2: settings
-                    .remove("switch_to_aux_file_v2")
-                    .map(|x| x.parse::<bool>())
-                    .transpose()
-                    .context("Failed to parse 'switch_to_aux_file_v2' as bool")?,
            }
        };

--- a/control_plane/src/safekeeper.rs
+++ b/control_plane/src/safekeeper.rs
@@ -70,31 +70,24 @@ pub struct SafekeeperNode {
    pub pg_connection_config: PgConnectionConfig,
    pub env: LocalEnv,
    pub http_client: reqwest::Client,
-    pub listen_addr: String,
    pub http_base_url: String,
 }

 impl SafekeeperNode {
    pub fn from_env(env: &LocalEnv, conf: &SafekeeperConf) -> SafekeeperNode {
-        let listen_addr = if let Some(ref listen_addr) = conf.listen_addr {
-            listen_addr.clone()
-        } else {
-            "127.0.0.1".to_string()
-        };
        SafekeeperNode {
            id: conf.id,
            conf: conf.clone(),
-            pg_connection_config: Self::safekeeper_connection_config(&listen_addr, conf.pg_port),
+            pg_connection_config: Self::safekeeper_connection_config(conf.pg_port),
            env: env.clone(),
            http_client: reqwest::Client::new(),
-            http_base_url: format!("http://{}:{}/v1", listen_addr, conf.http_port),
-            listen_addr,
+            http_base_url: format!("http://127.0.0.1:{}/v1", conf.http_port),
        }
    }

    /// Construct libpq connection string for connecting to this safekeeper.
-    fn safekeeper_connection_config(addr: &str, port: u16) -> PgConnectionConfig {
-        PgConnectionConfig::new_host_port(url::Host::parse(addr).unwrap(), port)
+    fn safekeeper_connection_config(port: u16) -> PgConnectionConfig {
+        PgConnectionConfig::new_host_port(url::Host::parse("127.0.0.1").unwrap(), port)
    }

    pub fn datadir_path_by_id(env: &LocalEnv, sk_id: NodeId) -> PathBuf {
@@ -118,8 +111,8 @@ impl SafekeeperNode {
        );
        io::stdout().flush().unwrap();

-        let listen_pg = format!("{}:{}", self.listen_addr, self.conf.pg_port);
-        let listen_http = format!("{}:{}", self.listen_addr, self.conf.http_port);
+        let listen_pg = format!("127.0.0.1:{}", self.conf.pg_port);
+        let listen_http = format!("127.0.0.1:{}", self.conf.http_port);
        let id = self.id;
        let datadir = self.datadir_path();

@@ -146,7 +139,7 @@ impl SafekeeperNode {
            availability_zone,
        ];
        if let Some(pg_tenant_only_port) = self.conf.pg_tenant_only_port {
-            let listen_pg_tenant_only = format!("{}:{}", self.listen_addr, pg_tenant_only_port);
+            let listen_pg_tenant_only = format!("127.0.0.1:{}", pg_tenant_only_port);
            args.extend(["--listen-pg-tenant-only".to_owned(), listen_pg_tenant_only]);
        }
        if !self.conf.sync {
--- a/control_plane/src/storage_controller.rs
+++ b/control_plane/src/storage_controller.rs
@@ -1,8 +1,6 @@
-use crate::{
-    background_process,
-    local_env::{LocalEnv, NeonStorageControllerConf},
-};
+use crate::{background_process, local_env::LocalEnv};
 use camino::{Utf8Path, Utf8PathBuf};
+use hyper::Method;
 use pageserver_api::{
    controller_api::{
        NodeConfigureRequest, NodeRegisterRequest, TenantCreateResponse, TenantLocateResponse,
@@ -16,7 +14,6 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api::ResponseErrorMessageExt;
 use postgres_backend::AuthType;
-use reqwest::Method;
 use serde::{de::DeserializeOwned, Deserialize, Serialize};
 use std::{fs, str::FromStr};
 use tokio::process::Command;
@@ -35,13 +32,15 @@ pub struct StorageController {
    public_key: Option<String>,
    postgres_port: u16,
    client: reqwest::Client,
-    config: NeonStorageControllerConf,
 }

 const COMMAND: &str = "storage_controller";

 const STORAGE_CONTROLLER_POSTGRES_VERSION: u32 = 16;

+// Use a shorter pageserver unavailability interval than the default to speed up tests.
+const NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL: std::time::Duration = std::time::Duration::from_secs(10);
+
 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
    pub tenant_shard_id: TenantShardId,
@@ -136,7 +135,6 @@ impl StorageController {
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
-            config: env.storage_controller.clone(),
        }
    }

@@ -274,6 +272,8 @@ impl StorageController {
        // Run migrations on every startup, in case something changed.
        let database_url = self.setup_database().await?;

+        let max_unavailable: humantime::Duration = NEON_LOCAL_MAX_UNAVAILABLE_INTERVAL.into();
+
        let mut args = vec![
            "-l",
            &self.listen,
@@ -283,7 +283,7 @@ impl StorageController {
            "--database-url",
            &database_url,
            "--max-unavailable-interval",
-            &humantime::Duration::from(self.config.max_unavailable).to_string(),
+            &max_unavailable.to_string(),
        ]
        .into_iter()
        .map(|s| s.to_string())
@@ -379,7 +379,7 @@ impl StorageController {
    /// Simple HTTP request wrapper for calling into storage controller
    async fn dispatch<RQ, RS>(
        &self,
-        method: reqwest::Method,
+        method: hyper::Method,
        path: String,
        body: Option<RQ>,
    ) -> anyhow::Result<RS>
@@ -472,16 +472,6 @@ impl StorageController {
            .await
    }

-    #[instrument(skip(self))]
-    pub async fn tenant_import(&self, tenant_id: TenantId) -> anyhow::Result<TenantCreateResponse> {
-        self.dispatch::<(), TenantCreateResponse>(
-            Method::POST,
-            format!("debug/v1/tenant/{tenant_id}/import"),
-            None,
-        )
-        .await
-    }
-
    #[instrument(skip(self))]
    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
        self.dispatch::<(), _>(
--- a/control_plane/storcon_cli/Cargo.toml
+++ b/control_plane/storcon_cli/Cargo.toml
@@ -1,23 +0,0 @@
-[package]
-name = "storcon_cli"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-
-[dependencies]
-anyhow.workspace = true
-clap.workspace = true
-comfy-table.workspace = true
-hyper.workspace = true
-pageserver_api.workspace = true
-pageserver_client.workspace = true
-reqwest.workspace = true
-serde.workspace = true
-serde_json = { workspace = true, features = ["raw_value"] }
-thiserror.workspace = true
-tokio.workspace = true
-tracing.workspace = true
-utils.workspace = true
-workspace_hack.workspace = true
-
--- a/control_plane/storcon_cli/src/main.rs
+++ b/control_plane/storcon_cli/src/main.rs
@@ -1,680 +0,0 @@
-use std::{collections::HashMap, str::FromStr, time::Duration};
-
-use clap::{Parser, Subcommand};
-use pageserver_api::{
-    controller_api::{
-        NodeAvailabilityWrapper, NodeDescribeResponse, ShardSchedulingPolicy,
-        TenantDescribeResponse, TenantPolicyRequest,
-    },
-    models::{
-        LocationConfigSecondary, ShardParameters, TenantConfig, TenantConfigRequest,
-        TenantCreateRequest, TenantShardSplitRequest, TenantShardSplitResponse,
-    },
-    shard::{ShardStripeSize, TenantShardId},
-};
-use pageserver_client::mgmt_api::{self, ResponseErrorMessageExt};
-use reqwest::{Method, StatusCode, Url};
-use serde::{de::DeserializeOwned, Serialize};
-use utils::id::{NodeId, TenantId};
-
-use pageserver_api::controller_api::{
-    NodeConfigureRequest, NodeRegisterRequest, NodeSchedulingPolicy, PlacementPolicy,
-    TenantLocateResponse, TenantShardMigrateRequest, TenantShardMigrateResponse,
-};
-
-#[derive(Subcommand, Debug)]
-enum Command {
-    /// Register a pageserver with the storage controller.  This shouldn't usually be necessary,
-    /// since pageservers auto-register when they start up
-    NodeRegister {
-        #[arg(long)]
-        node_id: NodeId,
-
-        #[arg(long)]
-        listen_pg_addr: String,
-        #[arg(long)]
-        listen_pg_port: u16,
-
-        #[arg(long)]
-        listen_http_addr: String,
-        #[arg(long)]
-        listen_http_port: u16,
-    },
-
-    /// Modify a node's configuration in the storage controller
-    NodeConfigure {
-        #[arg(long)]
-        node_id: NodeId,
-
-        /// Availability is usually auto-detected based on heartbeats.  Set 'offline' here to
-        /// manually mark a node offline
-        #[arg(long)]
-        availability: Option<NodeAvailabilityArg>,
-        /// Scheduling policy controls whether tenant shards may be scheduled onto this node.
-        #[arg(long)]
-        scheduling: Option<NodeSchedulingPolicy>,
-    },
-    /// Modify a tenant's policies in the storage controller
-    TenantPolicy {
-        #[arg(long)]
-        tenant_id: TenantId,
-        /// Placement policy controls whether a tenant is `detached`, has only a secondary location (`secondary`),
-        /// or is in the normal attached state with N secondary locations (`attached:N`)
-        #[arg(long)]
-        placement: Option<PlacementPolicyArg>,
-        /// Scheduling policy enables pausing the controller's scheduling activity involving this tenant.  `active` is normal,
-        /// `essential` disables optimization scheduling changes, `pause` disables all scheduling changes, and `stop` prevents
-        /// all reconciliation activity including for scheduling changes already made.  `pause` and `stop` can make a tenant
-        /// unavailable, and are only for use in emergencies.
-        #[arg(long)]
-        scheduling: Option<ShardSchedulingPolicyArg>,
-    },
-    /// List nodes known to the storage controller
-    Nodes {},
-    /// List tenants known to the storage controller
-    Tenants {},
-    /// Create a new tenant in the storage controller, and by extension on pageservers.
-    TenantCreate {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// Delete a tenant in the storage controller, and by extension on pageservers.
-    TenantDelete {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// Split an existing tenant into a higher number of shards than its current shard count.
-    TenantShardSplit {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        shard_count: u8,
-        /// Optional, in 8kiB pages.  e.g. set 2048 for 16MB stripes.
-        #[arg(long)]
-        stripe_size: Option<u32>,
-    },
-    /// Migrate the attached location for a tenant shard to a specific pageserver.
-    TenantShardMigrate {
-        #[arg(long)]
-        tenant_shard_id: TenantShardId,
-        #[arg(long)]
-        node: NodeId,
-    },
-    /// Modify the pageserver tenant configuration of a tenant: this is the configuration structure
-    /// that is passed through to pageservers, and does not affect storage controller behavior.
-    TenantConfig {
-        #[arg(long)]
-        tenant_id: TenantId,
-        #[arg(long)]
-        config: String,
-    },
-    /// Attempt to balance the locations for a tenant across pageservers.  This is a client-side
-    /// alternative to the storage controller's scheduling optimization behavior.
-    TenantScatter {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// Print details about a particular tenant, including all its shards' states.
-    TenantDescribe {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-    /// For a tenant which hasn't been onboarded to the storage controller yet, add it in secondary
-    /// mode so that it can warm up content on a pageserver.
-    TenantWarmup {
-        #[arg(long)]
-        tenant_id: TenantId,
-    },
-}
-
-#[derive(Parser)]
-#[command(
-    author,
-    version,
-    about,
-    long_about = "CLI for Storage Controller Support/Debug"
-)]
-#[command(arg_required_else_help(true))]
-struct Cli {
-    #[arg(long)]
-    /// URL to storage controller.  e.g. http://127.0.0.1:1234 when using `neon_local`
-    api: Url,
-
-    #[arg(long)]
-    /// JWT token for authenticating with storage controller.  Depending on the API used, this
-    /// should have either `pageserverapi` or `admin` scopes: for convenience, you should mint
-    /// a token with both scopes to use with this tool.
-    jwt: Option<String>,
-
-    #[command(subcommand)]
-    command: Command,
-}
-
-#[derive(Debug, Clone)]
-struct PlacementPolicyArg(PlacementPolicy);
-
-impl FromStr for PlacementPolicyArg {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "detached" => Ok(Self(PlacementPolicy::Detached)),
-            "secondary" => Ok(Self(PlacementPolicy::Secondary)),
-            _ if s.starts_with("attached:") => {
-                let mut splitter = s.split(':');
-                let _prefix = splitter.next().unwrap();
-                match splitter.next().and_then(|s| s.parse::<usize>().ok()) {
-                    Some(n) => Ok(Self(PlacementPolicy::Attached(n))),
-                    None => Err(anyhow::anyhow!(
-                        "Invalid format '{s}', a valid example is 'attached:1'"
-                    )),
-                }
-            }
-            _ => Err(anyhow::anyhow!(
-                "Unknown placement policy '{s}', try detached,secondary,attached:<n>"
-            )),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-struct ShardSchedulingPolicyArg(ShardSchedulingPolicy);
-
-impl FromStr for ShardSchedulingPolicyArg {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self(ShardSchedulingPolicy::Active)),
-            "essential" => Ok(Self(ShardSchedulingPolicy::Essential)),
-            "pause" => Ok(Self(ShardSchedulingPolicy::Pause)),
-            "stop" => Ok(Self(ShardSchedulingPolicy::Stop)),
-            _ => Err(anyhow::anyhow!(
-                "Unknown scheduling policy '{s}', try active,essential,pause,stop"
-            )),
-        }
-    }
-}
-
-#[derive(Debug, Clone)]
-struct NodeAvailabilityArg(NodeAvailabilityWrapper);
-
-impl FromStr for NodeAvailabilityArg {
-    type Err = anyhow::Error;
-
-    fn from_str(s: &str) -> Result<Self, Self::Err> {
-        match s {
-            "active" => Ok(Self(NodeAvailabilityWrapper::Active)),
-            "offline" => Ok(Self(NodeAvailabilityWrapper::Offline)),
-            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
-        }
-    }
-}
-
-struct Client {
-    base_url: Url,
-    jwt_token: Option<String>,
-    client: reqwest::Client,
-}
-
-impl Client {
-    fn new(base_url: Url, jwt_token: Option<String>) -> Self {
-        Self {
-            base_url,
-            jwt_token,
-            client: reqwest::ClientBuilder::new()
-                .build()
-                .expect("Failed to construct http client"),
-        }
-    }
-
-    /// Simple HTTP request wrapper for calling into storage controller
-    async fn dispatch<RQ, RS>(
-        &self,
-        method: Method,
-        path: String,
-        body: Option<RQ>,
-    ) -> mgmt_api::Result<RS>
-    where
-        RQ: Serialize + Sized,
-        RS: DeserializeOwned + Sized,
-    {
-        // The configured URL has the /upcall path prefix for pageservers to use: we will strip that out
-        // for general purpose API access.
-        let url = Url::from_str(&format!(
-            "http://{}:{}/{path}",
-            self.base_url.host_str().unwrap(),
-            self.base_url.port().unwrap()
-        ))
-        .unwrap();
-
-        let mut builder = self.client.request(method, url);
-        if let Some(body) = body {
-            builder = builder.json(&body)
-        }
-        if let Some(jwt_token) = &self.jwt_token {
-            builder = builder.header(
-                reqwest::header::AUTHORIZATION,
-                format!("Bearer {jwt_token}"),
-            );
-        }
-
-        let response = builder.send().await.map_err(mgmt_api::Error::ReceiveBody)?;
-        let response = response.error_from_body().await?;
-
-        response
-            .json()
-            .await
-            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)
-    }
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    let cli = Cli::parse();
-
-    let storcon_client = Client::new(cli.api.clone(), cli.jwt.clone());
-
-    let mut trimmed = cli.api.to_string();
-    trimmed.pop();
-    let vps_client = mgmt_api::Client::new(trimmed, cli.jwt.as_deref());
-
-    match cli.command {
-        Command::NodeRegister {
-            node_id,
-            listen_pg_addr,
-            listen_pg_port,
-            listen_http_addr,
-            listen_http_port,
-        } => {
-            storcon_client
-                .dispatch::<_, ()>(
-                    Method::POST,
-                    "control/v1/node".to_string(),
-                    Some(NodeRegisterRequest {
-                        node_id,
-                        listen_pg_addr,
-                        listen_pg_port,
-                        listen_http_addr,
-                        listen_http_port,
-                    }),
-                )
-                .await?;
-        }
-        Command::TenantCreate { tenant_id } => {
-            vps_client
-                .tenant_create(&TenantCreateRequest {
-                    new_tenant_id: TenantShardId::unsharded(tenant_id),
-                    generation: None,
-                    shard_parameters: ShardParameters::default(),
-                    placement_policy: Some(PlacementPolicy::Attached(1)),
-                    config: TenantConfig::default(),
-                })
-                .await?;
-        }
-        Command::TenantDelete { tenant_id } => {
-            let status = vps_client
-                .tenant_delete(TenantShardId::unsharded(tenant_id))
-                .await?;
-            tracing::info!("Delete status: {}", status);
-        }
-        Command::Nodes {} => {
-            let resp = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-            let mut table = comfy_table::Table::new();
-            table.set_header(["Id", "Hostname", "Scheduling", "Availability"]);
-            for node in resp {
-                table.add_row([
-                    format!("{}", node.id),
-                    node.listen_http_addr,
-                    format!("{:?}", node.scheduling),
-                    format!("{:?}", node.availability),
-                ]);
-            }
-            println!("{table}");
-        }
-        Command::NodeConfigure {
-            node_id,
-            availability,
-            scheduling,
-        } => {
-            let req = NodeConfigureRequest {
-                node_id,
-                availability: availability.map(|a| a.0),
-                scheduling,
-            };
-            storcon_client
-                .dispatch::<_, ()>(
-                    Method::PUT,
-                    format!("control/v1/node/{node_id}/config"),
-                    Some(req),
-                )
-                .await?;
-        }
-        Command::Tenants {} => {
-            let resp = storcon_client
-                .dispatch::<(), Vec<TenantDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/tenant".to_string(),
-                    None,
-                )
-                .await?;
-            let mut table = comfy_table::Table::new();
-            table.set_header([
-                "TenantId",
-                "ShardCount",
-                "StripeSize",
-                "Placement",
-                "Scheduling",
-            ]);
-            for tenant in resp {
-                let shard_zero = tenant.shards.into_iter().next().unwrap();
-                table.add_row([
-                    format!("{}", tenant.tenant_id),
-                    format!("{}", shard_zero.tenant_shard_id.shard_count.literal()),
-                    format!("{:?}", tenant.stripe_size),
-                    format!("{:?}", tenant.policy),
-                    format!("{:?}", shard_zero.scheduling_policy),
-                ]);
-            }
-
-            println!("{table}");
-        }
-        Command::TenantPolicy {
-            tenant_id,
-            placement,
-            scheduling,
-        } => {
-            let req = TenantPolicyRequest {
-                scheduling: scheduling.map(|s| s.0),
-                placement: placement.map(|p| p.0),
-            };
-            storcon_client
-                .dispatch::<_, ()>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_id}/policy"),
-                    Some(req),
-                )
-                .await?;
-        }
-        Command::TenantShardSplit {
-            tenant_id,
-            shard_count,
-            stripe_size,
-        } => {
-            let req = TenantShardSplitRequest {
-                new_shard_count: shard_count,
-                new_stripe_size: stripe_size.map(ShardStripeSize),
-            };
-
-            let response = storcon_client
-                .dispatch::<TenantShardSplitRequest, TenantShardSplitResponse>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_id}/shard_split"),
-                    Some(req),
-                )
-                .await?;
-            println!(
-                "Split tenant {} into {} shards: {}",
-                tenant_id,
-                shard_count,
-                response
-                    .new_shards
-                    .iter()
-                    .map(|s| format!("{:?}", s))
-                    .collect::<Vec<_>>()
-                    .join(",")
-            );
-        }
-        Command::TenantShardMigrate {
-            tenant_shard_id,
-            node,
-        } => {
-            let req = TenantShardMigrateRequest {
-                tenant_shard_id,
-                node_id: node,
-            };
-
-            storcon_client
-                .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
-                    Method::PUT,
-                    format!("control/v1/tenant/{tenant_shard_id}/migrate"),
-                    Some(req),
-                )
-                .await?;
-        }
-        Command::TenantConfig { tenant_id, config } => {
-            let tenant_conf = serde_json::from_str(&config)?;
-
-            vps_client
-                .tenant_config(&TenantConfigRequest {
-                    tenant_id,
-                    config: tenant_conf,
-                })
-                .await?;
-        }
-        Command::TenantScatter { tenant_id } => {
-            // Find the shards
-            let locate_response = storcon_client
-                .dispatch::<(), TenantLocateResponse>(
-                    Method::GET,
-                    format!("control/v1/tenant/{tenant_id}/locate"),
-                    None,
-                )
-                .await?;
-            let shards = locate_response.shards;
-
-            let mut node_to_shards: HashMap<NodeId, Vec<TenantShardId>> = HashMap::new();
-            let shard_count = shards.len();
-            for s in shards {
-                let entry = node_to_shards.entry(s.node_id).or_default();
-                entry.push(s.shard_id);
-            }
-
-            // Load list of available nodes
-            let nodes_resp = storcon_client
-                .dispatch::<(), Vec<NodeDescribeResponse>>(
-                    Method::GET,
-                    "control/v1/node".to_string(),
-                    None,
-                )
-                .await?;
-
-            for node in nodes_resp {
-                if matches!(node.availability, NodeAvailabilityWrapper::Active) {
-                    node_to_shards.entry(node.id).or_default();
-                }
-            }
-
-            let max_shard_per_node = shard_count / node_to_shards.len();
-
-            loop {
-                let mut migrate_shard = None;
-                for shards in node_to_shards.values_mut() {
-                    if shards.len() > max_shard_per_node {
-                        // Pick the emptiest
-                        migrate_shard = Some(shards.pop().unwrap());
-                    }
-                }
-                let Some(migrate_shard) = migrate_shard else {
-                    break;
-                };
-
-                // Pick the emptiest node to migrate to
-                let mut destinations = node_to_shards
-                    .iter()
-                    .map(|(k, v)| (k, v.len()))
-                    .collect::<Vec<_>>();
-                destinations.sort_by_key(|i| i.1);
-                let (destination_node, destination_count) = *destinations.first().unwrap();
-                if destination_count + 1 > max_shard_per_node {
-                    // Even the emptiest destination doesn't have space: we're done
-                    break;
-                }
-                let destination_node = *destination_node;
-
-                node_to_shards
-                    .get_mut(&destination_node)
-                    .unwrap()
-                    .push(migrate_shard);
-
-                println!("Migrate {} -> {} ...", migrate_shard, destination_node);
-
-                storcon_client
-                    .dispatch::<TenantShardMigrateRequest, TenantShardMigrateResponse>(
-                        Method::PUT,
-                        format!("control/v1/tenant/{migrate_shard}/migrate"),
-                        Some(TenantShardMigrateRequest {
-                            tenant_shard_id: migrate_shard,
-                            node_id: destination_node,
-                        }),
-                    )
-                    .await?;
-                println!("Migrate {} -> {} OK", migrate_shard, destination_node);
-            }
-
-            // Spread the shards across the nodes
-        }
-        Command::TenantDescribe { tenant_id } => {
-            let describe_response = storcon_client
-                .dispatch::<(), TenantDescribeResponse>(
-                    Method::GET,
-                    format!("control/v1/tenant/{tenant_id}"),
-                    None,
-                )
-                .await?;
-            let shards = describe_response.shards;
-            let mut table = comfy_table::Table::new();
-            table.set_header(["Shard", "Attached", "Secondary", "Last error", "status"]);
-            for shard in shards {
-                let secondary = shard
-                    .node_secondary
-                    .iter()
-                    .map(|n| format!("{}", n))
-                    .collect::<Vec<_>>()
-                    .join(",");
-
-                let mut status_parts = Vec::new();
-                if shard.is_reconciling {
-                    status_parts.push("reconciling");
-                }
-
-                if shard.is_pending_compute_notification {
-                    status_parts.push("pending_compute");
-                }
-
-                if shard.is_splitting {
-                    status_parts.push("splitting");
-                }
-                let status = status_parts.join(",");
-
-                table.add_row([
-                    format!("{}", shard.tenant_shard_id),
-                    shard
-                        .node_attached
-                        .map(|n| format!("{}", n))
-                        .unwrap_or(String::new()),
-                    secondary,
-                    shard.last_error,
-                    status,
-                ]);
-            }
-            println!("{table}");
-        }
-        Command::TenantWarmup { tenant_id } => {
-            let describe_response = storcon_client
-                .dispatch::<(), TenantDescribeResponse>(
-                    Method::GET,
-                    format!("control/v1/tenant/{tenant_id}"),
-                    None,
-                )
-                .await;
-            match describe_response {
-                Ok(describe) => {
-                    if matches!(describe.policy, PlacementPolicy::Secondary) {
-                        // Fine: it's already known to controller in secondary mode: calling
-                        // again to put it into secondary mode won't cause problems.
-                    } else {
-                        anyhow::bail!("Tenant already present with policy {:?}", describe.policy);
-                    }
-                }
-                Err(mgmt_api::Error::ApiError(StatusCode::NOT_FOUND, _)) => {
-                    // Fine: this tenant isn't know to the storage controller yet.
-                }
-                Err(e) => {
-                    // Unexpected API error
-                    return Err(e.into());
-                }
-            }
-
-            vps_client
-                .location_config(
-                    TenantShardId::unsharded(tenant_id),
-                    pageserver_api::models::LocationConfig {
-                        mode: pageserver_api::models::LocationConfigMode::Secondary,
-                        generation: None,
-                        secondary_conf: Some(LocationConfigSecondary { warm: true }),
-                        shard_number: 0,
-                        shard_count: 0,
-                        shard_stripe_size: ShardParameters::DEFAULT_STRIPE_SIZE.0,
-                        tenant_conf: TenantConfig::default(),
-                    },
-                    None,
-                    true,
-                )
-                .await?;
-
-            let describe_response = storcon_client
-                .dispatch::<(), TenantDescribeResponse>(
-                    Method::GET,
-                    format!("control/v1/tenant/{tenant_id}"),
-                    None,
-                )
-                .await?;
-
-            let secondary_ps_id = describe_response
-                .shards
-                .first()
-                .unwrap()
-                .node_secondary
-                .first()
-                .unwrap();
-
-            println!("Tenant {tenant_id} warming up on pageserver {secondary_ps_id}");
-            loop {
-                let (status, progress) = vps_client
-                    .tenant_secondary_download(
-                        TenantShardId::unsharded(tenant_id),
-                        Some(Duration::from_secs(10)),
-                    )
-                    .await?;
-                println!(
-                    "Progress: {}/{} layers, {}/{} bytes",
-                    progress.layers_downloaded,
-                    progress.layers_total,
-                    progress.bytes_downloaded,
-                    progress.bytes_total
-                );
-                match status {
-                    StatusCode::OK => {
-                        println!("Download complete");
-                        break;
-                    }
-                    StatusCode::ACCEPTED => {
-                        // Loop
-                    }
-                    _ => {
-                        anyhow::bail!("Unexpected download status: {status}");
-                    }
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
--- a/diesel.toml
+++ b/diesel.toml
@@ -2,8 +2,8 @@
 # see https://diesel.rs/guides/configuring-diesel-cli

 [print_schema]
-file = "storage_controller/src/schema.rs"
+file = "control_plane/attachment_service/src/schema.rs"
 custom_type_derives = ["diesel::query_builder::QueryId"]

 [migrations_directory]
-dir = "storage_controller/migrations"
+dir = "control_plane/attachment_service/migrations"
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -7,11 +7,6 @@ Below you will find a brief overview of each subdir in the source tree in alphab
 Neon storage broker, providing messaging between safekeepers and pageservers.
 [storage_broker.md](./storage_broker.md)

-`storage_controller`:
-
-Neon storage controller, manages a cluster of pageservers and exposes an API that enables
-managing a many-sharded tenant as a single entity.
-
 `/control_plane`:

 Local control plane.
--- a/docs/storage_controller.md
+++ b/docs/storage_controller.md
@@ -1,150 +0,0 @@
-# Storage Controller
-
-## Concepts
-
-The storage controller sits between administrative API clients and pageservers, and handles the details of mapping tenants to pageserver tenant shards. For example, creating a tenant is one API call to the storage controller,
-which is mapped into many API calls to many pageservers (for multiple shards, and for secondary locations).
-
-It implements a pageserver-compatible API that may be used for CRUD operations on tenants and timelines, translating these requests into appropriate operations on the shards within a tenant, which may be on many different pageservers. Using this API, the storage controller may be used in the same way as the pageserver's administrative HTTP API, hiding
-the underlying details of how data is spread across multiple nodes.
-
-The storage controller also manages generations, high availability (via secondary locations) and live migrations for tenants under its management. This is done with a reconciliation loop pattern, where tenants have an “intent” state and a “reconcile” task that tries to make the outside world match the intent.
-
-## APIs
-
-The storage controller’s HTTP server implements four logically separate APIs:
-
- `/v1/...` path is the pageserver-compatible API. This has to be at the path root because that’s where clients expect to find it on a pageserver.
- `/control/v1/...` path is the storage controller’s API, which enables operations such as registering and management pageservers, or executing shard splits.
- `/debug/v1/...` path contains endpoints which are either exclusively used in tests, or are for use by engineers when supporting a deployed system.
- `/upcall/v1/...` path contains endpoints that are called by pageservers. This includes the `/re-attach` and `/validate` APIs used by pageservers
-  to ensure data safety with generation numbers.
-
-The API is authenticated with a JWT token, and tokens must have scope `pageserverapi` (i.e. the same scope as pageservers’ APIs).
-
-See the `http.rs` file in the source for where the HTTP APIs are implemented.
-
-## Database
-
-The storage controller uses a postgres database to persist a subset of its state. Note that the storage controller does _not_ keep all its state in the database: this is a design choice to enable most operations to be done efficiently in memory, rather than having to read from the database. See `persistence.rs` for a more comprehensive comment explaining what we do and do not persist: a useful metaphor is that we persist objects like tenants and nodes, but we do not
-persist the _relationships_ between them: the attachment state of a tenant's shards to nodes is kept in memory and
-rebuilt on startup.
-
-The file `persistence.rs` contains all the code for accessing the database, and has a large doc comment that goes into more detail about exactly what we persist and why.
-
-The `diesel` crate is used for defining models & migrations.
-
-Running a local cluster with `cargo neon` automatically starts a vanilla postgress process to host the storage controller’s database.
-
-### Diesel tip: migrations
-
-If you need to modify the database schema, here’s how to create a migration:
-
- Install the diesel CLI with `cargo install diesel_cli`
- Use `diesel migration generate <name>` to create a new migration
- Populate the SQL files in the `migrations/` subdirectory
- Use `DATABASE_URL=... diesel migration run` to apply the migration you just wrote: this will update the `[schema.rs](http://schema.rs)` file automatically.
-  - This requires a running database: the easiest way to do that is to just run `cargo neon init ; cargo neon start`, which will leave a database available at `postgresql://localhost:1235/attachment_service`
- Commit the migration files and the changes to schema.rs
- If you need to iterate, you can rewind migrations with `diesel migration revert -a` and then `diesel migration run` again.
- The migrations are build into the storage controller binary, and automatically run at startup after it is deployed, so once you’ve committed a migration no further steps are needed.
-
-## storcon_cli
-
-The `storcon_cli` tool enables interactive management of the storage controller. This is usually
-only necessary for debug, but may also be used to manage nodes (e.g. marking a node as offline).
-
-`storcon_cli --help` includes details on commands.
-
-# Deploying
-
-This section is aimed at engineers deploying the storage controller outside of Neon's cloud platform, as
-part of a self-hosted system.
-
-_General note: since the default `neon_local` environment includes a storage controller, this is a useful
-reference when figuring out deployment._
-
-## Database
-
-It is **essential** that the database used by the storage controller is durable (**do not store it on ephemeral
-local disk**). This database contains pageserver generation numbers, which are essential to data safety on the pageserver.
-
-The resource requirements for the database are very low: a single CPU core and 1GiB of memory should work well for most deployments. The physical size of the database is typically under a gigabyte.
-
-Set the URL to the database using the `--database-url` CLI option.
-
-There is no need to run migrations manually: the storage controller automatically applies migrations
-when it starts up.
-
-## Configure pageservers to use the storage controller
-
-1. The pageserver `control_plane_api` and `control_plane_api_token` should be set in the `pageserver.toml` file. The API setting should
-   point to the "upcall" prefix, for example `http://127.0.0.1:1234/upcall/v1/` is used in neon_local clusters.
-2. Create a `metadata.json` file in the same directory as `pageserver.toml`: this enables the pageserver to automatically register itself
-   with the storage controller when it starts up. See the example below for the format of this file.
-
-### Example `metadata.json`
-
-```
-{"host":"acmehost.localdomain","http_host":"acmehost.localdomain","http_port":9898,"port":64000}
-```
-
- `port` and `host` refer to the _postgres_ port and host, and these must be accessible from wherever
-  postgres runs.
- `http_port` and `http_host` refer to the pageserver's HTTP api, this must be accessible from where
-  the storage controller runs.
-
-## Handle compute notifications.
-
-The storage controller independently moves tenant attachments between pageservers in response to
-changes such as a pageserver node becoming unavailable, or the tenant's shard count changing. To enable
-postgres clients to handle such changes, the storage controller calls an API hook when a tenant's pageserver
-location changes.
-
-The hook is configured using the storage controller's `--compute-hook-url` CLI option. If the hook requires
-JWT auth, the token may be provided with `--control-plane-jwt-token`. The hook will be invoked with a `PUT` request.
-
-In the Neon cloud service, this hook is implemented by Neon's internal cloud control plane. In `neon_local` systems
-the storage controller integrates directly with neon_local to reconfigure local postgres processes instead of calling
-the compute hook.
-
-When implementing an on-premise Neon deployment, you must implement a service that handles the compute hook. This is not complicated:
-the request body has format of the `ComputeHookNotifyRequest` structure, provided below for convenience.
-
-```
-struct ComputeHookNotifyRequestShard {
-    node_id: NodeId,
-    shard_number: ShardNumber,
-}
-
-struct ComputeHookNotifyRequest {
-    tenant_id: TenantId,
-    stripe_size: Option<ShardStripeSize>,
-    shards: Vec<ComputeHookNotifyRequestShard>,
-}
-```
-
-When a notification is received:
-
-1. Modify postgres configuration for this tenant:
-
-   - set `neon.pageserver_connstr` to a comma-separated list of postgres connection strings to pageservers according to the `shards` list. The
-     shards identified by `NodeId` must be converted to the address+port of the node.
-   - if stripe_size is not None, set `neon.stripe_size` to this value
-
-2. Send SIGHUP to postgres to reload configuration
-3. Respond with 200 to the notification request. Do not return success if postgres was not updated: if an error is returned, the controller
-   will retry the notification until it succeeds..
-
-### Example notification body
-
-```
-{
-  "tenant_id": "1f359dd625e519a1a4e8d7509690f6fc",
-  "stripe_size": 32768,
-  "shards": [
-      {"node_id": 344, "shard_number": 0},
-      {"node_id": 722, "shard_number": 1},
-  ],
-}
-```
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -33,23 +33,6 @@ pub struct ComputeSpec {
    #[serde(default)]
    pub features: Vec<ComputeFeature>,

-    /// If compute_ctl was passed `--resize-swap-on-bind`, a value of `Some(_)` instructs
-    /// compute_ctl to `/neonvm/bin/resize-swap` with the given size, when the spec is first
-    /// received.
-    ///
-    /// Both this field and `--resize-swap-on-bind` are required, so that the control plane's
-    /// spec generation doesn't need to be aware of the actual compute it's running on, while
-    /// guaranteeing gradual rollout of swap. Otherwise, without `--resize-swap-on-bind`, we could
-    /// end up trying to resize swap in VMs without it -- or end up *not* resizing swap, thus
-    /// giving every VM much more swap than it should have (32GiB).
-    ///
-    /// Eventually we may remove `--resize-swap-on-bind` and exclusively use `swap_size_bytes` for
-    /// enabling the swap resizing behavior once rollout is complete.
-    ///
-    /// See neondatabase/cloud#12047 for more.
-    #[serde(default)]
-    pub swap_size_bytes: Option<u64>,
-
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
--- a/libs/metrics/Cargo.toml
+++ b/libs/metrics/Cargo.toml
@@ -10,13 +10,11 @@ libc.workspace = true
 once_cell.workspace = true
 chrono.workspace = true
 twox-hash.workspace = true
-measured.workspace = true

 workspace_hack.workspace = true

 [target.'cfg(target_os = "linux")'.dependencies]
 procfs.workspace = true
-measured-process.workspace = true

 [dev-dependencies]
 rand = "0.8"
--- a/libs/metrics/src/hll.rs
+++ b/libs/metrics/src/hll.rs
@@ -7,19 +7,14 @@
 //! use significantly less memory than this, but can only approximate the cardinality.

 use std::{
-    hash::{BuildHasher, BuildHasherDefault, Hash},
-    sync::atomic::AtomicU8,
+    collections::HashMap,
+    hash::{BuildHasher, BuildHasherDefault, Hash, Hasher},
+    sync::{atomic::AtomicU8, Arc, RwLock},
 };

-use measured::{
-    label::{LabelGroupVisitor, LabelName, LabelValue, LabelVisitor},
-    metric::{
-        group::{Encoding, MetricValue},
-        name::MetricNameEncoder,
-        Metric, MetricType, MetricVec,
-    },
-    text::TextEncoder,
-    LabelGroup,
+use prometheus::{
+    core::{self, Describer},
+    proto, Opts,
 };
 use twox_hash::xxh3;

@@ -98,25 +93,203 @@ macro_rules! register_hll {
 /// ```
 ///
 /// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
-pub type HyperLogLogVec<L, const N: usize> = MetricVec<HyperLogLogState<N>, L>;
-pub type HyperLogLog<const N: usize> = Metric<HyperLogLogState<N>>;
-
-pub struct HyperLogLogState<const N: usize> {
-    shards: [AtomicU8; N],
+#[derive(Clone)]
+pub struct HyperLogLogVec<const N: usize> {
+    core: Arc<HyperLogLogVecCore<N>>,
 }
-impl<const N: usize> Default for HyperLogLogState<N> {
-    fn default() -> Self {
-        #[allow(clippy::declare_interior_mutable_const)]
-        const ZERO: AtomicU8 = AtomicU8::new(0);
-        Self { shards: [ZERO; N] }
+
+struct HyperLogLogVecCore<const N: usize> {
+    pub children: RwLock<HashMap<u64, HyperLogLog<N>, BuildHasherDefault<xxh3::Hash64>>>,
+    pub desc: core::Desc,
+    pub opts: Opts,
+}
+
+impl<const N: usize> core::Collector for HyperLogLogVec<N> {
+    fn desc(&self) -> Vec<&core::Desc> {
+        vec![&self.core.desc]
+    }
+
+    fn collect(&self) -> Vec<proto::MetricFamily> {
+        let mut m = proto::MetricFamily::default();
+        m.set_name(self.core.desc.fq_name.clone());
+        m.set_help(self.core.desc.help.clone());
+        m.set_field_type(proto::MetricType::GAUGE);
+
+        let mut metrics = Vec::new();
+        for child in self.core.children.read().unwrap().values() {
+            child.core.collect_into(&mut metrics);
+        }
+        m.set_metric(metrics);
+
+        vec![m]
    }
 }

-impl<const N: usize> MetricType for HyperLogLogState<N> {
-    type Metadata = ();
+impl<const N: usize> HyperLogLogVec<N> {
+    /// Create a new [`HyperLogLogVec`] based on the provided
+    /// [`Opts`] and partitioned by the given label names. At least one label name must be
+    /// provided.
+    pub fn new(opts: Opts, label_names: &[&str]) -> prometheus::Result<Self> {
+        assert!(N.is_power_of_two());
+        let variable_names = label_names.iter().map(|s| (*s).to_owned()).collect();
+        let opts = opts.variable_labels(variable_names);
+
+        let desc = opts.describe()?;
+        let v = HyperLogLogVecCore {
+            children: RwLock::new(HashMap::default()),
+            desc,
+            opts,
+        };
+
+        Ok(Self { core: Arc::new(v) })
+    }
+
+    /// `get_metric_with_label_values` returns the [`HyperLogLog<N>`] for the given slice
+    /// of label values (same order as the VariableLabels in Desc). If that combination of
+    /// label values is accessed for the first time, a new [`HyperLogLog<N>`] is created.
+    ///
+    /// An error is returned if the number of label values is not the same as the
+    /// number of VariableLabels in Desc.
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        self.core.get_metric_with_label_values(vals)
+    }
+
+    /// `with_label_values` works as `get_metric_with_label_values`, but panics if an error
+    /// occurs.
+    pub fn with_label_values(&self, vals: &[&str]) -> HyperLogLog<N> {
+        self.get_metric_with_label_values(vals).unwrap()
+    }
 }

-impl<const N: usize> HyperLogLogState<N> {
+impl<const N: usize> HyperLogLogVecCore<N> {
+    pub fn get_metric_with_label_values(
+        &self,
+        vals: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        let h = self.hash_label_values(vals)?;
+
+        if let Some(metric) = self.children.read().unwrap().get(&h).cloned() {
+            return Ok(metric);
+        }
+
+        self.get_or_create_metric(h, vals)
+    }
+
+    /// Hashes `vals` into the key used to look up a child in `children`.
+    ///
+    /// Fails with `InconsistentCardinality` if `vals` and the variable labels differ in length.
+    pub(crate) fn hash_label_values(&self, vals: &[&str]) -> prometheus::Result<u64> {
+        if vals.len() != self.desc.variable_labels.len() {
+            return Err(prometheus::Error::InconsistentCardinality {
+                expect: self.desc.variable_labels.len(),
+                got: vals.len(),
+            });
+        }
+        // `Hash for str` appends a terminator, so ["ab", "c"] and ["a", "bc"] hash differently.
+        let mut h = xxh3::Hash64::default();
+        vals.iter().for_each(|val| val.hash(&mut h));
+        Ok(h.finish())
+    }
+
+    fn get_or_create_metric(
+        &self,
+        hash: u64,
+        label_values: &[&str],
+    ) -> prometheus::Result<HyperLogLog<N>> {
+        let mut children = self.children.write().unwrap();
+        // Re-check under the write lock: another caller may have inserted it already.
+        if let Some(metric) = children.get(&hash).cloned() {
+            return Ok(metric);
+        }
+
+        let metric = HyperLogLog::with_opts_and_label_values(&self.opts, label_values)?;
+        children.insert(hash, metric.clone());
+        Ok(metric)
+    }
+}
+
+/// HLL is a probabilistic cardinality measure.
+///
+/// How to use this time-series for a metric name `my_metrics_total_hll`:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (my_metrics_total_hll{}) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// If you want an estimate over time, you can use the following query:
+///
+/// ```promql
+/// # harmonic mean
+/// 1 / (
+///     sum (
+///         2 ^ -(
+///             # HLL merge operation
+///             max (
+///                 max_over_time(my_metrics_total_hll{}[$__rate_interval])
+///             ) by (hll_shard, other_labels...)
+///         )
+///     ) without (hll_shard)
+/// )
+/// * alpha
+/// * shards_count
+/// * shards_count
+/// ```
+///
+/// In the case of low cardinality, you might want to use the linear counting approximation:
+///
+/// ```promql
+/// # LinearCounting(m, V) = m log (m / V)
+/// shards_count * ln(shards_count /
+///     # calculate V = how many shards contain a 0
+///     count(max (my_metrics_total_hll{}) by (hll_shard, other_labels...) == 0) without (hll_shard)
+/// )
+/// ```
+///
+/// See <https://en.wikipedia.org/wiki/HyperLogLog#Practical_considerations> for estimates on alpha
+#[derive(Clone)]
+pub struct HyperLogLog<const N: usize> {
+    core: Arc<HyperLogLogCore<N>>,
+}
+
+impl<const N: usize> HyperLogLog<N> {
+    /// Create a [`HyperLogLog`] with the `name` and `help` arguments.
+    pub fn new<S1: Into<String>, S2: Into<String>>(name: S1, help: S2) -> prometheus::Result<Self> {
+        assert!(N.is_power_of_two());
+        let opts = Opts::new(name, help);
+        Self::with_opts(opts)
+    }
+
+    /// Create a [`HyperLogLog`] with the `opts` options.
+    pub fn with_opts(opts: Opts) -> prometheus::Result<Self> {
+        Self::with_opts_and_label_values(&opts, &[])
+    }
+
+    fn with_opts_and_label_values(opts: &Opts, label_values: &[&str]) -> prometheus::Result<Self> {
+        let desc = opts.describe()?;
+        let labels = make_label_pairs(&desc, label_values)?;
+
+        let v = HyperLogLogCore {
+            shards: [0; N].map(AtomicU8::new),
+            desc,
+            labels,
+        };
+        Ok(Self { core: Arc::new(v) })
+    }
+
    pub fn measure(&self, item: &impl Hash) {
        // changing the hasher will break compatibility with previous measurements.
        self.record(BuildHasherDefault::<xxh3::Hash64>::default().hash_one(item));
@@ -126,11 +299,42 @@ impl<const N: usize> HyperLogLogState<N> {
        let p = N.ilog2() as u8;
        let j = hash & (N as u64 - 1);
        let rho = (hash >> p).leading_zeros() as u8 + 1 - p;
-        self.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
+        self.core.shards[j as usize].fetch_max(rho, std::sync::atomic::Ordering::Relaxed);
+    }
+}
+
+struct HyperLogLogCore<const N: usize> {
+    shards: [AtomicU8; N],
+    desc: core::Desc,
+    labels: Vec<proto::LabelPair>,
+}
+
+impl<const N: usize> core::Collector for HyperLogLog<N> {
+    fn desc(&self) -> Vec<&core::Desc> {
+        vec![&self.core.desc]
    }

-    fn take_sample(&self) -> [u8; N] {
-        self.shards.each_ref().map(|x| {
+    fn collect(&self) -> Vec<proto::MetricFamily> {
+        let mut m = proto::MetricFamily::default();
+        m.set_name(self.core.desc.fq_name.clone());
+        m.set_help(self.core.desc.help.clone());
+        m.set_field_type(proto::MetricType::GAUGE);
+
+        let mut metrics = Vec::new();
+        self.core.collect_into(&mut metrics);
+        m.set_metric(metrics);
+
+        vec![m]
+    }
+}
+
+impl<const N: usize> HyperLogLogCore<N> {
+    fn collect_into(&self, metrics: &mut Vec<proto::Metric>) {
+        self.shards.iter().enumerate().for_each(|(i, x)| {
+            let mut shard_label = proto::LabelPair::default();
+            shard_label.set_name("hll_shard".to_owned());
+            shard_label.set_value(format!("{i}"));
+
            // We reset the counter to 0 so we can perform a cardinality measure over any time slice in prometheus.

            // This seems like it would be a race condition,
@@ -140,90 +344,85 @@ impl<const N: usize> HyperLogLogState<N> {

            // TODO: maybe we shouldn't reset this on every collect, instead, only after a time window.
            // this would mean that a dev port-forwarding the metrics url won't break the sampling.
-            x.swap(0, std::sync::atomic::Ordering::Relaxed)
+            let v = x.swap(0, std::sync::atomic::Ordering::Relaxed);
+
+            let mut m = proto::Metric::default();
+            let mut c = proto::Gauge::default();
+            c.set_value(v as f64);
+            m.set_gauge(c);
+
+            let mut labels = Vec::with_capacity(self.labels.len() + 1);
+            labels.extend_from_slice(&self.labels);
+            labels.push(shard_label);
+
+            m.set_label(labels);
+            metrics.push(m);
        })
    }
 }
-impl<W: std::io::Write, const N: usize> measured::metric::MetricEncoding<TextEncoder<W>>
-    for HyperLogLogState<N>
-{
-    fn write_type(
-        name: impl MetricNameEncoder,
-        enc: &mut TextEncoder<W>,
-    ) -> Result<(), std::io::Error> {
-        enc.write_type(&name, measured::text::MetricType::Gauge)
+
+fn make_label_pairs(
+    desc: &core::Desc,
+    label_values: &[&str],
+) -> prometheus::Result<Vec<proto::LabelPair>> {
+    if desc.variable_labels.len() != label_values.len() {
+        return Err(prometheus::Error::InconsistentCardinality {
+            expect: desc.variable_labels.len(),
+            got: label_values.len(),
+        });
    }
-    fn collect_into(
-        &self,
-        _: &(),
-        labels: impl LabelGroup,
-        name: impl MetricNameEncoder,
-        enc: &mut TextEncoder<W>,
-    ) -> Result<(), std::io::Error> {
-        struct I64(i64);
-        impl LabelValue for I64 {
-            fn visit<V: LabelVisitor>(&self, v: V) -> V::Output {
-                v.write_int(self.0)
-            }
-        }

-        struct HllShardLabel {
-            hll_shard: i64,
-        }
-
-        impl LabelGroup for HllShardLabel {
-            fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
-                const LE: &LabelName = LabelName::from_str("hll_shard");
-                v.write_value(LE, &I64(self.hll_shard));
-            }
-        }
-
-        self.take_sample()
-            .into_iter()
-            .enumerate()
-            .try_for_each(|(hll_shard, val)| {
-                enc.write_metric_value(
-                    name.by_ref(),
-                    labels.by_ref().compose_with(HllShardLabel {
-                        hll_shard: hll_shard as i64,
-                    }),
-                    MetricValue::Int(val as i64),
-                )
-            })
+    let total_len = desc.variable_labels.len() + desc.const_label_pairs.len();
+    if total_len == 0 {
+        return Ok(vec![]);
    }
+
+    if desc.variable_labels.is_empty() {
+        return Ok(desc.const_label_pairs.clone());
+    }
+
+    let mut label_pairs = Vec::with_capacity(total_len);
+    for (i, n) in desc.variable_labels.iter().enumerate() {
+        let mut label_pair = proto::LabelPair::default();
+        label_pair.set_name(n.clone());
+        label_pair.set_value(label_values[i].to_owned());
+        label_pairs.push(label_pair);
+    }
+
+    for label_pair in &desc.const_label_pairs {
+        label_pairs.push(label_pair.clone());
+    }
+    label_pairs.sort();
+    Ok(label_pairs)
 }

 #[cfg(test)]
 mod tests {
    use std::collections::HashSet;

-    use measured::{label::StaticLabelSet, FixedCardinalityLabel};
+    use prometheus::{proto, Opts};
    use rand::{rngs::StdRng, Rng, SeedableRng};
    use rand_distr::{Distribution, Zipf};

    use crate::HyperLogLogVec;

-    #[derive(FixedCardinalityLabel, Clone, Copy)]
-    #[label(singleton = "x")]
-    enum Label {
-        A,
-        B,
+    fn collect(hll: &HyperLogLogVec<32>) -> Vec<proto::Metric> {
+        let mut metrics = vec![];
+        hll.core
+            .children
+            .read()
+            .unwrap()
+            .values()
+            .for_each(|c| c.core.collect_into(&mut metrics));
+        metrics
    }
-
-    fn collect(hll: &HyperLogLogVec<StaticLabelSet<Label>, 32>) -> ([u8; 32], [u8; 32]) {
-        // cannot go through the `hll.collect_family_into` interface yet...
-        // need to see if I can fix the conflicting impls problem in measured.
-        (
-            hll.get_metric(hll.with_labels(Label::A)).take_sample(),
-            hll.get_metric(hll.with_labels(Label::B)).take_sample(),
-        )
-    }
-
-    fn get_cardinality(samples: &[[u8; 32]]) -> f64 {
+    fn get_cardinality(metrics: &[proto::Metric], filter: impl Fn(&proto::Metric) -> bool) -> f64 {
        let mut buckets = [0.0; 32];
-        for &sample in samples {
-            for (i, m) in sample.into_iter().enumerate() {
-                buckets[i] = f64::max(buckets[i], m as f64);
+        for metric in metrics.chunks_exact(32) {
+            if filter(&metric[0]) {
+                for (i, m) in metric.iter().enumerate() {
+                    buckets[i] = f64::max(buckets[i], m.get_gauge().get_value());
+                }
            }
        }

@@ -238,7 +437,7 @@ mod tests {
    }

    fn test_cardinality(n: usize, dist: impl Distribution<f64>) -> ([usize; 3], [f64; 3]) {
-        let hll = HyperLogLogVec::<StaticLabelSet<Label>, 32>::new();
+        let hll = HyperLogLogVec::<32>::new(Opts::new("foo", "bar"), &["x"]).unwrap();

        let mut iter = StdRng::seed_from_u64(0x2024_0112).sample_iter(dist);
        let mut set_a = HashSet::new();
@@ -246,20 +445,18 @@ mod tests {

        for x in iter.by_ref().take(n) {
            set_a.insert(x.to_bits());
-            hll.get_metric(hll.with_labels(Label::A))
-                .measure(&x.to_bits());
+            hll.with_label_values(&["a"]).measure(&x.to_bits());
        }
        for x in iter.by_ref().take(n) {
            set_b.insert(x.to_bits());
-            hll.get_metric(hll.with_labels(Label::B))
-                .measure(&x.to_bits());
+            hll.with_label_values(&["b"]).measure(&x.to_bits());
        }
        let merge = &set_a | &set_b;

-        let (a, b) = collect(&hll);
-        let len = get_cardinality(&[a, b]);
-        let len_a = get_cardinality(&[a]);
-        let len_b = get_cardinality(&[b]);
+        let metrics = collect(&hll);
+        let len = get_cardinality(&metrics, |_| true);
+        let len_a = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "a");
+        let len_b = get_cardinality(&metrics, |l| l.get_label()[0].get_value() == "b");

        ([merge.len(), set_a.len(), set_b.len()], [len, len_a, len_b])
    }
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -4,17 +4,6 @@
 //! a default registry.
 #![deny(clippy::undocumented_unsafe_blocks)]

-use measured::{
-    label::{LabelGroupSet, LabelGroupVisitor, LabelName, NoLabels},
-    metric::{
-        counter::CounterState,
-        gauge::GaugeState,
-        group::{Encoding, MetricValue},
-        name::{MetricName, MetricNameEncoder},
-        MetricEncoding, MetricFamilyEncoding,
-    },
-    FixedCardinalityLabel, LabelGroup, MetricGroup,
-};
 use once_cell::sync::Lazy;
 use prometheus::core::{
    Atomic, AtomicU64, Collector, GenericCounter, GenericCounterVec, GenericGauge, GenericGaugeVec,
@@ -22,7 +11,6 @@ use prometheus::core::{
 pub use prometheus::opts;
 pub use prometheus::register;
 pub use prometheus::Error;
-use prometheus::Registry;
 pub use prometheus::{core, default_registry, proto};
 pub use prometheus::{exponential_buckets, linear_buckets};
 pub use prometheus::{register_counter_vec, Counter, CounterVec};
@@ -35,12 +23,13 @@ pub use prometheus::{register_int_counter_vec, IntCounterVec};
 pub use prometheus::{register_int_gauge, IntGauge};
 pub use prometheus::{register_int_gauge_vec, IntGaugeVec};
 pub use prometheus::{Encoder, TextEncoder};
+use prometheus::{Registry, Result};

 pub mod launch_timestamp;
 mod wrappers;
 pub use wrappers::{CountedReader, CountedWriter};
 mod hll;
-pub use hll::{HyperLogLog, HyperLogLogState, HyperLogLogVec};
+pub use hll::{HyperLogLog, HyperLogLogVec};
 #[cfg(target_os = "linux")]
 pub mod more_process_metrics;

@@ -70,7 +59,7 @@ static INTERNAL_REGISTRY: Lazy<Registry> = Lazy::new(Registry::new);
 /// Register a collector in the internal registry. MUST be called before the first call to `gather()`.
 /// Otherwise, we can have a deadlock in the `gather()` call, trying to register a new collector
 /// while holding the lock.
-pub fn register_internal(c: Box<dyn Collector>) -> prometheus::Result<()> {
+pub fn register_internal(c: Box<dyn Collector>) -> Result<()> {
    INTERNAL_REGISTRY.register(c)
 }

@@ -107,127 +96,6 @@ pub const DISK_WRITE_SECONDS_BUCKETS: &[f64] = &[
    0.000_050, 0.000_100, 0.000_500, 0.001, 0.003, 0.005, 0.01, 0.05, 0.1, 0.3, 0.5,
 ];

-pub struct BuildInfo {
-    pub revision: &'static str,
-    pub build_tag: &'static str,
-}
-
-// todo: allow label group without the set
-impl LabelGroup for BuildInfo {
-    fn visit_values(&self, v: &mut impl LabelGroupVisitor) {
-        const REVISION: &LabelName = LabelName::from_str("revision");
-        v.write_value(REVISION, &self.revision);
-        const BUILD_TAG: &LabelName = LabelName::from_str("build_tag");
-        v.write_value(BUILD_TAG, &self.build_tag);
-    }
-}
-
-impl<T: Encoding> MetricFamilyEncoding<T> for BuildInfo
-where
-    GaugeState: MetricEncoding<T>,
-{
-    fn collect_family_into(
-        &self,
-        name: impl measured::metric::name::MetricNameEncoder,
-        enc: &mut T,
-    ) -> Result<(), T::Err> {
-        enc.write_help(&name, "Build/version information")?;
-        GaugeState::write_type(&name, enc)?;
-        GaugeState {
-            count: std::sync::atomic::AtomicI64::new(1),
-        }
-        .collect_into(&(), self, name, enc)
-    }
-}
-
-#[derive(MetricGroup)]
-#[metric(new(build_info: BuildInfo))]
-pub struct NeonMetrics {
-    #[cfg(target_os = "linux")]
-    #[metric(namespace = "process")]
-    #[metric(init = measured_process::ProcessCollector::for_self())]
-    process: measured_process::ProcessCollector,
-
-    #[metric(namespace = "libmetrics")]
-    #[metric(init = LibMetrics::new(build_info))]
-    libmetrics: LibMetrics,
-}
-
-#[derive(MetricGroup)]
-#[metric(new(build_info: BuildInfo))]
-pub struct LibMetrics {
-    #[metric(init = build_info)]
-    build_info: BuildInfo,
-
-    #[metric(flatten)]
-    rusage: Rusage,
-
-    serve_count: CollectionCounter,
-}
-
-fn write_gauge<Enc: Encoding>(
-    x: i64,
-    labels: impl LabelGroup,
-    name: impl MetricNameEncoder,
-    enc: &mut Enc,
-) -> Result<(), Enc::Err> {
-    enc.write_metric_value(name, labels, MetricValue::Int(x))
-}
-
-#[derive(Default)]
-struct Rusage;
-
-#[derive(FixedCardinalityLabel, Clone, Copy)]
-#[label(singleton = "io_operation")]
-enum IoOp {
-    Read,
-    Write,
-}
-
-impl<T: Encoding> MetricGroup<T> for Rusage
-where
-    GaugeState: MetricEncoding<T>,
-{
-    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
-        const DISK_IO: &MetricName = MetricName::from_str("disk_io_bytes_total");
-        const MAXRSS: &MetricName = MetricName::from_str("maxrss_kb");
-
-        let ru = get_rusage_stats();
-
-        enc.write_help(
-            DISK_IO,
-            "Bytes written and read from disk, grouped by the operation (read|write)",
-        )?;
-        GaugeState::write_type(DISK_IO, enc)?;
-        write_gauge(ru.ru_inblock * BYTES_IN_BLOCK, IoOp::Read, DISK_IO, enc)?;
-        write_gauge(ru.ru_oublock * BYTES_IN_BLOCK, IoOp::Write, DISK_IO, enc)?;
-
-        enc.write_help(MAXRSS, "Memory usage (Maximum Resident Set Size)")?;
-        GaugeState::write_type(MAXRSS, enc)?;
-        write_gauge(ru.ru_maxrss, IoOp::Read, MAXRSS, enc)?;
-
-        Ok(())
-    }
-}
-
-#[derive(Default)]
-struct CollectionCounter(CounterState);
-
-impl<T: Encoding> MetricFamilyEncoding<T> for CollectionCounter
-where
-    CounterState: MetricEncoding<T>,
-{
-    fn collect_family_into(
-        &self,
-        name: impl measured::metric::name::MetricNameEncoder,
-        enc: &mut T,
-    ) -> Result<(), T::Err> {
-        self.0.inc();
-        enc.write_help(&name, "Number of metric requests made")?;
-        self.0.collect_into(&(), NoLabels, name, enc)
-    }
-}
-
 pub fn set_build_info_metric(revision: &str, build_tag: &str) {
    let metric = register_int_gauge_vec!(
        "libmetrics_build_info",
@@ -237,7 +105,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
    .expect("Failed to register build info metric");
    metric.with_label_values(&[revision, build_tag]).set(1);
 }
-const BYTES_IN_BLOCK: i64 = 512;

 // Records I/O stats in a "cross-platform" way.
 // Compiles both on macOS and Linux, but current macOS implementation always returns 0 as values for I/O stats.
@@ -250,22 +117,14 @@ const BYTES_IN_BLOCK: i64 = 512;
 fn update_rusage_metrics() {
    let rusage_stats = get_rusage_stats();

+    const BYTES_IN_BLOCK: i64 = 512;
    DISK_IO_BYTES
        .with_label_values(&["read"])
        .set(rusage_stats.ru_inblock * BYTES_IN_BLOCK);
    DISK_IO_BYTES
        .with_label_values(&["write"])
        .set(rusage_stats.ru_oublock * BYTES_IN_BLOCK);
-
-    // On macOS, the unit of maxrss is bytes; on Linux, it's kilobytes. https://stackoverflow.com/a/59915669
-    #[cfg(target_os = "macos")]
-    {
-        MAXRSS_KB.set(rusage_stats.ru_maxrss / 1024);
-    }
-    #[cfg(not(target_os = "macos"))]
-    {
-        MAXRSS_KB.set(rusage_stats.ru_maxrss);
-    }
+    MAXRSS_KB.set(rusage_stats.ru_maxrss);
 }

 fn get_rusage_stats() -> libc::rusage {
@@ -292,7 +151,6 @@ macro_rules! register_int_counter_pair_vec {
        }
    }};
 }
-
 /// Create an [`IntCounterPair`] and registers to default registry.
 #[macro_export(local_inner_macros)]
 macro_rules! register_int_counter_pair {
@@ -330,10 +188,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
    ///
    /// An error is returned if the number of label values is not the same as the
    /// number of VariableLabels in Desc.
-    pub fn get_metric_with_label_values(
-        &self,
-        vals: &[&str],
-    ) -> prometheus::Result<GenericCounterPair<P>> {
+    pub fn get_metric_with_label_values(&self, vals: &[&str]) -> Result<GenericCounterPair<P>> {
        Ok(GenericCounterPair {
            inc: self.inc.get_metric_with_label_values(vals)?,
            dec: self.dec.get_metric_with_label_values(vals)?,
@@ -346,7 +201,7 @@ impl<P: Atomic> GenericCounterPairVec<P> {
        self.get_metric_with_label_values(vals).unwrap()
    }

-    pub fn remove_label_values(&self, res: &mut [prometheus::Result<()>; 2], vals: &[&str]) {
+    pub fn remove_label_values(&self, res: &mut [Result<()>; 2], vals: &[&str]) {
        res[0] = self.inc.remove_label_values(vals);
        res[1] = self.dec.remove_label_values(vals);
    }
@@ -430,171 +285,3 @@ pub type IntCounterPair = GenericCounterPair<AtomicU64>;

 /// A guard for [`IntCounterPair`] that will decrement the gauge on drop
 pub type IntCounterPairGuard = GenericCounterPairGuard<AtomicU64>;
-
-pub trait CounterPairAssoc {
-    const INC_NAME: &'static MetricName;
-    const DEC_NAME: &'static MetricName;
-
-    const INC_HELP: &'static str;
-    const DEC_HELP: &'static str;
-
-    type LabelGroupSet: LabelGroupSet;
-}
-
-pub struct CounterPairVec<A: CounterPairAssoc> {
-    vec: measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
-}
-
-impl<A: CounterPairAssoc> Default for CounterPairVec<A>
-where
-    A::LabelGroupSet: Default,
-{
-    fn default() -> Self {
-        Self {
-            vec: Default::default(),
-        }
-    }
-}
-
-impl<A: CounterPairAssoc> CounterPairVec<A> {
-    pub fn guard(
-        &self,
-        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
-    ) -> MeasuredCounterPairGuard<'_, A> {
-        let id = self.vec.with_labels(labels);
-        self.vec.get_metric(id).inc.inc();
-        MeasuredCounterPairGuard { vec: &self.vec, id }
-    }
-    pub fn inc(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
-        let id = self.vec.with_labels(labels);
-        self.vec.get_metric(id).inc.inc();
-    }
-    pub fn dec(&self, labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>) {
-        let id = self.vec.with_labels(labels);
-        self.vec.get_metric(id).dec.inc();
-    }
-    pub fn remove_metric(
-        &self,
-        labels: <A::LabelGroupSet as LabelGroupSet>::Group<'_>,
-    ) -> Option<MeasuredCounterPairState> {
-        let id = self.vec.with_labels(labels);
-        self.vec.remove_metric(id)
-    }
-}
-
-impl<T, A> ::measured::metric::group::MetricGroup<T> for CounterPairVec<A>
-where
-    T: ::measured::metric::group::Encoding,
-    A: CounterPairAssoc,
-    ::measured::metric::counter::CounterState: ::measured::metric::MetricEncoding<T>,
-{
-    fn collect_group_into(&self, enc: &mut T) -> Result<(), T::Err> {
-        // write decrement first to avoid a race condition where inc - dec < 0
-        T::write_help(enc, A::DEC_NAME, A::DEC_HELP)?;
-        self.vec
-            .collect_family_into(A::DEC_NAME, &mut Dec(&mut *enc))?;
-
-        T::write_help(enc, A::INC_NAME, A::INC_HELP)?;
-        self.vec
-            .collect_family_into(A::INC_NAME, &mut Inc(&mut *enc))?;
-
-        Ok(())
-    }
-}
-
-#[derive(MetricGroup, Default)]
-pub struct MeasuredCounterPairState {
-    pub inc: CounterState,
-    pub dec: CounterState,
-}
-
-impl measured::metric::MetricType for MeasuredCounterPairState {
-    type Metadata = ();
-}
-
-pub struct MeasuredCounterPairGuard<'a, A: CounterPairAssoc> {
-    vec: &'a measured::metric::MetricVec<MeasuredCounterPairState, A::LabelGroupSet>,
-    id: measured::metric::LabelId<A::LabelGroupSet>,
-}
-
-impl<A: CounterPairAssoc> Drop for MeasuredCounterPairGuard<'_, A> {
-    fn drop(&mut self) {
-        self.vec.get_metric(self.id).dec.inc();
-    }
-}
-
-/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the inc counter to the inner encoder.
-struct Inc<T>(T);
-/// [`MetricEncoding`] for [`MeasuredCounterPairState`] that only writes the dec counter to the inner encoder.
-struct Dec<T>(T);
-
-impl<T: Encoding> Encoding for Inc<T> {
-    type Err = T::Err;
-
-    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
-        self.0.write_help(name, help)
-    }
-
-    fn write_metric_value(
-        &mut self,
-        name: impl MetricNameEncoder,
-        labels: impl LabelGroup,
-        value: MetricValue,
-    ) -> Result<(), Self::Err> {
-        self.0.write_metric_value(name, labels, value)
-    }
-}
-
-impl<T: Encoding> MetricEncoding<Inc<T>> for MeasuredCounterPairState
-where
-    CounterState: MetricEncoding<T>,
-{
-    fn write_type(name: impl MetricNameEncoder, enc: &mut Inc<T>) -> Result<(), T::Err> {
-        CounterState::write_type(name, &mut enc.0)
-    }
-    fn collect_into(
-        &self,
-        metadata: &(),
-        labels: impl LabelGroup,
-        name: impl MetricNameEncoder,
-        enc: &mut Inc<T>,
-    ) -> Result<(), T::Err> {
-        self.inc.collect_into(metadata, labels, name, &mut enc.0)
-    }
-}
-
-impl<T: Encoding> Encoding for Dec<T> {
-    type Err = T::Err;
-
-    fn write_help(&mut self, name: impl MetricNameEncoder, help: &str) -> Result<(), Self::Err> {
-        self.0.write_help(name, help)
-    }
-
-    fn write_metric_value(
-        &mut self,
-        name: impl MetricNameEncoder,
-        labels: impl LabelGroup,
-        value: MetricValue,
-    ) -> Result<(), Self::Err> {
-        self.0.write_metric_value(name, labels, value)
-    }
-}
-
-/// Write the dec counter to the encoder
-impl<T: Encoding> MetricEncoding<Dec<T>> for MeasuredCounterPairState
-where
-    CounterState: MetricEncoding<T>,
-{
-    fn write_type(name: impl MetricNameEncoder, enc: &mut Dec<T>) -> Result<(), T::Err> {
-        CounterState::write_type(name, &mut enc.0)
-    }
-    fn collect_into(
-        &self,
-        metadata: &(),
-        labels: impl LabelGroup,
-        name: impl MetricNameEncoder,
-        enc: &mut Dec<T>,
-    ) -> Result<(), T::Err> {
-        self.dec.collect_into(metadata, labels, name, &mut enc.0)
-    }
-}
--- a/libs/pageserver_api/src/config.rs
+++ b/libs/pageserver_api/src/config.rs
@@ -1,31 +0,0 @@
-use std::collections::HashMap;
-
-use const_format::formatcp;
-
-#[cfg(test)]
-mod tests;
-
-pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
-pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
-pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
-pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
-
-// Certain metadata (e.g. externally-addressable name, AZ) is delivered
-// as a separate structure.  This information is not neeed by the pageserver
-// itself, it is only used for registering the pageserver with the control
-// plane and/or storage controller.
-//
-#[derive(PartialEq, Eq, Debug, serde::Serialize, serde::Deserialize)]
-pub struct NodeMetadata {
-    #[serde(rename = "host")]
-    pub postgres_host: String,
-    #[serde(rename = "port")]
-    pub postgres_port: u16,
-    pub http_host: String,
-    pub http_port: u16,
-
-    // Deployment tools may write fields to the metadata file beyond what we
-    // use in this type: this type intentionally only names fields that require.
-    #[serde(flatten)]
-    pub other: HashMap<String, serde_json::Value>,
-}
--- a/libs/pageserver_api/src/config/tests.rs
+++ b/libs/pageserver_api/src/config/tests.rs
@@ -1,22 +0,0 @@
-use super::*;
-
-#[test]
-fn test_node_metadata_v1_backward_compatibilty() {
-    let v1 = serde_json::to_vec(&serde_json::json!({
-        "host": "localhost",
-        "port": 23,
-        "http_host": "localhost",
-        "http_port": 42,
-    }));
-
-    assert_eq!(
-        serde_json::from_slice::<NodeMetadata>(&v1.unwrap()).unwrap(),
-        NodeMetadata {
-            postgres_host: "localhost".to_string(),
-            postgres_port: 23,
-            http_host: "localhost".to_string(),
-            http_port: 42,
-            other: HashMap::new(),
-        }
-    )
-}
--- a/libs/pageserver_api/src/controller_api.rs
+++ b/libs/pageserver_api/src/controller_api.rs
@@ -2,9 +2,9 @@ use std::str::FromStr;

 /// Request/response types for the storage controller
 /// API (`/control/v1` prefix).  Implemented by the server
-/// in [`storage_controller::http`]
+/// in [`attachment_service::http`]
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::NodeId;

 use crate::{
    models::{ShardParameters, TenantConfig},
@@ -68,27 +68,12 @@ pub struct TenantLocateResponse {

 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponse {
-    pub tenant_id: TenantId,
    pub shards: Vec<TenantDescribeResponseShard>,
    pub stripe_size: ShardStripeSize,
    pub policy: PlacementPolicy,
    pub config: TenantConfig,
 }

-#[derive(Serialize, Deserialize)]
-pub struct NodeDescribeResponse {
-    pub id: NodeId,
-
-    pub availability: NodeAvailabilityWrapper,
-    pub scheduling: NodeSchedulingPolicy,
-
-    pub listen_http_addr: String,
-    pub listen_http_port: u16,
-
-    pub listen_pg_addr: String,
-    pub listen_pg_port: u16,
-}
-
 #[derive(Serialize, Deserialize)]
 pub struct TenantDescribeResponseShard {
    pub tenant_shard_id: TenantShardId,
@@ -104,8 +89,6 @@ pub struct TenantDescribeResponseShard {
    pub is_pending_compute_notification: bool,
    /// A shard split is currently underway
    pub is_splitting: bool,
-
-    pub scheduling_policy: ShardSchedulingPolicy,
 }

 /// Explicitly migrating a particular shard is a low level operation
@@ -120,7 +103,7 @@ pub struct TenantShardMigrateRequest {
 /// Utilisation score indicating how good a candidate a pageserver
 /// is for scheduling the next tenant. See [`crate::models::PageserverUtilization`].
 /// Lower values are better.
-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord, Debug)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
 pub struct UtilizationScore(pub u64);

 impl UtilizationScore {
@@ -129,7 +112,7 @@ impl UtilizationScore {
    }
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
+#[derive(Serialize, Clone, Copy)]
 #[serde(into = "NodeAvailabilityWrapper")]
 pub enum NodeAvailability {
    // Normal, happy state
@@ -152,7 +135,7 @@ impl Eq for NodeAvailability {}
 // This wrapper provides serde functionality and it should only be used to
 // communicate with external callers which don't know or care about the
 // utilisation score of the pageserver it is targeting.
-#[derive(Serialize, Deserialize, Clone, Copy, Debug)]
+#[derive(Serialize, Deserialize, Clone)]
 pub enum NodeAvailabilityWrapper {
    Active,
    Offline,
@@ -178,6 +161,21 @@ impl From<NodeAvailability> for NodeAvailabilityWrapper {
    }
 }

+impl FromStr for NodeAvailability {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        match s {
+            // This is used when parsing node configuration requests from neon-local.
+            // Assume the worst possible utilisation score
+            // and let it get updated via the heartbeats.
+            "active" => Ok(Self::Active(UtilizationScore::worst())),
+            "offline" => Ok(Self::Offline),
+            _ => Err(anyhow::anyhow!("Unknown availability state '{s}'")),
+        }
+    }
+}
+
 #[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
 pub enum ShardSchedulingPolicy {
    // Normal mode: the tenant's scheduled locations may be updated at will, including
@@ -204,7 +202,7 @@ impl Default for ShardSchedulingPolicy {
    }
 }

-#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq, Debug)]
+#[derive(Serialize, Deserialize, Clone, Copy, Eq, PartialEq)]
 pub enum NodeSchedulingPolicy {
    Active,
    Filling,
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -1,6 +1,5 @@
 use anyhow::{bail, Result};
 use byteorder::{ByteOrder, BE};
-use bytes::BufMut;
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::{Oid, TransactionId};
 use serde::{Deserialize, Serialize};
@@ -22,107 +21,15 @@ pub struct Key {
    pub field6: u32,
 }

-/// The storage key size.
 pub const KEY_SIZE: usize = 18;

-/// The metadata key size. 2B fewer than the storage key size because field2 is not fully utilized.
-/// See [`Key::to_i128`] for more information on the encoding.
-pub const METADATA_KEY_SIZE: usize = 16;
-
-/// The key prefix start range for the metadata keys. All keys with the first byte >= 0x40 is a metadata key.
-pub const METADATA_KEY_BEGIN_PREFIX: u8 = 0x60;
-pub const METADATA_KEY_END_PREFIX: u8 = 0x7F;
-
-/// The (reserved) key prefix of relation sizes.
-pub const RELATION_SIZE_PREFIX: u8 = 0x61;
-
-/// The key prefix of AUX file keys.
-pub const AUX_KEY_PREFIX: u8 = 0x62;
-
-/// Check if the key falls in the range of metadata keys.
-pub const fn is_metadata_key_slice(key: &[u8]) -> bool {
-    key[0] >= METADATA_KEY_BEGIN_PREFIX && key[0] < METADATA_KEY_END_PREFIX
-}
-
 impl Key {
-    /// Check if the key falls in the range of metadata keys.
-    pub const fn is_metadata_key(&self) -> bool {
-        self.field1 >= METADATA_KEY_BEGIN_PREFIX && self.field1 < METADATA_KEY_END_PREFIX
-    }
-
-    /// Encode a metadata key to a storage key.
-    pub fn from_metadata_key_fixed_size(key: &[u8; METADATA_KEY_SIZE]) -> Self {
-        assert!(is_metadata_key_slice(key), "key not in metadata key range");
-        Key {
-            field1: key[0],
-            field2: u16::from_be_bytes(key[1..3].try_into().unwrap()) as u32,
-            field3: u32::from_be_bytes(key[3..7].try_into().unwrap()),
-            field4: u32::from_be_bytes(key[7..11].try_into().unwrap()),
-            field5: key[11],
-            field6: u32::from_be_bytes(key[12..16].try_into().unwrap()),
-        }
-    }
-
-    /// Encode a metadata key to a storage key.
-    pub fn from_metadata_key(key: &[u8]) -> Self {
-        Self::from_metadata_key_fixed_size(key.try_into().expect("expect 16 byte metadata key"))
-    }
-
-    /// Extract a metadata key to a writer. The result should always be 16 bytes.
-    pub fn extract_metadata_key_to_writer(&self, mut writer: impl BufMut) {
-        writer.put_u8(self.field1);
-        assert!(self.field2 <= 0xFFFF);
-        writer.put_u16(self.field2 as u16);
-        writer.put_u32(self.field3);
-        writer.put_u32(self.field4);
-        writer.put_u8(self.field5);
-        writer.put_u32(self.field6);
-    }
-
-    /// Get the range of metadata keys.
-    pub const fn metadata_key_range() -> Range<Self> {
-        Key {
-            field1: METADATA_KEY_BEGIN_PREFIX,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }..Key {
-            field1: METADATA_KEY_END_PREFIX,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }
-    }
-
-    /// Get the range of aux keys.
-    pub fn metadata_aux_key_range() -> Range<Self> {
-        Key {
-            field1: AUX_KEY_PREFIX,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }..Key {
-            field1: AUX_KEY_PREFIX + 1,
-            field2: 0,
-            field3: 0,
-            field4: 0,
-            field5: 0,
-            field6: 0,
-        }
-    }
-
    /// 'field2' is used to store tablespaceid for relations and small enum numbers for other relish.
    /// As long as Neon does not support tablespace (because of lack of access to local file system),
    /// we can assume that only some predefined namespace OIDs are used which can fit in u16
    pub fn to_i128(&self) -> i128 {
        assert!(self.field2 < 0xFFFF || self.field2 == 0xFFFFFFFF || self.field2 == 0x22222222);
-        (((self.field1 & 0x7F) as i128) << 120)
+        (((self.field1 & 0xf) as i128) << 120)
            | (((self.field2 & 0xFFFF) as i128) << 104)
            | ((self.field3 as i128) << 72)
            | ((self.field4 as i128) << 40)
@@ -132,7 +39,7 @@ impl Key {

    pub const fn from_i128(x: i128) -> Self {
        Key {
-            field1: ((x >> 120) & 0x7F) as u8,
+            field1: ((x >> 120) & 0xf) as u8,
            field2: ((x >> 104) & 0xFFFF) as u32,
            field3: (x >> 72) as u32,
            field4: (x >> 40) as u32,
@@ -141,11 +48,11 @@ impl Key {
        }
    }

-    pub const fn next(&self) -> Key {
+    pub fn next(&self) -> Key {
        self.add(1)
    }

-    pub const fn add(&self, x: u32) -> Key {
+    pub fn add(&self, x: u32) -> Key {
        let mut key = *self;

        let r = key.field6.overflowing_add(x);
@@ -174,8 +81,6 @@ impl Key {
        key
    }

-    /// Convert a 18B slice to a key. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::from_metadata_key`] instead.
    pub fn from_slice(b: &[u8]) -> Self {
        Key {
            field1: b[0],
@@ -187,8 +92,6 @@ impl Key {
        }
    }

-    /// Convert a key to a 18B slice. This function should not be used for metadata keys because field2 is handled differently.
-    /// Use [`Key::extract_metadata_key_to_writer`] instead.
    pub fn write_to_byte_slice(&self, buf: &mut [u8]) {
        buf[0] = self.field1;
        BE::write_u32(&mut buf[1..5], self.field2);
@@ -572,17 +475,12 @@ pub const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.

-/// Non inherited range for vectored get.
-pub const NON_INHERITED_RANGE: Range<Key> = AUX_FILES_KEY..AUX_FILES_KEY.next();
-/// Sparse keyspace range for vectored get. Missing key error will be ignored for this range.
-pub const NON_INHERITED_SPARSE_RANGE: Range<Key> = Key::metadata_key_range();
-
 // AUX_FILES currently stores only data for logical replication (slots etc), and
 // we don't preserve these on a branch because safekeepers can't follow timeline
 // switch (and generally it likely should be optional), so ignore these.
 #[inline(always)]
 pub fn is_inherited_key(key: Key) -> bool {
-    !NON_INHERITED_RANGE.contains(&key) && !NON_INHERITED_SPARSE_RANGE.contains(&key)
+    key != AUX_FILES_KEY
 }

 #[inline(always)]
@@ -658,14 +556,11 @@ impl std::str::FromStr for Key {
 mod tests {
    use std::str::FromStr;

-    use crate::key::is_metadata_key_slice;
    use crate::key::Key;

    use rand::Rng;
    use rand::SeedableRng;

-    use super::AUX_KEY_PREFIX;
-
    #[test]
    fn display_fromstr_bijection() {
        let mut rng = rand::rngs::StdRng::seed_from_u64(42);
@@ -681,16 +576,4 @@ mod tests {

        assert_eq!(key, Key::from_str(&format!("{key}")).unwrap());
    }
-
-    #[test]
-    fn test_metadata_keys() {
-        let mut metadata_key = vec![AUX_KEY_PREFIX];
-        metadata_key.extend_from_slice(&[0xFF; 15]);
-        let encoded_key = Key::from_metadata_key(&metadata_key);
-        let mut output_key = Vec::new();
-        encoded_key.extract_metadata_key_to_writer(&mut output_key);
-        assert_eq!(metadata_key, output_key);
-        assert!(encoded_key.is_metadata_key());
-        assert!(is_metadata_key_slice(&metadata_key));
-    }
 }
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -1,10 +1,7 @@
 use postgres_ffi::BLCKSZ;
 use std::ops::Range;

-use crate::{
-    key::Key,
-    shard::{ShardCount, ShardIdentity},
-};
+use crate::key::Key;
 use itertools::Itertools;

 ///
@@ -17,279 +14,44 @@ pub struct KeySpace {
    pub ranges: Vec<Range<Key>>,
 }

-/// A wrapper type for sparse keyspaces.
-#[derive(Clone, Debug, Default, PartialEq, Eq)]
-pub struct SparseKeySpace(pub KeySpace);
-
-/// Represents a contiguous half-open range of the keyspace, masked according to a particular
-/// ShardNumber's stripes: within this range of keys, only some "belong" to the current
-/// shard.
-///
-/// When we iterate over keys within this object, we will skip any keys that don't belong
-/// to this shard.
-///
-/// The start + end keys may not belong to the shard: these specify where layer files should
-/// start  + end, but we will never actually read/write those keys.
-#[derive(Clone, Debug, PartialEq, Eq)]
-pub struct ShardedRange<'a> {
-    pub shard_identity: &'a ShardIdentity,
-    pub range: Range<Key>,
-}
-
-// Calculate the size of a range within the blocks of the same relation, or spanning only the
-// top page in the previous relation's space.
-fn contiguous_range_len(range: &Range<Key>) -> u32 {
-    debug_assert!(is_contiguous_range(range));
-    if range.start.field6 == 0xffffffff {
-        range.end.field6 + 1
-    } else {
-        range.end.field6 - range.start.field6
-    }
-}
-
-/// Return true if this key range includes only keys in the same relation's data blocks, or
-/// just spanning one relation and the logical size (0xffffffff) block of the relation before it.
-///
-/// Contiguous in this context means we know the keys are in use _somewhere_, but it might not
-/// be on our shard.  Later in ShardedRange we do the extra work to figure out how much
-/// of a given contiguous range is present on one shard.
-///
-/// This matters, because:
-/// - Within such ranges, keys are used contiguously.  Outside such ranges it is sparse.
-/// - Within such ranges, we may calculate distances using simple subtraction of field6.
-fn is_contiguous_range(range: &Range<Key>) -> bool {
-    range.start.field1 == range.end.field1
-        && range.start.field2 == range.end.field2
-        && range.start.field3 == range.end.field3
-        && range.start.field4 == range.end.field4
-        && (range.start.field5 == range.end.field5
-            || (range.start.field6 == 0xffffffff && range.start.field5 + 1 == range.end.field5))
-}
-
-impl<'a> ShardedRange<'a> {
-    pub fn new(range: Range<Key>, shard_identity: &'a ShardIdentity) -> Self {
-        Self {
-            shard_identity,
-            range,
-        }
-    }
-
-    /// Break up this range into chunks, each of which has at least one local key in it if the
-    /// total range has at least one local key.
-    pub fn fragment(self, target_nblocks: u32) -> Vec<(u32, Range<Key>)> {
-        // Optimization for single-key case (e.g. logical size keys)
-        if self.range.end == self.range.start.add(1) {
-            return vec![(
-                if self.shard_identity.is_key_disposable(&self.range.start) {
-                    0
-                } else {
-                    1
-                },
-                self.range,
-            )];
-        }
-
-        if !is_contiguous_range(&self.range) {
-            // Ranges that span relations are not fragmented.  We only get these ranges as a result
-            // of operations that act on existing layers, so we trust that the existing range is
-            // reasonably small.
-            return vec![(u32::MAX, self.range)];
-        }
-
-        let mut fragments: Vec<(u32, Range<Key>)> = Vec::new();
-
-        let mut cursor = self.range.start;
-        while cursor < self.range.end {
-            let advance_by = self.distance_to_next_boundary(cursor);
-            let is_fragment_disposable = self.shard_identity.is_key_disposable(&cursor);
-
-            // If the previous fragment is undersized, then we seek to consume enough
-            // blocks to complete it.
-            let (want_blocks, merge_last_fragment) = match fragments.last_mut() {
-                Some(frag) if frag.0 < target_nblocks => (target_nblocks - frag.0, Some(frag)),
-                Some(frag) => {
-                    // Prev block is complete, want the full number.
-                    (
-                        target_nblocks,
-                        if is_fragment_disposable {
-                            // If this current range will be empty (not shard-local data), we will merge into previous
-                            Some(frag)
-                        } else {
-                            None
-                        },
-                    )
-                }
-                None => {
-                    // First iteration, want the full number
-                    (target_nblocks, None)
-                }
-            };
-
-            let advance_by = if is_fragment_disposable {
-                advance_by
-            } else {
-                std::cmp::min(advance_by, want_blocks)
-            };
-
-            let next_cursor = cursor.add(advance_by);
-
-            let this_frag = (
-                if is_fragment_disposable {
-                    0
-                } else {
-                    advance_by
-                },
-                cursor..next_cursor,
-            );
-            cursor = next_cursor;
-
-            if let Some(last_fragment) = merge_last_fragment {
-                // Previous fragment was short or this one is empty, merge into it
-                last_fragment.0 += this_frag.0;
-                last_fragment.1.end = this_frag.1.end;
-            } else {
-                fragments.push(this_frag);
-            }
-        }
-
-        fragments
-    }
-
-    /// Estimate the physical pages that are within this range, on this shard.  This returns
-    /// u32::MAX if the range spans relations: this return value should be interpreted as "large".
-    pub fn page_count(&self) -> u32 {
-        // Special cases for single keys like logical sizes
-        if self.range.end == self.range.start.add(1) {
-            return if self.shard_identity.is_key_disposable(&self.range.start) {
-                0
-            } else {
-                1
-            };
-        }
-
-        // We can only do an authentic calculation of contiguous key ranges
-        if !is_contiguous_range(&self.range) {
-            return u32::MAX;
-        }
-
-        // Special case for single sharded tenants: our logical and physical sizes are the same
-        if self.shard_identity.count < ShardCount::new(2) {
-            return contiguous_range_len(&self.range);
-        }
-
-        // Normal path: step through stripes and part-stripes in the range, evaluate whether each one belongs
-        // to Self, and add the stripe's block count to our total if so.
-        let mut result: u64 = 0;
-        let mut cursor = self.range.start;
-        while cursor < self.range.end {
-            // Count up to the next stripe_size boundary or end of range
-            let advance_by = self.distance_to_next_boundary(cursor);
-
-            // If this blocks in this stripe belong to us, add them to our count
-            if !self.shard_identity.is_key_disposable(&cursor) {
-                result += advance_by as u64;
-            }
-
-            cursor = cursor.add(advance_by);
-        }
-
-        if result > u32::MAX as u64 {
-            u32::MAX
-        } else {
-            result as u32
-        }
-    }
-
-    /// Advance the cursor to the next potential fragment boundary: this is either
-    /// a stripe boundary, or the end of the range.
-    fn distance_to_next_boundary(&self, cursor: Key) -> u32 {
-        let distance_to_range_end = contiguous_range_len(&(cursor..self.range.end));
-
-        if self.shard_identity.count < ShardCount::new(2) {
-            // Optimization: don't bother stepping through stripes if the tenant isn't sharded.
-            return distance_to_range_end;
-        }
-
-        if cursor.field6 == 0xffffffff {
-            // We are wrapping from one relation's logical size to the next relation's first data block
-            return 1;
-        }
-
-        let stripe_index = cursor.field6 / self.shard_identity.stripe_size.0;
-        let stripe_remainder = self.shard_identity.stripe_size.0
-            - (cursor.field6 - stripe_index * self.shard_identity.stripe_size.0);
-
-        if cfg!(debug_assertions) {
-            // We should never overflow field5 and field6 -- our callers check this earlier
-            // and would have returned their u32::MAX cases if the input range violated this.
-            let next_cursor = cursor.add(stripe_remainder);
-            debug_assert!(
-                next_cursor.field1 == cursor.field1
-                    && next_cursor.field2 == cursor.field2
-                    && next_cursor.field3 == cursor.field3
-                    && next_cursor.field4 == cursor.field4
-                    && next_cursor.field5 == cursor.field5
-            )
-        }
-
-        std::cmp::min(stripe_remainder, distance_to_range_end)
-    }
-
-    /// Whereas `page_count` estimates the number of pages physically in this range on this shard,
-    /// this function simply calculates the number of pages in the space, without accounting for those
-    /// pages that would not actually be stored on this node.
-    ///
-    /// Don't use this function in code that works with physical entities like layer files.
-    fn raw_size(range: &Range<Key>) -> u32 {
-        if is_contiguous_range(range) {
-            contiguous_range_len(range)
-        } else {
-            u32::MAX
-        }
-    }
-}
-
 impl KeySpace {
-    /// Create a key space with a single range.
-    pub fn single(key_range: Range<Key>) -> Self {
-        Self {
-            ranges: vec![key_range],
-        }
-    }
-
+    ///
    /// Partition a key space into roughly chunks of roughly 'target_size' bytes
    /// in each partition.
    ///
-    pub fn partition(&self, shard_identity: &ShardIdentity, target_size: u64) -> KeyPartitioning {
+    pub fn partition(&self, target_size: u64) -> KeyPartitioning {
        // Assume that each value is 8k in size.
-        let target_nblocks = (target_size / BLCKSZ as u64) as u32;
+        let target_nblocks = (target_size / BLCKSZ as u64) as usize;

        let mut parts = Vec::new();
        let mut current_part = Vec::new();
        let mut current_part_size: usize = 0;
        for range in &self.ranges {
-            // While doing partitioning, wrap the range in ShardedRange so that our size calculations
-            // will respect shard striping rather than assuming all keys within a range are present.
-            let range = ShardedRange::new(range.clone(), shard_identity);
-
-            // Chunk up the range into parts that each contain up to target_size local blocks
-            for (frag_on_shard_size, frag_range) in range.fragment(target_nblocks) {
-                // If appending the next contiguous range in the keyspace to the current
-                // partition would cause it to be too large, and our current partition
-                // covers at least one block that is physically present in this shard,
-                // then start a new partition
-                if current_part_size + frag_on_shard_size as usize > target_nblocks as usize
-                    && current_part_size > 0
-                {
-                    parts.push(KeySpace {
-                        ranges: current_part,
-                    });
-                    current_part = Vec::new();
-                    current_part_size = 0;
-                }
-                current_part.push(frag_range.start..frag_range.end);
-                current_part_size += frag_on_shard_size as usize;
+            // If appending the next contiguous range in the keyspace to the current
+            // partition would cause it to be too large, start a new partition.
+            let this_size = key_range_size(range) as usize;
+            if current_part_size + this_size > target_nblocks && !current_part.is_empty() {
+                parts.push(KeySpace {
+                    ranges: current_part,
+                });
+                current_part = Vec::new();
+                current_part_size = 0;
            }
+
+            // If the next range is larger than 'target_size', split it into
+            // 'target_size' chunks.
+            let mut remain_size = this_size;
+            let mut start = range.start;
+            while remain_size > target_nblocks {
+                let next = start.add(target_nblocks as u32);
+                parts.push(KeySpace {
+                    ranges: vec![start..next],
+                });
+                start = next;
+                remain_size -= target_nblocks
+            }
+            current_part.push(start..range.end);
+            current_part_size += remain_size;
        }

        // add last partition that wasn't full yet.
@@ -302,10 +64,6 @@ impl KeySpace {
        KeyPartitioning { parts }
    }

-    pub fn is_empty(&self) -> bool {
-        self.total_raw_size() == 0
-    }
-
    /// Merge another keyspace into the current one.
    /// Note: the keyspaces must not ovelap (enforced via assertions)
    pub fn merge(&mut self, other: &KeySpace) {
@@ -336,13 +94,12 @@ impl KeySpace {

    /// Remove all keys in `other` from `self`.
    /// This can involve splitting or removing of existing ranges.
-    /// Returns the removed keyspace
-    pub fn remove_overlapping_with(&mut self, other: &KeySpace) -> KeySpace {
+    pub fn remove_overlapping_with(&mut self, other: &KeySpace) {
        let (self_start, self_end) = match (self.start(), self.end()) {
            (Some(start), Some(end)) => (start, end),
            _ => {
                // self is empty
-                return KeySpace::default();
+                return;
            }
        };

@@ -355,37 +112,30 @@ impl KeySpace {
            .skip_while(|range| self_start >= range.end)
            .take_while(|range| self_end > range.start);

-        let mut removed_accum = KeySpaceRandomAccum::new();
        for range in other_ranges {
            while let Some(overlap_at) = self.overlaps_at(range) {
                let overlapped = self.ranges[overlap_at].clone();

                if overlapped.start < range.start && overlapped.end <= range.end {
                    // Higher part of the range is completely overlapped.
-                    removed_accum.add_range(range.start..self.ranges[overlap_at].end);
                    self.ranges[overlap_at].end = range.start;
                }
                if overlapped.start >= range.start && overlapped.end > range.end {
                    // Lower part of the range is completely overlapped.
-                    removed_accum.add_range(self.ranges[overlap_at].start..range.end);
                    self.ranges[overlap_at].start = range.end;
                }
                if overlapped.start < range.start && overlapped.end > range.end {
                    // Middle part of the range is overlapped.
-                    removed_accum.add_range(range.clone());
                    self.ranges[overlap_at].end = range.start;
                    self.ranges
                        .insert(overlap_at + 1, range.end..overlapped.end);
                }
                if overlapped.start >= range.start && overlapped.end <= range.end {
                    // Whole range is overlapped
-                    removed_accum.add_range(self.ranges[overlap_at].clone());
                    self.ranges.remove(overlap_at);
                }
            }
        }
-
-        removed_accum.to_keyspace()
    }

    pub fn start(&self) -> Option<Key> {
@@ -396,11 +146,11 @@ impl KeySpace {
        self.ranges.last().map(|range| range.end)
    }

-    /// The size of the keyspace in pages, before accounting for sharding
-    pub fn total_raw_size(&self) -> usize {
+    #[allow(unused)]
+    pub fn total_size(&self) -> usize {
        self.ranges
            .iter()
-            .map(|range| ShardedRange::raw_size(range) as usize)
+            .map(|range| key_range_size(range) as usize)
            .sum()
    }

@@ -420,11 +170,6 @@ impl KeySpace {
    pub fn overlaps(&self, range: &Range<Key>) -> bool {
        self.overlaps_at(range).is_some()
    }
-
-    /// Check if the keyspace contains a key
-    pub fn contains(&self, key: &Key) -> bool {
-        self.overlaps(&(*key..key.next()))
-    }
 }

 ///
@@ -439,33 +184,10 @@ pub struct KeyPartitioning {
    pub parts: Vec<KeySpace>,
 }

-/// Represents a partitioning of the sparse key space.
-#[derive(Clone, Debug, Default)]
-pub struct SparseKeyPartitioning {
-    pub parts: Vec<SparseKeySpace>,
-}
-
 impl KeyPartitioning {
    pub fn new() -> Self {
        KeyPartitioning { parts: Vec::new() }
    }
-
-    /// Convert a key partitioning to a sparse partition.
-    pub fn into_sparse(self) -> SparseKeyPartitioning {
-        SparseKeyPartitioning {
-            parts: self.parts.into_iter().map(SparseKeySpace).collect(),
-        }
-    }
-}
-
-impl SparseKeyPartitioning {
-    /// Note: use this function with caution. Attempt to handle a sparse keyspace in the same way as a dense keyspace will
-    /// cause long/dead loops.
-    pub fn into_dense(self) -> KeyPartitioning {
-        KeyPartitioning {
-            parts: self.parts.into_iter().map(|x| x.0).collect(),
-        }
-    }
 }

 ///
@@ -497,7 +219,7 @@ impl KeySpaceAccum {

    #[inline(always)]
    pub fn add_range(&mut self, range: Range<Key>) {
-        self.size += ShardedRange::raw_size(&range) as u64;
+        self.size += key_range_size(&range) as u64;

        match self.accum.as_mut() {
            Some(accum) => {
@@ -529,9 +251,7 @@ impl KeySpaceAccum {
        std::mem::take(self).to_keyspace()
    }

-    // The total number of keys in this object, ignoring any sharding effects that might cause some of
-    // the keys to be omitted in storage on this shard.
-    pub fn raw_size(&self) -> u64 {
+    pub fn size(&self) -> u64 {
        self.size
    }
 }
@@ -587,19 +307,36 @@ impl KeySpaceRandomAccum {
    }
 }

+#[inline(always)]
+pub fn key_range_size(key_range: &Range<Key>) -> u32 {
+    let start = key_range.start;
+    let end = key_range.end;
+    // Endpoints that differ in any of field1..field4 are not measured: report u32::MAX.
+    if end.field1 != start.field1
+        || end.field2 != start.field2
+        || end.field3 != start.field3
+        || end.field4 != start.field4
+    {
+        return u32::MAX;
+    }
+    // Pack field5 (shifted left by 32 bits) and field6 into one u64 per endpoint.
+    let start = (start.field5 as u64) << 32 | start.field6 as u64;
+    let end = (end.field5 as u64) << 32 | end.field6 as u64;
+    // NOTE(review): assumes end >= start; confirm callers never pass a reversed range.
+    let diff = end - start;
+    if diff > u32::MAX as u64 {
+        u32::MAX
+    } else {
+        diff as u32
+    }
+}
+
 pub fn singleton_range(key: Key) -> Range<Key> {
    key..key.next()
 }

 #[cfg(test)]
 mod tests {
-    use rand::{RngCore, SeedableRng};
-
-    use crate::{
-        models::ShardParameters,
-        shard::{ShardCount, ShardNumber},
-    };
-
    use super::*;
    use std::fmt::Write;

@@ -642,17 +379,14 @@ mod tests {
            accum.add_range(range.clone());
        }

-        let expected_size: u64 = ranges
-            .iter()
-            .map(|r| ShardedRange::raw_size(r) as u64)
-            .sum();
-        assert_eq!(accum.raw_size(), expected_size);
+        let expected_size: u64 = ranges.iter().map(|r| key_range_size(r) as u64).sum();
+        assert_eq!(accum.size(), expected_size);

        assert_ks_eq(&accum.consume_keyspace(), ranges.clone());
-        assert_eq!(accum.raw_size(), 0);
+        assert_eq!(accum.size(), 0);

        assert_ks_eq(&accum.consume_keyspace(), vec![]);
-        assert_eq!(accum.raw_size(), 0);
+        assert_eq!(accum.size(), 0);

        for range in &ranges {
            accum.add_range(range.clone());
@@ -819,16 +553,7 @@ mod tests {
                Key::from_i128(11)..Key::from_i128(13),
            ],
        };
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace {
-            ranges: vec![
-                Key::from_i128(2)..Key::from_i128(3),
-                Key::from_i128(6)..Key::from_i128(7),
-                Key::from_i128(11)..Key::from_i128(12),
-            ],
-        };
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -858,17 +583,7 @@ mod tests {
                Key::from_i128(14)..Key::from_i128(17),
            ],
        };
-
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace {
-            ranges: vec![
-                Key::from_i128(3)..Key::from_i128(5),
-                Key::from_i128(8)..Key::from_i128(10),
-                Key::from_i128(14)..Key::from_i128(15),
-            ],
-        };
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -895,11 +610,7 @@ mod tests {
                Key::from_i128(15)..Key::from_i128(17),
            ],
        };
-
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace::default();
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -926,17 +637,7 @@ mod tests {
        let key_space2 = KeySpace {
            ranges: vec![Key::from_i128(9)..Key::from_i128(19)],
        };
-
-        let removed = key_space1.remove_overlapping_with(&key_space2);
-        let removed_expected = KeySpace {
-            ranges: vec![
-                Key::from_i128(9)..Key::from_i128(10),
-                Key::from_i128(12)..Key::from_i128(15),
-                Key::from_i128(17)..Key::from_i128(19),
-            ],
-        };
-        assert_eq!(removed, removed_expected);
-
+        key_space1.remove_overlapping_with(&key_space2);
        assert_eq!(
            key_space1.ranges,
            vec![
@@ -949,412 +650,4 @@ mod tests {
            ]
        );
    }
-    #[test]
-    fn sharded_range_relation_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        let range = ShardedRange::new(
-            Range {
-                start: Key::from_hex("000000067F00000005000040100300000000").unwrap(),
-                end: Key::from_hex("000000067F00000005000040130000004000").unwrap(),
-            },
-            &shard_identity,
-        );
-
-        // Key range spans relations, expect MAX
-        assert_eq!(range.page_count(), u32::MAX);
-    }
-
-    #[test]
-    fn shard_identity_keyspaces_single_key() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        let range = ShardedRange::new(
-            Range {
-                start: Key::from_hex("000000067f000000010000007000ffffffff").unwrap(),
-                end: Key::from_hex("000000067f00000001000000700100000000").unwrap(),
-            },
-            &shard_identity,
-        );
-        // Single-key range on logical size key
-        assert_eq!(range.page_count(), 1);
-    }
-
-    /// Test the helper that we use to identify ranges which go outside the data blocks of a single relation
-    #[test]
-    fn contiguous_range_check() {
-        assert!(!is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0100000003").unwrap())
-        ),);
-
-        // The ranges goes all the way up to the 0xffffffff, including it: this is
-        // not considered a rel block range because 0xffffffff stores logical sizes,
-        // not blocks.
-        assert!(!is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df00fffffffe").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0100000000").unwrap())
-        ),);
-
-        // Keys within the normal data region of a relation
-        assert!(is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df0000000000").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0000000080").unwrap())
-        ),);
-
-        // The logical size key of one forkno, then some blocks in the next
-        assert!(is_contiguous_range(
-            &(Key::from_hex("000000067f00000001000004df00ffffffff").unwrap()
-                ..Key::from_hex("000000067f00000001000004df0100000080").unwrap())
-        ),);
-    }
-
-    #[test]
-    fn shard_identity_keyspaces_forkno_gap() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        let range = ShardedRange::new(
-            Range {
-                start: Key::from_hex("000000067f00000001000004df00fffffffe").unwrap(),
-                end: Key::from_hex("000000067f00000001000004df0100000003").unwrap(),
-            },
-            &shard_identity,
-        );
-
-        // Range spanning the end of one forkno and the start of the next: we do not attempt to
-        // calculate a valid size, because we have no way to know if they keys between start
-        // and end are actually in use.
-        assert_eq!(range.page_count(), u32::MAX);
-    }
-
-    #[test]
-    fn shard_identity_keyspaces_one_relation() {
-        for shard_number in 0..4 {
-            let shard_identity = ShardIdentity::new(
-                ShardNumber(shard_number),
-                ShardCount::new(4),
-                ShardParameters::DEFAULT_STRIPE_SIZE,
-            )
-            .unwrap();
-
-            let range = ShardedRange::new(
-                Range {
-                    start: Key::from_hex("000000067f00000001000000ae0000000000").unwrap(),
-                    end: Key::from_hex("000000067f00000001000000ae0000000001").unwrap(),
-                },
-                &shard_identity,
-            );
-
-            // Very simple case: range covering block zero of one relation, where that block maps to shard zero
-            if shard_number == 0 {
-                assert_eq!(range.page_count(), 1);
-            } else {
-                // Other shards should perceive the range's size as zero
-                assert_eq!(range.page_count(), 0);
-            }
-        }
-    }
-
-    /// Test helper: construct a ShardedRange and call fragment() on it, returning
-    /// the total page count in the range and the fragments.
-    fn do_fragment(
-        range_start: Key,
-        range_end: Key,
-        shard_identity: &ShardIdentity,
-        target_nblocks: u32,
-    ) -> (u32, Vec<(u32, Range<Key>)>) {
-        let range = ShardedRange::new(
-            Range {
-                start: range_start,
-                end: range_end,
-            },
-            shard_identity,
-        );
-
-        let page_count = range.page_count();
-        let fragments = range.fragment(target_nblocks);
-
-        // Invariant: we always get at least one fragment
-        assert!(!fragments.is_empty());
-
-        // Invariant: the first/last fragment start/end should equal the input start/end
-        assert_eq!(fragments.first().unwrap().1.start, range_start);
-        assert_eq!(fragments.last().unwrap().1.end, range_end);
-
-        if page_count > 0 {
-            // Invariant: every fragment must contain at least one shard-local page, if the
-            // total range contains at least one shard-local page
-            let all_nonzero = fragments.iter().all(|f| f.0 > 0);
-            if !all_nonzero {
-                eprintln!("Found a zero-length fragment: {:?}", fragments);
-            }
-            assert!(all_nonzero);
-        } else {
-            // A range with no shard-local pages should always be returned as a single fragment
-            assert_eq!(fragments, vec![(0, range_start..range_end)]);
-        }
-
-        // Invariant: fragments must be ordered and non-overlapping
-        let mut last: Option<Range<Key>> = None;
-        for frag in &fragments {
-            if let Some(last) = last {
-                assert!(frag.1.start >= last.end);
-                assert!(frag.1.start > last.start);
-            }
-            last = Some(frag.1.clone())
-        }
-
-        // Invariant: fragments respect target_nblocks
-        for frag in &fragments {
-            assert!(frag.0 == u32::MAX || frag.0 <= target_nblocks);
-        }
-
-        (page_count, fragments)
-    }
-
-    /// Really simple tests for fragment(), on a range that just contains a single stripe
-    /// for a single tenant.
-    #[test]
-    fn sharded_range_fragment_simple() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        // A range which we happen to know covers exactly one stripe which belongs to this shard
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000008000").unwrap();
-
-        // Ask for stripe_size blocks, we get the whole stripe
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 32768),
-            (32768, vec![(32768, input_start..input_end)])
-        );
-
-        // Ask for more, we still get the whole stripe
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 10000000),
-            (32768, vec![(32768, input_start..input_end)])
-        );
-
-        // Ask for target_nblocks of half the stripe size, we get two halves
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16384),
-            (
-                32768,
-                vec![
-                    (16384, input_start..input_start.add(16384)),
-                    (16384, input_start.add(16384)..input_end)
-                ]
-            )
-        );
-    }
-
-    #[test]
-    fn sharded_range_fragment_multi_stripe() {
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-
-        // A range which covers multiple stripes, exactly one of which belongs to the current shard.
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000020000").unwrap();
-        // Ask for all the blocks, get a fragment that covers the whole range but reports
-        // its size to be just the blocks belonging to our shard.
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 131072),
-            (32768, vec![(32768, input_start..input_end)])
-        );
-
-        // Ask for a sub-stripe quantity
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16000),
-            (
-                32768,
-                vec![
-                    (16000, input_start..input_start.add(16000)),
-                    (16000, input_start.add(16000)..input_start.add(32000)),
-                    (768, input_start.add(32000)..input_end),
-                ]
-            )
-        );
-
-        // Try on a range that starts slightly after our owned stripe
-        assert_eq!(
-            do_fragment(input_start.add(1), input_end, &shard_identity, 131072),
-            (32767, vec![(32767, input_start.add(1)..input_end)])
-        );
-    }
-
-    /// Test our calculations work correctly when we start a range from the logical size key of
-    /// a previous relation.
-    #[test]
-    fn sharded_range_fragment_starting_from_logical_size() {
-        let input_start = Key::from_hex("000000067f00000001000000ae00ffffffff").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0100008000").unwrap();
-
-        // Shard 0 owns the first stripe in the relation, and the preceding logical size is shard local too
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x8001, vec![(0x8001, input_start..input_end)])
-        );
-
-        // Shard 1 does not own the first stripe in the relation, but it does own the logical size (all shards
-        // store all logical sizes)
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(1),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x10000),
-            (0x1, vec![(0x1, input_start..input_end)])
-        );
-    }
-
-    /// Test that ShardedRange behaves properly when used on un-sharded data
-    #[test]
-    fn sharded_range_fragment_unsharded() {
-        let shard_identity = ShardIdentity::unsharded();
-
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000067f00000001000000ae0000010000").unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x8000),
-            (
-                0x10000,
-                vec![
-                    (0x8000, input_start..input_start.add(0x8000)),
-                    (0x8000, input_start.add(0x8000)..input_start.add(0x10000))
-                ]
-            )
-        );
-    }
-
-    #[test]
-    fn sharded_range_fragment_cross_relation() {
-        let shard_identity = ShardIdentity::unsharded();
-
-        // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
-        let input_start = Key::from_hex("000000067f00000001000000ae0000000000").unwrap();
-        let input_end = Key::from_hex("000000068f00000001000000ae0000010000").unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x8000),
-            (u32::MAX, vec![(u32::MAX, input_start..input_end),])
-        );
-
-        // Same, but using a sharded identity
-        let shard_identity = ShardIdentity::new(
-            ShardNumber(0),
-            ShardCount::new(4),
-            ShardParameters::DEFAULT_STRIPE_SIZE,
-        )
-        .unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 0x8000),
-            (u32::MAX, vec![(u32::MAX, input_start..input_end),])
-        );
-    }
-
-    #[test]
-    fn sharded_range_fragment_tiny_nblocks() {
-        let shard_identity = ShardIdentity::unsharded();
-
-        // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
-        let input_start = Key::from_hex("000000067F00000001000004E10000000000").unwrap();
-        let input_end = Key::from_hex("000000067F00000001000004E10000000038").unwrap();
-        assert_eq!(
-            do_fragment(input_start, input_end, &shard_identity, 16),
-            (
-                0x38,
-                vec![
-                    (16, input_start..input_start.add(16)),
-                    (16, input_start.add(16)..input_start.add(32)),
-                    (16, input_start.add(32)..input_start.add(48)),
-                    (8, input_start.add(48)..input_end),
-                ]
-            )
-        );
-    }
-
-    #[test]
-    fn sharded_range_fragment_fuzz() {
-        // Use a fixed seed: we don't want to explicitly pick values, but we do want
-        // the test to be reproducible.
-        let mut prng = rand::rngs::StdRng::seed_from_u64(0xdeadbeef);
-
-        for _i in 0..1000 {
-            let shard_identity = if prng.next_u32() % 2 == 0 {
-                ShardIdentity::unsharded()
-            } else {
-                let shard_count = prng.next_u32() % 127 + 1;
-                ShardIdentity::new(
-                    ShardNumber((prng.next_u32() % shard_count) as u8),
-                    ShardCount::new(shard_count as u8),
-                    ShardParameters::DEFAULT_STRIPE_SIZE,
-                )
-                .unwrap()
-            };
-
-            let target_nblocks = prng.next_u32() % 65536 + 1;
-
-            let start_offset = prng.next_u32() % 16384;
-
-            // Try ranges up to 4GiB in size, that are always at least 1
-            let range_size = prng.next_u32() % 8192 + 1;
-
-            // A range that spans relations: expect fragmentation to give up and return a u32::MAX size
-            let input_start = Key::from_hex("000000067F00000001000004E10000000000")
-                .unwrap()
-                .add(start_offset);
-            let input_end = input_start.add(range_size);
-
-            // This test's main success conditions are the invariants baked into do_fragment
-            let (_total_size, fragments) =
-                do_fragment(input_start, input_end, &shard_identity, target_nblocks);
-
-            // Pick a random key within the range and check it appears in the output
-            let example_key = input_start.add(prng.next_u32() % range_size);
-
-            // Panic on unwrap if it isn't found
-            let example_key_frag = fragments
-                .iter()
-                .find(|f| f.1.contains(&example_key))
-                .unwrap();
-
-            // Check that the fragment containing our random key has a nonzero size if
-            // that key is shard-local
-            let example_key_local = !shard_identity.is_key_disposable(&example_key);
-            if example_key_local {
-                assert!(example_key_frag.0 > 0);
-            }
-        }
-    }
 }
--- a/libs/pageserver_api/src/lib.rs
+++ b/libs/pageserver_api/src/lib.rs
@@ -1,5 +1,6 @@
 #![deny(unsafe_code)]
 #![deny(clippy::undocumented_unsafe_blocks)]
+use const_format::formatcp;

 pub mod controller_api;
 pub mod key;
@@ -10,4 +11,7 @@ pub mod shard;
 /// Public API types
 pub mod upcall_api;

-pub mod config;
+pub const DEFAULT_PG_LISTEN_PORT: u16 = 64000;
+pub const DEFAULT_PG_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_PG_LISTEN_PORT}");
+pub const DEFAULT_HTTP_LISTEN_PORT: u16 = 9898;
+pub const DEFAULT_HTTP_LISTEN_ADDR: &str = formatcp!("127.0.0.1:{DEFAULT_HTTP_LISTEN_PORT}");
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -20,7 +20,6 @@ use utils::{
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
-    serde_system_time,
 };

 use crate::controller_api::PlacementPolicy;
@@ -303,7 +302,6 @@ pub struct TenantConfig {
    pub lazy_slru_download: Option<bool>,
    pub timeline_get_throttle: Option<ThrottleConfig>,
    pub image_layer_creation_check_threshold: Option<u8>,
-    pub switch_to_aux_file_v2: Option<bool>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -430,6 +428,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
+    pub tenant_id: Option<TenantShardId>,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -747,18 +746,10 @@ pub struct TimelineGcRequest {
    pub gc_horizon: Option<u64>,
 }

-#[derive(Debug, Clone, Serialize, Deserialize)]
-pub struct WalRedoManagerProcessStatus {
-    pub pid: u32,
-    /// The strum-generated `into::<&'static str>()` for `pageserver::walredo::ProcessKind`.
-    /// `ProcessKind` are a transitory thing, so, they have no enum representation in `pageserver_api`.
-    pub kind: Cow<'static, str>,
-}
-
 #[derive(Debug, Clone, Serialize, Deserialize)]
 pub struct WalRedoManagerStatus {
    pub last_redo_at: Option<chrono::DateTime<chrono::Utc>>,
-    pub process: Option<WalRedoManagerProcessStatus>,
+    pub pid: Option<u32>,
 }

 /// The progress of a secondary tenant is mostly useful when doing a long running download: e.g. initiating
@@ -767,7 +758,11 @@ pub struct WalRedoManagerStatus {
 #[derive(Default, Debug, Serialize, Deserialize, Clone)]
 pub struct SecondaryProgress {
    /// The remote storage LastModified time of the heatmap object we last downloaded.
-    pub heatmap_mtime: Option<serde_system_time::SystemTime>,
+    #[serde(
+        serialize_with = "opt_ser_rfc3339_millis",
+        deserialize_with = "opt_deser_rfc3339_millis"
+    )]
+    pub heatmap_mtime: Option<SystemTime>,

    /// The number of layers currently on-disk
    pub layers_downloaded: usize,
@@ -780,15 +775,27 @@ pub struct SecondaryProgress {
    pub bytes_total: u64,
 }

-#[derive(Serialize, Deserialize, Debug)]
-pub struct TenantScanRemoteStorageShard {
-    pub tenant_shard_id: TenantShardId,
-    pub generation: Option<u32>,
+fn opt_ser_rfc3339_millis<S: serde::Serializer>(
+    ts: &Option<SystemTime>,
+    serializer: S,
+) -> Result<S::Ok, S::Error> {
+    match ts {
+        Some(ts) => serializer.collect_str(&humantime::format_rfc3339_millis(*ts)),
+        None => serializer.serialize_none(),
+    }
 }

-#[derive(Serialize, Deserialize, Debug, Default)]
-pub struct TenantScanRemoteStorageResponse {
-    pub shards: Vec<TenantScanRemoteStorageShard>,
+fn opt_deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<Option<SystemTime>, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let s: Option<String> = serde::de::Deserialize::deserialize(deserializer)?;
+    match s {
+        None => Ok(None),
+        Some(s) => humantime::parse_rfc3339(&s)
+            .map_err(serde::de::Error::custom)
+            .map(Some),
+    }
 }

 pub mod virtual_file {
@@ -858,72 +865,39 @@ impl TryFrom<u8> for PagestreamBeMessageTag {
    }
 }

-// In the V2 protocol version, a GetPage request contains two LSN values:
-//
-// request_lsn: Get the page version at this point in time.  Lsn::Max is a special value that means
-// "get the latest version present". It's used by the primary server, which knows that no one else
-// is writing WAL. 'not_modified_since' must be set to a proper value even if request_lsn is
-// Lsn::Max. Standby servers use the current replay LSN as the request LSN.
-//
-// not_modified_since: Hint to the pageserver that the client knows that the page has not been
-// modified between 'not_modified_since' and the request LSN. It's always correct to set
-// 'not_modified_since equal' to 'request_lsn' (unless Lsn::Max is used as the 'request_lsn'), but
-// passing an earlier LSN can speed up the request, by allowing the pageserver to process the
-// request without waiting for 'request_lsn' to arrive.
-//
-// The legacy V1 interface contained only one LSN, and a boolean 'latest' flag. The V1 interface was
-// sufficient for the primary; the 'lsn' was equivalent to the 'not_modified_since' value, and
-// 'latest' was set to true. The V2 interface was added because there was no correct way for a
-// standby to request a page at a particular non-latest LSN, and also include the
-// 'not_modified_since' hint. That led to an awkward choice of either using an old LSN in the
-// request, if the standby knows that the page hasn't been modified since, and risk getting an error
-// if that LSN has fallen behind the GC horizon, or requesting the current replay LSN, which could
-// require the pageserver unnecessarily to wait for the WAL to arrive up to that point. The new V2
-// interface allows sending both LSNs, and let the pageserver do the right thing. There is no
-// difference in the responses between V1 and V2.
-//
-// The Request structs below reflect the V2 interface. If V1 is used, the parse function
-// maps the old format requests to the new format.
-//
-#[derive(Clone, Copy)]
-pub enum PagestreamProtocolVersion {
-    V1,
-    V2,
-}
-
 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamExistsRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub rel: RelTag,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamNblocksRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub rel: RelTag,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetPageRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub rel: RelTag,
    pub blkno: u32,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamDbSizeRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub dbnode: u32,
 }

 #[derive(Debug, PartialEq, Eq)]
 pub struct PagestreamGetSlruSegmentRequest {
-    pub request_lsn: Lsn,
-    pub not_modified_since: Lsn,
+    pub latest: bool,
+    pub lsn: Lsn,
    pub kind: u8,
    pub segno: u32,
 }
@@ -970,16 +944,14 @@ pub struct TenantHistorySize {
 }

 impl PagestreamFeMessage {
-    /// Serialize a compute -> pageserver message. This is currently only used in testing
-    /// tools. Always uses protocol version 2.
    pub fn serialize(&self) -> Bytes {
        let mut bytes = BytesMut::new();

        match self {
            Self::Exists(req) => {
                bytes.put_u8(0);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -988,8 +960,8 @@ impl PagestreamFeMessage {

            Self::Nblocks(req) => {
                bytes.put_u8(1);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -998,8 +970,8 @@ impl PagestreamFeMessage {

            Self::GetPage(req) => {
                bytes.put_u8(2);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.rel.spcnode);
                bytes.put_u32(req.rel.dbnode);
                bytes.put_u32(req.rel.relnode);
@@ -1009,15 +981,15 @@ impl PagestreamFeMessage {

            Self::DbSize(req) => {
                bytes.put_u8(3);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u32(req.dbnode);
            }

            Self::GetSlruSegment(req) => {
                bytes.put_u8(4);
-                bytes.put_u64(req.request_lsn.0);
-                bytes.put_u64(req.not_modified_since.0);
+                bytes.put_u8(u8::from(req.latest));
+                bytes.put_u64(req.lsn.0);
                bytes.put_u8(req.kind);
                bytes.put_u32(req.segno);
            }
@@ -1026,40 +998,18 @@ impl PagestreamFeMessage {
        bytes.into()
    }

-    pub fn parse<R: std::io::Read>(
-        body: &mut R,
-        protocol_version: PagestreamProtocolVersion,
-    ) -> anyhow::Result<PagestreamFeMessage> {
+    pub fn parse<R: std::io::Read>(body: &mut R) -> anyhow::Result<PagestreamFeMessage> {
+        // TODO these gets can fail
+
        // these correspond to the NeonMessageTag enum in pagestore_client.h
        //
        // TODO: consider using protobuf or serde bincode for less error prone
        // serialization.
        let msg_tag = body.read_u8()?;
-
-        let (request_lsn, not_modified_since) = match protocol_version {
-            PagestreamProtocolVersion::V2 => (
-                Lsn::from(body.read_u64::<BigEndian>()?),
-                Lsn::from(body.read_u64::<BigEndian>()?),
-            ),
-            PagestreamProtocolVersion::V1 => {
-                // In the old protocol, each message starts with a boolean 'latest' flag,
-                // followed by 'lsn'. Convert that to the two LSNs, 'request_lsn' and
-                // 'not_modified_since', used in the new protocol version.
-                let latest = body.read_u8()? != 0;
-                let request_lsn = Lsn::from(body.read_u64::<BigEndian>()?);
-                if latest {
-                    (Lsn::MAX, request_lsn) // get latest version
-                } else {
-                    (request_lsn, request_lsn) // get version at specified LSN
-                }
-            }
-        };
-
-        // The rest of the messages are the same between V1 and V2
        match msg_tag {
            0 => Ok(PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1068,8 +1018,8 @@ impl PagestreamFeMessage {
                },
            })),
            1 => Ok(PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1078,8 +1028,8 @@ impl PagestreamFeMessage {
                },
            })),
            2 => Ok(PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                rel: RelTag {
                    spcnode: body.read_u32::<BigEndian>()?,
                    dbnode: body.read_u32::<BigEndian>()?,
@@ -1089,14 +1039,14 @@ impl PagestreamFeMessage {
                blkno: body.read_u32::<BigEndian>()?,
            })),
            3 => Ok(PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                request_lsn,
-                not_modified_since,
+                latest: body.read_u8()? != 0,
+                lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                dbnode: body.read_u32::<BigEndian>()?,
            })),
            4 => Ok(PagestreamFeMessage::GetSlruSegment(
                PagestreamGetSlruSegmentRequest {
-                    request_lsn,
-                    not_modified_since,
+                    latest: body.read_u8()? != 0,
+                    lsn: Lsn::from(body.read_u64::<BigEndian>()?),
                    kind: body.read_u8()?,
                    segno: body.read_u32::<BigEndian>()?,
                },
@@ -1224,8 +1174,8 @@ mod tests {
        // Test serialization/deserialization of PagestreamFeMessage
        let messages = vec![
            PagestreamFeMessage::Exists(PagestreamExistsRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(3),
+                latest: true,
+                lsn: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1234,8 +1184,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::Nblocks(PagestreamNblocksRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(4),
+                latest: false,
+                lsn: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1244,8 +1194,8 @@ mod tests {
                },
            }),
            PagestreamFeMessage::GetPage(PagestreamGetPageRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(3),
+                latest: true,
+                lsn: Lsn(4),
                rel: RelTag {
                    forknum: 1,
                    spcnode: 2,
@@ -1255,16 +1205,14 @@ mod tests {
                blkno: 7,
            }),
            PagestreamFeMessage::DbSize(PagestreamDbSizeRequest {
-                request_lsn: Lsn(4),
-                not_modified_since: Lsn(3),
+                latest: true,
+                lsn: Lsn(4),
                dbnode: 7,
            }),
        ];
        for msg in messages {
            let bytes = msg.serialize();
-            let reconstructed =
-                PagestreamFeMessage::parse(&mut bytes.reader(), PagestreamProtocolVersion::V2)
-                    .unwrap();
+            let reconstructed = PagestreamFeMessage::parse(&mut bytes.reader()).unwrap();
            assert!(msg == reconstructed);
        }
    }
--- a/libs/pageserver_api/src/models/partitioning.rs
+++ b/libs/pageserver_api/src/models/partitioning.rs
@@ -1,11 +1,9 @@
 use utils::lsn::Lsn;

-use crate::keyspace::SparseKeySpace;
-
 #[derive(Debug, PartialEq, Eq)]
 pub struct Partitioning {
    pub keys: crate::keyspace::KeySpace,
-    pub sparse_keys: crate::keyspace::SparseKeySpace,
+
    pub at_lsn: Lsn,
 }

@@ -34,8 +32,6 @@ impl serde::Serialize for Partitioning {
        let mut map = serializer.serialize_map(Some(2))?;
        map.serialize_key("keys")?;
        map.serialize_value(&KeySpace(&self.keys))?;
-        map.serialize_key("sparse_keys")?;
-        map.serialize_value(&KeySpace(&self.sparse_keys.0))?;
        map.serialize_key("at_lsn")?;
        map.serialize_value(&WithDisplay(&self.at_lsn))?;
        map.end()
@@ -103,7 +99,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        #[derive(serde::Deserialize)]
        struct De {
            keys: KeySpace,
-            sparse_keys: KeySpace,
            #[serde_as(as = "serde_with::DisplayFromStr")]
            at_lsn: Lsn,
        }
@@ -112,7 +107,6 @@ impl<'a> serde::Deserialize<'a> for Partitioning {
        Ok(Self {
            at_lsn: de.at_lsn,
            keys: de.keys.0,
-            sparse_keys: SparseKeySpace(de.sparse_keys.0),
        })
    }
 }
@@ -139,12 +133,6 @@ mod tests {
                "030000000000000000000000000000000003"
              ]
            ],
-            "sparse_keys": [
-              [
-                "620000000000000000000000000000000000",
-                "620000000000000000000000000000000003"
-              ]
-            ],
            "at_lsn": "0/2240160"
        }
        "#;
--- a/libs/pageserver_api/src/models/utilization.rs
+++ b/libs/pageserver_api/src/models/utilization.rs
@@ -1,4 +1,4 @@
-use utils::serde_system_time::SystemTime;
+use std::time::SystemTime;

 /// Pageserver current utilization and scoring for how good candidate the pageserver would be for
 /// the next tenant.
@@ -21,9 +21,28 @@ pub struct PageserverUtilization {
    /// When was this snapshot captured, pageserver local time.
    ///
    /// Use millis to give confidence that the value is regenerated often enough.
+    #[serde(
+        serialize_with = "ser_rfc3339_millis",
+        deserialize_with = "deser_rfc3339_millis"
+    )]
    pub captured_at: SystemTime,
 }

+fn ser_rfc3339_millis<S: serde::Serializer>(
+    ts: &SystemTime,
+    serializer: S,
+) -> Result<S::Ok, S::Error> {
+    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
+}
+
+fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<SystemTime, D::Error>
+where
+    D: serde::de::Deserializer<'de>,
+{
+    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
+    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
+}
+
 /// openapi knows only `format: int64`, so avoid outputting a non-parseable value by generated clients.
 ///
 /// Instead of newtype, use this because a newtype would get require handling deserializing values
@@ -50,9 +69,7 @@ mod tests {
            disk_usage_bytes: u64::MAX,
            free_space_bytes: 0,
            utilization_score: u64::MAX,
-            captured_at: SystemTime(
-                std::time::SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
-            ),
+            captured_at: SystemTime::UNIX_EPOCH + Duration::from_secs(1708509779),
        };

        let s = serde_json::to_string(&doc).unwrap();
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -5,99 +5,21 @@ use crate::{
    models::ShardParameters,
 };
 use hex::FromHex;
-use postgres_ffi::relfile_utils::INIT_FORKNUM;
 use serde::{Deserialize, Serialize};
 use utils::id::TenantId;

-/// See docs/rfcs/031-sharding-static.md for an overview of sharding.
-///
-/// This module contains a variety of types used to represent the concept of sharding
-/// a Neon tenant across multiple physical shards.  Since there are quite a few of these,
-/// we provide an summary here.
-///
-/// Types used to describe shards:
-/// - [`ShardCount`] describes how many shards make up a tenant, plus the magic `unsharded` value
-///   which identifies a tenant which is not shard-aware.  This means its storage paths do not include
-///   a shard suffix.
-/// - [`ShardNumber`] is simply the zero-based index of a shard within a tenant.
-/// - [`ShardIndex`] is the 2-tuple of `ShardCount` and `ShardNumber`, it's just like a `TenantShardId`
-///   without the tenant ID.  This is useful for things that are implicitly scoped to a particular
-///   tenant, such as layer files.
-/// - [`ShardIdentity`]` is the full description of a particular shard's parameters, in sufficient
-///   detail to convert a [`Key`] to a [`ShardNumber`] when deciding where to write/read.
-/// - The [`ShardSlug`] is a terse formatter for ShardCount and ShardNumber, written as
-///   four hex digits.  An unsharded tenant is `0000`.
-/// - [`TenantShardId`] is the unique ID of a particular shard within a particular tenant
-///
-/// Types used to describe the parameters for data distribution in a sharded tenant:
-/// - [`ShardStripeSize`] controls how long contiguous runs of [`Key`]s (stripes) are when distributed across
-///   multiple shards.  Its value is given in 8kiB pages.
-/// - [`ShardLayout`] describes the data distribution scheme, and at time of writing is
-///   always zero: this is provided for future upgrades that might introduce different
-///   data distribution schemes.
-///
-/// Examples:
-/// - A legacy unsharded tenant has one shard with ShardCount(0), ShardNumber(0), and its slug is 0000
-/// - A single sharded tenant has one shard with ShardCount(1), ShardNumber(0), and its slug is 0001
-/// - In a tenant with 4 shards, each shard has ShardCount(N), ShardNumber(i) where i in 0..N-1 (inclusive),
-///   and their slugs are 0004, 0104, 0204, and 0304.
-
 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardNumber(pub u8);

 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardCount(u8);

-/// Combination of ShardNumber and ShardCount.  For use within the context of a particular tenant,
-/// when we need to know which shard we're dealing with, but do not need to know the full
-/// ShardIdentity (because we won't be doing any page->shard mapping), and do not need to know
-/// the fully qualified TenantShardId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct ShardIndex {
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
-/// The ShardIdentity contains enough information to map a [`Key`] to a [`ShardNumber`],
-/// and to check whether that [`ShardNumber`] is the same as the current shard.
-#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
-pub struct ShardIdentity {
-    pub number: ShardNumber,
-    pub count: ShardCount,
-    pub stripe_size: ShardStripeSize,
-    layout: ShardLayout,
-}
-
-/// Formatting helper, for generating the `shard_id` label in traces.
-struct ShardSlug<'a>(&'a TenantShardId);
-
-/// TenantShardId globally identifies a particular shard in a particular tenant.
-///
-/// These are written as `<TenantId>-<ShardSlug>`, for example:
-///   # The second shard in a two-shard tenant
-///   072f1291a5310026820b2fe4b2968934-0102
-///
-/// If the `ShardCount` is _unsharded_, the `TenantShardId` is written without
-/// a shard suffix and is equivalent to the encoding of a `TenantId`: this enables
-/// an unsharded [`TenantShardId`] to be used interchangably with a [`TenantId`].
-///
-/// The human-readable encoding of an unsharded TenantShardId, such as used in API URLs,
-/// is both forward and backward compatible with TenantId: a legacy TenantId can be
-/// decoded as a TenantShardId, and when re-encoded it will be parseable
-/// as a TenantId.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
-pub struct TenantShardId {
-    pub tenant_id: TenantId,
-    pub shard_number: ShardNumber,
-    pub shard_count: ShardCount,
-}
-
 impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);

    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
    /// legacy format for TenantShardId that excludes the shard suffix", also known
-    /// as [`TenantShardId::unsharded`].
+    /// as `TenantShardId::unsharded`.
    ///
    /// This method returns the actual number of shards, i.e. if our internal value is
    /// zero, we return 1 (unsharded tenants have 1 shard).
@@ -116,9 +38,6 @@ impl ShardCount {
        self.0
    }

-    /// Whether the `ShardCount` is for an unsharded tenant, so uses one shard but
-    /// uses the legacy format for `TenantShardId`. See also the documentation for
-    /// [`Self::count`].
    pub fn is_unsharded(&self) -> bool {
        self.0 == 0
    }
@@ -134,6 +53,33 @@ impl ShardNumber {
    pub const MAX: Self = Self(u8::MAX);
 }

+/// TenantShardId identifies the units of work for the Pageserver.
+///
+/// These are written as `<tenant_id>-<shard number><shard-count>`, for example:
+///
+///   # The second shard in a two-shard tenant
+///   072f1291a5310026820b2fe4b2968934-0102
+///
+/// Historically, tenants could not have multiple shards, and were identified
+/// by TenantId.  To support this, TenantShardId has a special legacy
+/// mode where `shard_count` is equal to zero: this represents a single-sharded
+/// tenant which should be written as a TenantId with no suffix.
+///
+/// The human-readable encoding of TenantShardId, such as used in API URLs,
+/// is both forward and backward compatible: a legacy TenantId can be
+/// decoded as a TenantShardId, and when re-encoded it will be parseable
+/// as a TenantId.
+///
+/// Note that the binary encoding is _not_ backward compatible, because
+/// at the time sharding is introduced, there are no existing binary structures
+/// containing TenantId that we need to handle.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct TenantShardId {
+    pub tenant_id: TenantId,
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
 impl TenantShardId {
    pub fn unsharded(tenant_id: TenantId) -> Self {
        Self {
@@ -165,13 +111,10 @@ impl TenantShardId {
    }

    /// Convenience for code that has special behavior on the 0th shard.
-    pub fn is_shard_zero(&self) -> bool {
+    pub fn is_zero(&self) -> bool {
        self.shard_number == ShardNumber(0)
    }

-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
    }
@@ -207,6 +150,9 @@ impl TenantShardId {
    }
 }

+/// Formatting helper
+struct ShardSlug<'a>(&'a TenantShardId);
+
 impl<'a> std::fmt::Display for ShardSlug<'a> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(
@@ -276,6 +222,16 @@ impl From<[u8; 18]> for TenantShardId {
    }
 }

+/// For use within the context of a particular tenant, when we need to know which
+/// shard we're dealing with, but do not need to know the full ShardIdentity (because
+/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
+/// TenantShardId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
+pub struct ShardIndex {
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
 impl ShardIndex {
    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
        Self {
@@ -290,9 +246,6 @@ impl ShardIndex {
        }
    }

-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }
@@ -360,8 +313,6 @@ impl Serialize for TenantShardId {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
-            // Note: while human encoding of [`TenantShardId`] is backward and forward
-            // compatible, this binary encoding is not.
            let mut packed: [u8; 18] = [0; 18];
            packed[0..16].clone_from_slice(&self.tenant_id.as_arr());
            packed[16] = self.shard_number.0;
@@ -439,6 +390,16 @@ const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);

+/// The ShardIdentity contains enough information to map a key to a shard number,
+/// and then to check whether that shard is the current one (i.e. `self`).
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardIdentity {
+    pub number: ShardNumber,
+    pub count: ShardCount,
+    pub stripe_size: ShardStripeSize,
+    layout: ShardLayout,
+}
+
 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
 pub enum ShardConfigError {
    #[error("Invalid shard count")]
@@ -453,7 +414,7 @@ impl ShardIdentity {
    /// An identity with number=0 count=0 is a "none" identity, which represents legacy
    /// tenants.  Modern single-shard tenants should not use this: they should
    /// have number=0 count=1.
-    pub const fn unsharded() -> Self {
+    pub fn unsharded() -> Self {
        Self {
            number: ShardNumber(0),
            count: ShardCount(0),
@@ -478,9 +439,6 @@ impl ShardIdentity {
        }
    }

-    /// The "unsharded" value is distinct from simply having a single shard: it represents
-    /// a tenant which is not shard-aware at all, and whose storage paths will not include
-    /// a shard suffix.
    pub fn is_unsharded(&self) -> bool {
        self.number == ShardNumber(0) && self.count == ShardCount(0)
    }
@@ -529,8 +487,6 @@ impl ShardIdentity {
    }

    /// Return true if the key should be ingested by this shard
-    ///
-    /// Shards must ingest _at least_ keys which return true from this check.
    pub fn is_key_local(&self, key: &Key) -> bool {
        assert!(!self.is_broken());
        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
@@ -541,9 +497,7 @@ impl ShardIdentity {
    }

    /// Return true if the key should be discarded if found in this shard's
-    /// data store, e.g. during compaction after a split.
-    ///
-    /// Shards _may_ drop keys which return false here, but are not obliged to.
+    /// data store, e.g. during compaction after a split
    pub fn is_key_disposable(&self, key: &Key) -> bool {
        if key_is_shard0(key) {
            // Q: Why can't we dispose of shard0 content if we're not shard 0?
@@ -569,7 +523,7 @@ impl ShardIdentity {

    /// Convenience for checking if this identity is the 0th shard in a tenant,
    /// for special cases on shard 0 such as ingesting relation sizes.
-    pub fn is_shard_zero(&self) -> bool {
+    pub fn is_zero(&self) -> bool {
        self.number == ShardNumber(0)
    }
 }
@@ -652,13 +606,7 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    //
-    // The only exception to this rule is "initfork" data -- this relates to postgres's UNLOGGED table
-    // type. These are special relations, usually with only 0 or 1 blocks, and we store them on shard 0
-    // because they must be included in basebackups.
-    let is_initfork = key.field5 == INIT_FORKNUM;
-
-    !is_rel_block_key(key) || is_initfork
+    !is_rel_block_key(key)
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -118,9 +118,7 @@ pub use v14::bindings::{TimeLineID, TimestampTz, XLogRecPtr, XLogSegNo};
 // Likewise for these, although the assumption that these don't change is a little more iffy.
 pub use v14::bindings::{MultiXactOffset, MultiXactStatus};
 pub use v14::bindings::{PageHeaderData, XLogRecord};
-pub use v14::xlog_utils::{
-    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
-};
+pub use v14::xlog_utils::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};

 pub use v14::bindings::{CheckPoint, ControlFileData};

--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -4,9 +4,7 @@ use log::*;
 use postgres::types::PgLsn;
 use postgres::Client;
 use postgres_ffi::{WAL_SEGMENT_SIZE, XLOG_BLCKSZ};
-use postgres_ffi::{
-    XLOG_SIZE_OF_XLOG_LONG_PHD, XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD,
-};
+use postgres_ffi::{XLOG_SIZE_OF_XLOG_RECORD, XLOG_SIZE_OF_XLOG_SHORT_PHD};
 use std::path::{Path, PathBuf};
 use std::process::Command;
 use std::time::{Duration, Instant};
@@ -264,21 +262,11 @@ fn craft_internal<C: postgres::GenericClient>(
        intermediate_lsns.insert(0, initial_lsn);
    }

-    // Some records may be not flushed, e.g. non-transactional logical messages. Flush now.
+    // Some records may not be flushed, e.g. non-transactional logical messages.
    //
-    // If the previous WAL record ended exactly at page boundary, pg_current_wal_insert_lsn
-    // returns the position just after the page header on the next page. That's where the next
-    // record will be inserted. But the page header hasn't actually been written to the WAL
-    // yet, and if you try to flush it, you get a "request to flush past end of generated WAL"
-    // error. Because of that, if the insert location is just after a page header, back off to
-    // previous page boundary.
-    let mut lsn = u64::from(client.pg_current_wal_insert_lsn()?);
-    if lsn % WAL_SEGMENT_SIZE as u64 == XLOG_SIZE_OF_XLOG_LONG_PHD as u64 {
-        lsn -= XLOG_SIZE_OF_XLOG_LONG_PHD as u64;
-    } else if lsn % XLOG_BLCKSZ as u64 == XLOG_SIZE_OF_XLOG_SHORT_PHD as u64 {
-        lsn -= XLOG_SIZE_OF_XLOG_SHORT_PHD as u64;
-    }
-    client.execute("select neon_xlogflush($1)", &[&PgLsn::from(lsn)])?;
+    // Note: this is broken if pg_current_wal_insert_lsn is at page boundary
+    // because pg_current_wal_insert_lsn skips page headers.
+    client.execute("select neon_xlogflush(pg_current_wal_insert_lsn())", &[])?;
    Ok(intermediate_lsns)
 }

@@ -332,49 +320,38 @@ impl Crafter for LastWalRecordXlogSwitchEndsOnPageBoundary {

        client.execute("CREATE table t(x int)", &[])?;

-        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.  We
-        // will use carefully-sized logical messages to advance WAL insert location such
-        // that there is just enough space on the page for the XLOG_SWITCH record.
-        loop {
-            // We start with measuring how much WAL it takes for one logical message,
-            // considering all alignments and headers.
+        // Add padding so the XLOG_SWITCH record ends exactly on XLOG_BLCKSZ boundary.
+        // We will use logical message as the padding. We start with detecting how much WAL
+        // it takes for one logical message, considering all alignments and headers.
+        let base_wal_advance = {
            let before_lsn = client.pg_current_wal_insert_lsn()?;
+            // A small non-empty message bigger than a few bytes is more likely than an empty
+            // message to have the same format as the big padding message.
            client.execute(
                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', 10))",
                &[],
            )?;
-            let after_lsn = client.pg_current_wal_insert_lsn()?;
-
-            // Did the record cross a page boundary? If it did, start over. Crossing a
-            // page boundary adds to the apparent size of the record because of the page
-            // header, which throws off the calculation.
-            if u64::from(before_lsn) / XLOG_BLCKSZ as u64
-                != u64::from(after_lsn) / XLOG_BLCKSZ as u64
-            {
-                continue;
-            }
-            // base_size is the size of a logical message without the payload
-            let base_size = u64::from(after_lsn) - u64::from(before_lsn) - 10;
-
-            // Is there enough space on the page for another logical message and an
-            // XLOG_SWITCH? If not, start over.
-            let page_remain = XLOG_BLCKSZ as u64 - u64::from(after_lsn) % XLOG_BLCKSZ as u64;
-            if page_remain < base_size - XLOG_SIZE_OF_XLOG_RECORD as u64 {
-                continue;
-            }
-
-            // We will write another logical message, such that after the logical message
-            // record, there will be space for exactly one XLOG_SWITCH. How large should
-            // the logical message's payload be? An XLOG_SWITCH record has no data => its
-            // size is exactly XLOG_SIZE_OF_XLOG_RECORD.
-            let repeats = page_remain - base_size - XLOG_SIZE_OF_XLOG_RECORD as u64;
-
-            client.execute(
-                "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
-                &[&(repeats as i32)],
-            )?;
-            break;
+            // The XLOG_SWITCH record has no data => its size is exactly XLOG_SIZE_OF_XLOG_RECORD.
+            (u64::from(client.pg_current_wal_insert_lsn()?) - u64::from(before_lsn)) as usize
+                + XLOG_SIZE_OF_XLOG_RECORD
+        };
+        let mut remaining_lsn =
+            XLOG_BLCKSZ - u64::from(client.pg_current_wal_insert_lsn()?) as usize % XLOG_BLCKSZ;
+        if remaining_lsn < base_wal_advance {
+            remaining_lsn += XLOG_BLCKSZ;
        }
+        let repeats = 10 + remaining_lsn - base_wal_advance;
+        info!(
+            "current_wal_insert_lsn={}, remaining_lsn={}, base_wal_advance={}, repeats={}",
+            client.pg_current_wal_insert_lsn()?,
+            remaining_lsn,
+            base_wal_advance,
+            repeats
+        );
+        client.execute(
+            "SELECT pg_logical_emit_message(false, 'swch', REPEAT('a', $1))",
+            &[&(repeats as i32)],
+        )?;
        info!(
            "current_wal_insert_lsn={}, XLOG_SIZE_OF_XLOG_RECORD={}",
            client.pg_current_wal_insert_lsn()?,
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -38,7 +38,6 @@ azure_storage_blobs.workspace = true
 futures-util.workspace = true
 http-types.workspace = true
 itertools.workspace = true
-sync_wrapper = { workspace = true, features = ["futures"] }

 [dev-dependencies]
 camino-tempfile.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -3,7 +3,6 @@
 use std::borrow::Cow;
 use std::collections::HashMap;
 use std::env;
-use std::io;
 use std::num::NonZeroU32;
 use std::pin::Pin;
 use std::str::FromStr;
@@ -21,7 +20,6 @@ use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::prelude::ClientBuilder;
 use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
 use bytes::Bytes;
-use futures::future::Either;
 use futures::stream::Stream;
 use futures_util::StreamExt;
 use futures_util::TryStreamExt;
@@ -130,12 +128,12 @@ impl AzureBlobStorage {
        let kind = RequestKind::Get;

        let _permit = self.permit(kind, cancel).await?;
-        let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
-        let cancel_or_timeout_ = crate::support::cancel_or_timeout(self.timeout, cancel.clone());

        let mut etag = None;
        let mut last_modified = None;
        let mut metadata = HashMap::new();
+        // TODO give proper streaming response instead of buffering into RAM
+        // https://github.com/neondatabase/neon/issues/5563

        let download = async {
            let response = builder
@@ -154,46 +152,39 @@ impl AzureBlobStorage {
                Err(_elapsed) => Err(DownloadError::Timeout),
            });

-            let mut response = Box::pin(response);
+            let mut response = std::pin::pin!(response);

-            let Some(part) = response.next().await else {
+            let mut bufs = Vec::new();
+            while let Some(part) = response.next().await {
+                let part = part?;
+                if etag.is_none() {
+                    etag = Some(part.blob.properties.etag);
+                }
+                if last_modified.is_none() {
+                    last_modified = Some(part.blob.properties.last_modified.into());
+                }
+                if let Some(blob_meta) = part.blob.metadata {
+                    metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
+                }
+                let data = part
+                    .data
+                    .collect()
+                    .await
+                    .map_err(|e| DownloadError::Other(e.into()))?;
+                bufs.push(data);
+            }
+
+            if bufs.is_empty() {
                return Err(DownloadError::Other(anyhow::anyhow!(
-                    "Azure GET response contained no response body"
+                    "Azure GET response contained no buffers"
                )));
-            };
-            let part = part?;
-            if etag.is_none() {
-                etag = Some(part.blob.properties.etag);
            }
-            if last_modified.is_none() {
-                last_modified = Some(part.blob.properties.last_modified.into());
-            }
-            if let Some(blob_meta) = part.blob.metadata {
-                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
-            }
-
            // unwrap safety: if these were None, bufs would be empty and we would have returned an error already
            let etag = etag.unwrap();
            let last_modified = last_modified.unwrap();

-            let tail_stream = response
-                .map(|part| match part {
-                    Ok(part) => Either::Left(part.data.map(|r| r.map_err(io::Error::other))),
-                    Err(e) => {
-                        Either::Right(futures::stream::once(async { Err(io::Error::other(e)) }))
-                    }
-                })
-                .flatten();
-            let stream = part
-                .data
-                .map(|r| r.map_err(io::Error::other))
-                .chain(sync_wrapper::SyncStream::new(tail_stream));
-            //.chain(SyncStream::from_pin(Box::pin(tail_stream)));
-
-            let download_stream = crate::support::DownloadStream::new(cancel_or_timeout_, stream);
-
            Ok(Download {
-                download_stream: Box::pin(download_stream),
+                download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
                etag,
                last_modified,
                metadata: Some(StorageMetadata(metadata)),
@@ -202,10 +193,7 @@ impl AzureBlobStorage {

        tokio::select! {
            bufs = download => bufs,
-            cancel_or_timeout = cancel_or_timeout => match cancel_or_timeout {
-                TimeoutOrCancel::Timeout => Err(DownloadError::Timeout),
-                TimeoutOrCancel::Cancel => Err(DownloadError::Cancelled),
-            },
+            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
        }
    }

--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -21,13 +21,11 @@ use std::{
    fmt::Debug,
    num::{NonZeroU32, NonZeroUsize},
    pin::Pin,
-    str::FromStr,
    sync::Arc,
    time::{Duration, SystemTime},
 };

 use anyhow::{bail, Context};
-use aws_sdk_s3::types::StorageClass;
 use camino::{Utf8Path, Utf8PathBuf};

 use bytes::Bytes;
@@ -55,11 +53,11 @@ pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
 /// ~3500 PUT/COPY/POST/DELETE or 5500 GET/HEAD S3 requests
 /// <https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/>
 pub const DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT: usize = 100;
-/// Set this limit analogously to the S3 limit
+/// We set this a little bit low as we currently buffer the entire file into RAM
 ///
 /// Here, a limit of max 20k concurrent connections was noted.
 /// <https://learn.microsoft.com/en-us/answers/questions/1301863/is-there-any-limitation-to-concurrent-connections>
-pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 100;
+pub const DEFAULT_REMOTE_STORAGE_AZURE_CONCURRENCY_LIMIT: usize = 30;
 /// No limits on the client side, which currenltly means 1000 for AWS S3.
 /// <https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax>
 pub const DEFAULT_MAX_KEYS_PER_LIST_RESPONSE: Option<i32> = None;
@@ -136,11 +134,6 @@ impl RemotePath {
    pub fn strip_prefix(&self, p: &RemotePath) -> Result<&Utf8Path, std::path::StripPrefixError> {
        self.0.strip_prefix(&p.0)
    }
-
-    pub fn add_trailing_slash(&self) -> Self {
-        // Unwrap safety inputs are guararnteed to be valid UTF-8
-        Self(format!("{}/", self.0).try_into().unwrap())
-    }
 }

 /// We don't need callers to be able to pass arbitrary delimiters: just control
@@ -164,21 +157,47 @@ pub struct Listing {
 /// providing basic CRUD operations for storage files.
 #[allow(async_fn_in_trait)]
 pub trait RemoteStorage: Send + Sync + 'static {
-    /// List objects in remote storage, with semantics matching AWS S3's ListObjectsV2.
-    /// (see `<https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html>`)
-    ///
-    /// Note that the prefix is relative to any `prefix_in_bucket` configured for the client, not
-    /// from the absolute root of the bucket.
-    ///
-    /// `mode` configures whether to use a delimiter.  Without a delimiter all keys
-    /// within the prefix are listed in the `keys` of the result.  With a delimiter, any "directories" at the top level of
-    /// the prefix are returned in the `prefixes` of the result, and keys in the top level of the prefix are
-    /// returned in `keys` ().
-    ///
-    /// `max_keys` controls the maximum number of keys that will be returned.  If this is None, this function
-    /// will iteratively call listobjects until it runs out of keys.  Note that this is not safe to use on
-    /// unlimted size buckets, as the full list of objects is allocated into a monolithic data structure.
+    /// Lists all top level subdirectories for a given prefix
+    /// Note: here we assume that if the prefix is passed it was obtained via remote_object_id
+    /// which already takes into account any kind of global prefix (prefix_in_bucket for S3 or storage_root for LocalFS)
+    /// so this method doesn't need to.
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let result = self
+            .list(prefix, ListingMode::WithDelimiter, None, cancel)
+            .await?
+            .prefixes;
+        Ok(result)
+    }
+    /// Lists all files in directory "recursively"
+    /// (not really recursively, because AWS has a flat namespace)
+    /// Note: This is subtly different from list_prefixes,
+    /// because it is for listing files instead of listing
+    /// names sharing common prefixes.
+    /// For example,
+    /// list_files("foo/bar") = ["foo/bar/cat123.txt",
+    /// "foo/bar/cat567.txt", "foo/bar/dog123.txt", "foo/bar/dog456.txt"]
+    /// whereas,
+    /// list_prefixes("foo/bar/") = ["cat", "dog"]
+    /// See `test_real_s3.rs` for more details.
    ///
+    /// max_keys limits max number of keys returned; None means unlimited.
+    async fn list_files(
+        &self,
+        prefix: Option<&RemotePath>,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        let result = self
+            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
+            .await?
+            .keys;
+        Ok(result)
+    }
+
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
@@ -317,6 +336,41 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        }
    }

+    // A function for listing all the files in a "directory"
+    // Example:
+    // list_files("foo/bar") = ["foo/bar/a.txt", "foo/bar/b.txt"]
+    //
+    // max_keys limits max number of keys returned; None means unlimited.
+    pub async fn list_files(
+        &self,
+        folder: Option<&RemotePath>,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
+        }
+    }
+
+    // Lists the common *prefixes* of files, if any.
+    // Example:
+    // list_prefixes("foo123","foo567","bar123","bar432") = ["foo", "bar"]
+    pub async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        match self {
+            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
+            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
+            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
+            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
+        }
+    }
+
    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
@@ -511,16 +565,6 @@ impl GenericRemoteStorage {
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct StorageMetadata(HashMap<String, String>);

-impl<const N: usize> From<[(&str, &str); N]> for StorageMetadata {
-    fn from(arr: [(&str, &str); N]) -> Self {
-        let map: HashMap<String, String> = arr
-            .iter()
-            .map(|(k, v)| (k.to_string(), v.to_string()))
-            .collect();
-        Self(map)
-    }
-}
-
 /// External backup storage configuration, enough for creating a client for that storage.
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct RemoteStorageConfig {
@@ -565,7 +609,6 @@ pub struct S3Config {
    /// See [`DEFAULT_REMOTE_STORAGE_S3_CONCURRENCY_LIMIT`] for more details.
    pub concurrency_limit: NonZeroUsize,
    pub max_keys_per_list_response: Option<i32>,
-    pub upload_storage_class: Option<StorageClass>,
 }

 impl Debug for S3Config {
@@ -694,18 +737,6 @@ impl RemoteStorageConfig {
                    endpoint,
                    concurrency_limit,
                    max_keys_per_list_response,
-                    upload_storage_class: toml
-                        .get("upload_storage_class")
-                        .map(|prefix_in_bucket| -> anyhow::Result<_> {
-                            let s = parse_toml_string("upload_storage_class", prefix_in_bucket)?;
-                            let storage_class = StorageClass::from_str(&s).expect("infallible");
-                            #[allow(deprecated)]
-                            if matches!(storage_class, StorageClass::Unknown(_)) {
-                                bail!("Specified storage class unknown to SDK: '{s}'. Allowed values: {:?}", StorageClass::values());
-                            }
-                            Ok(storage_class)
-                        })
-                        .transpose()?,
                })
            }
            (_, _, _, Some(_), None) => {
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,9 +5,11 @@
 //! volume is mounted to the local FS.

 use std::{
-    collections::HashSet,
+    borrow::Cow,
+    future::Future,
    io::ErrorKind,
    num::NonZeroU32,
+    pin::Pin,
    time::{Duration, SystemTime, UNIX_EPOCH},
 };

@@ -20,11 +22,11 @@ use tokio::{
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tokio_util::{io::ReaderStream, sync::CancellationToken};
-use utils::crashsafe::path_with_suffix_extension;
+use tracing::*;
+use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

 use crate::{
    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 use super::{RemoteStorage, StorageMetadata};
@@ -91,47 +93,7 @@ impl LocalFs {

    #[cfg(test)]
    async fn list_all(&self) -> anyhow::Result<Vec<RemotePath>> {
-        use std::{future::Future, pin::Pin};
-        fn get_all_files<'a, P>(
-            directory_path: P,
-        ) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
-        where
-            P: AsRef<Utf8Path> + Send + Sync + 'a,
-        {
-            Box::pin(async move {
-                let directory_path = directory_path.as_ref();
-                if directory_path.exists() {
-                    if directory_path.is_dir() {
-                        let mut paths = Vec::new();
-                        let mut dir_contents = fs::read_dir(directory_path).await?;
-                        while let Some(dir_entry) = dir_contents.next_entry().await? {
-                            let file_type = dir_entry.file_type().await?;
-                            let entry_path =
-                                Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
-                                    anyhow::Error::msg(format!(
-                                        "non-Unicode path: {}",
-                                        pb.to_string_lossy()
-                                    ))
-                                })?;
-                            if file_type.is_symlink() {
-                                tracing::debug!("{entry_path:?} is a symlink, skipping")
-                            } else if file_type.is_dir() {
-                                paths.extend(get_all_files(&entry_path).await?.into_iter())
-                            } else {
-                                paths.push(entry_path);
-                            }
-                        }
-                        Ok(paths)
-                    } else {
-                        bail!("Path {directory_path:?} is not a directory")
-                    }
-                } else {
-                    Ok(Vec::new())
-                }
-            })
-        }
-
-        Ok(get_all_files(&self.storage_root)
+        Ok(get_all_files(&self.storage_root, true)
            .await?
            .into_iter()
            .map(|path| {
@@ -158,14 +120,6 @@ impl LocalFs {
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
-
-        // If there's no trailing slash, we have to start looking from one above: even if
-        // `initial_dir` is a directory, we should still list any prefixes in the parent
-        // that start with the same string.
-        if !full_path.to_string().ends_with('/') {
-            initial_dir.pop();
-        }
-
        loop {
            // Did we make it to the root?
            if initial_dir.parent().is_none() {
@@ -341,66 +295,61 @@ impl RemoteStorage for LocalFs {
        let op = async {
            let mut result = Listing::default();

-            // Filter out directories: in S3 directories don't exist, only the keys within them do.
-            let keys = self
-                .list_recursive(prefix)
+            if let ListingMode::NoDelimiter = mode {
+                let keys = self
+                    .list_recursive(prefix)
+                    .await
+                    .map_err(DownloadError::Other)?;
+
+                result.keys = keys
+                    .into_iter()
+                    .filter(|k| {
+                        let path = k.with_base(&self.storage_root);
+                        !path.is_dir()
+                    })
+                    .collect();
+
+                if let Some(max_keys) = max_keys {
+                    result.keys.truncate(max_keys.get() as usize);
+                }
+
+                return Ok(result);
+            }
+
+            let path = match prefix {
+                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
+                None => Cow::Borrowed(&self.storage_root),
+            };
+
+            let prefixes_to_filter = get_all_files(path.as_ref(), false)
                .await
                .map_err(DownloadError::Other)?;
-            let keys = keys
-                .into_iter()
-                .filter(|k| {
-                    let path = k.with_base(&self.storage_root);
-                    !path.is_dir()
-                })
-                .collect();

-            if let ListingMode::NoDelimiter = mode {
-                result.keys = keys;
-            } else {
-                let mut prefixes = HashSet::new();
-                for key in keys {
-                    // If the part after the prefix includes a "/", take only the first part and put it in `prefixes`.
-                    let relative_key = if let Some(prefix) = prefix {
-                        let mut prefix = prefix.clone();
-                        // We only strip the dirname of the prefix, so that when we strip it from the start of keys we
-                        // end up with full file/dir names.
-                        let prefix_full_local_path = prefix.with_base(&self.storage_root);
-                        let has_slash = prefix.0.to_string().ends_with('/');
-                        let strip_prefix = if prefix_full_local_path.is_dir() && has_slash {
-                            prefix
-                        } else {
-                            prefix.0.pop();
-                            prefix
-                        };
-
-                        RemotePath::new(key.strip_prefix(&strip_prefix).unwrap()).unwrap()
-                    } else {
-                        key
-                    };
-
-                    let relative_key = format!("{}", relative_key);
-                    if relative_key.contains(REMOTE_STORAGE_PREFIX_SEPARATOR) {
-                        let first_part = relative_key
-                            .split(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                            .next()
-                            .unwrap()
-                            .to_owned();
-                        prefixes.insert(first_part);
-                    } else {
-                        result
-                            .keys
-                            .push(RemotePath::from_string(&relative_key).unwrap());
-                    }
+            // Filter out empty directories to mirror S3 behavior.
+            for prefix in prefixes_to_filter {
+                if prefix.is_dir()
+                    && is_directory_empty(&prefix)
+                        .await
+                        .map_err(DownloadError::Other)?
+                {
+                    continue;
+                }
+
+                let stripped = prefix
+                    .strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
+                    .and_then(RemotePath::new)
+                    .expect(
+                        "We list files for storage root, hence should be able to remote the prefix",
+                    );
+
+                if prefix.is_dir() {
+                    result.prefixes.push(stripped);
+                } else {
+                    result.keys.push(stripped);
                }
-                result.prefixes = prefixes
-                    .into_iter()
-                    .map(|s| RemotePath::from_string(&s).unwrap())
-                    .collect();
            }

-            if let Some(max_keys) = max_keys {
-                result.keys.truncate(max_keys.get() as usize);
-            }
            Ok(result)
        };

@@ -611,6 +560,50 @@ fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
    path_with_suffix_extension(original_path, "metadata")
 }

+fn get_all_files<'a, P>(
+    directory_path: P,
+    recursive: bool,
+) -> Pin<Box<dyn Future<Output = anyhow::Result<Vec<Utf8PathBuf>>> + Send + Sync + 'a>>
+where
+    P: AsRef<Utf8Path> + Send + Sync + 'a,
+{
+    Box::pin(async move {
+        let directory_path = directory_path.as_ref();
+        if directory_path.exists() {
+            if directory_path.is_dir() {
+                let mut paths = Vec::new();
+                let mut dir_contents = fs::read_dir(directory_path).await?;
+                while let Some(dir_entry) = dir_contents.next_entry().await? {
+                    let file_type = dir_entry.file_type().await?;
+                    let entry_path =
+                        Utf8PathBuf::from_path_buf(dir_entry.path()).map_err(|pb| {
+                            anyhow::Error::msg(format!(
+                                "non-Unicode path: {}",
+                                pb.to_string_lossy()
+                            ))
+                        })?;
+                    if file_type.is_symlink() {
+                        debug!("{entry_path:?} is a symlink, skipping")
+                    } else if file_type.is_dir() {
+                        if recursive {
+                            paths.extend(get_all_files(&entry_path, true).await?.into_iter())
+                        } else {
+                            paths.push(entry_path)
+                        }
+                    } else {
+                        paths.push(entry_path);
+                    }
+                }
+                Ok(paths)
+            } else {
+                bail!("Path {directory_path:?} is not a directory")
+            }
+        } else {
+            Ok(Vec::new())
+        }
+    })
+}
+
 async fn create_target_directory(target_file_path: &Utf8Path) -> anyhow::Result<()> {
    let target_dir = match target_file_path.parent() {
        Some(parent_dir) => parent_dir,
@@ -930,18 +923,13 @@ mod fs_tests {
        // No delimiter: should recursively list everything
        let (storage, cancel) = create_storage()?;
        let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
-        let child_sibling =
-            upload_dummy_file(&storage, "grandparent/parent/child_sibling", None, &cancel).await?;
        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;

        let listing = storage
            .list(None, ListingMode::NoDelimiter, None, &cancel)
            .await?;
        assert!(listing.prefixes.is_empty());
-        assert_eq!(
-            listing.keys.into_iter().collect::<HashSet<_>>(),
-            HashSet::from([uncle.clone(), child.clone(), child_sibling.clone()])
-        );
+        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());

        // Delimiter: should only go one deep
        let listing = storage
@@ -954,25 +942,7 @@ mod fs_tests {
        );
        assert!(listing.keys.is_empty());

-        // Delimiter & prefix with a trailing slash
-        let listing = storage
-            .list(
-                Some(&RemotePath::from_string("timelines/some_timeline/grandparent/").unwrap()),
-                ListingMode::WithDelimiter,
-                None,
-                &cancel,
-            )
-            .await?;
-        assert_eq!(
-            listing.keys,
-            [RemotePath::from_string("uncle").unwrap()].to_vec()
-        );
-        assert_eq!(
-            listing.prefixes,
-            [RemotePath::from_string("parent").unwrap()].to_vec()
-        );
-
-        // Delimiter and prefix without a trailing slash
+        // Delimiter & prefix
        let listing = storage
            .list(
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
@@ -981,66 +951,12 @@ mod fs_tests {
                &cancel,
            )
            .await?;
-        assert_eq!(listing.keys, [].to_vec());
        assert_eq!(
            listing.prefixes,
-            [RemotePath::from_string("grandparent").unwrap()].to_vec()
-        );
-
-        // Delimiter and prefix that's partway through a path component
-        let listing = storage
-            .list(
-                Some(&RemotePath::from_string("timelines/some_timeline/grandp").unwrap()),
-                ListingMode::WithDelimiter,
-                None,
-                &cancel,
-            )
-            .await?;
-        assert_eq!(listing.keys, [].to_vec());
-        assert_eq!(
-            listing.prefixes,
-            [RemotePath::from_string("grandparent").unwrap()].to_vec()
-        );
-
-        Ok(())
-    }
-
-    #[tokio::test]
-    async fn list_part_component() -> anyhow::Result<()> {
-        // No delimiter: should recursively list everything
-        let (storage, cancel) = create_storage()?;
-
-        // Imitates what happens in a tenant path when we have an unsharded path and a sharded path, and do a listing
-        // of the unsharded path: although there is a "directory" at the unsharded path, it should be handled as
-        // a freeform prefix.
-        let _child_a =
-            upload_dummy_file(&storage, "grandparent/tenant-01/child", None, &cancel).await?;
-        let _child_b =
-            upload_dummy_file(&storage, "grandparent/tenant/child", None, &cancel).await?;
-
-        // Delimiter and prefix that's partway through a path component
-        let listing = storage
-            .list(
-                Some(
-                    &RemotePath::from_string("timelines/some_timeline/grandparent/tenant").unwrap(),
-                ),
-                ListingMode::WithDelimiter,
-                None,
-                &cancel,
-            )
-            .await?;
-        assert_eq!(listing.keys, [].to_vec());
-
-        let mut found_prefixes = listing.prefixes.clone();
-        found_prefixes.sort();
-        assert_eq!(
-            found_prefixes,
-            [
-                RemotePath::from_string("tenant").unwrap(),
-                RemotePath::from_string("tenant-01").unwrap(),
-            ]
-            .to_vec()
+            [RemotePath::from_string("timelines/some_timeline/grandparent/parent").unwrap()]
+                .to_vec()
        );
+        assert_eq!(listing.keys, [uncle.clone()].to_vec());

        Ok(())
    }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -30,7 +30,7 @@ use aws_sdk_s3::{
    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion, StorageClass},
+    types::{Delete, DeleteMarkerEntry, ObjectIdentifier, ObjectVersion},
    Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
@@ -62,7 +62,6 @@ pub struct S3Bucket {
    bucket_name: String,
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
-    upload_storage_class: Option<StorageClass>,
    concurrency_limiter: ConcurrencyLimiter,
    // Per-request timeout. Accessible for tests.
    pub timeout: Duration,
@@ -155,7 +154,6 @@ impl S3Bucket {
            max_keys_per_list_response: aws_config.max_keys_per_list_response,
            prefix_in_bucket,
            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
-            upload_storage_class: aws_config.upload_storage_class.clone(),
            timeout,
        })
    }
@@ -180,7 +178,10 @@ impl S3Bucket {

    pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String {
        assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR);
-        let path_string = path.get_path().as_str();
+        let path_string = path
+            .get_path()
+            .as_str()
+            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR);
        match &self.prefix_in_bucket {
            Some(prefix) => prefix.clone() + "/" + path_string,
            None => path_string.to_string(),
@@ -470,11 +471,16 @@ impl RemoteStorage for S3Bucket {
        // get the passed prefix or if it is not set use prefix_in_bucket value
        let list_prefix = prefix
            .map(|p| self.relative_path_to_s3_object(p))
-            .or_else(|| {
-                self.prefix_in_bucket.clone().map(|mut s| {
-                    s.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                    s
-                })
+            .or_else(|| self.prefix_in_bucket.clone())
+            .map(|mut p| {
+                // The prefix is required to end with a separator;
+                // otherwise the request will return only the entry of the prefix itself.
+                if matches!(mode, ListingMode::WithDelimiter)
+                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                {
+                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                }
+                p
            });

        let _permit = self.permit(kind, cancel).await?;
@@ -543,15 +549,11 @@ impl RemoteStorage for S3Bucket {
                }
            }

-            // S3 gives us prefixes like "foo/", we return them like "foo"
-            result.prefixes.extend(prefixes.iter().filter_map(|o| {
-                Some(
-                    self.s3_object_to_relative_path(
-                        o.prefix()?
-                            .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR),
-                    ),
-                )
-            }));
+            result.prefixes.extend(
+                prefixes
+                    .iter()
+                    .filter_map(|o| Some(self.s3_object_to_relative_path(o.prefix()?))),
+            );

            continuation_token = match response.next_continuation_token {
                Some(new_token) => Some(new_token),
@@ -584,7 +586,6 @@ impl RemoteStorage for S3Bucket {
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
            .set_metadata(metadata.map(|m| m.0))
-            .set_storage_class(self.upload_storage_class.clone())
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
            .send();
@@ -636,7 +637,6 @@ impl RemoteStorage for S3Bucket {
            .copy_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
-            .set_storage_class(self.upload_storage_class.clone())
            .copy_source(copy_source)
            .send();

@@ -894,7 +894,6 @@ impl RemoteStorage for S3Bucket {
                                    .copy_object()
                                    .bucket(self.bucket_name.clone())
                                    .key(key)
-                                    .set_storage_class(self.upload_storage_class.clone())
                                    .copy_source(&source_id)
                                    .send();

@@ -1051,22 +1050,22 @@ mod tests {
            Some("/test/prefix/"),
        ];
        let expected_outputs = [
-            vec!["", "some/path", "some/path/"],
-            vec!["/", "/some/path", "/some/path/"],
+            vec!["", "some/path", "some/path"],
+            vec!["/", "/some/path", "/some/path"],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path/",
+                "test/prefix/some/path",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path/",
+                "test/prefix/some/path",
            ],
            vec![
                "test/prefix/",
                "test/prefix/some/path",
-                "test/prefix/some/path/",
+                "test/prefix/some/path",
            ],
        ];

@@ -1078,7 +1077,6 @@ mod tests {
                endpoint: None,
                concurrency_limit: NonZeroUsize::new(100).unwrap(),
                max_keys_per_list_response: Some(5),
-                upload_storage_class: None,
            };
            let storage =
                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -107,6 +107,27 @@ impl UnreliableWrapper {
 type VoidStorage = crate::LocalFs;

 impl RemoteStorage for UnreliableWrapper {
+    async fn list_prefixes(
+        &self,
+        prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
+            .map_err(DownloadError::Other)?;
+        self.inner.list_prefixes(prefix, cancel).await
+    }
+
+    async fn list_files(
+        &self,
+        folder: Option<&RemotePath>,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<Vec<RemotePath>, DownloadError> {
+        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
+            .map_err(DownloadError::Other)?;
+        self.inner.list_files(folder, max_keys, cancel).await
+    }
+
    async fn list(
        &self,
        prefix: Option<&RemotePath>,
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -1,6 +1,5 @@
 use anyhow::Context;
 use camino::Utf8Path;
-use remote_storage::ListingMode;
 use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
@@ -55,9 +54,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
-        .list(None, ListingMode::WithDelimiter, None, &cancel)
-        .await?
-        .prefixes
+        .list_prefixes(None, &cancel)
+        .await
+        .context("client list root prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
@@ -66,14 +65,9 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    );

    let nested_remote_prefixes = test_client
-        .list(
-            Some(&base_prefix.add_trailing_slash()),
-            ListingMode::WithDelimiter,
-            None,
-            &cancel,
-        )
-        .await?
-        .prefixes
+        .list_prefixes(Some(&base_prefix), &cancel)
+        .await
+        .context("client list nested prefixes failure")?
        .into_iter()
        .collect::<HashSet<_>>();
    let remote_only_prefixes = nested_remote_prefixes
@@ -96,13 +90,11 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
 ///
 /// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
 /// Then performs the following queries:
-///    1. `list(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
 #[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
 #[tokio::test]
-async fn list_no_delimiter_works(
-    ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs,
-) -> anyhow::Result<()> {
+async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
    let ctx = match ctx {
        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
@@ -115,36 +107,29 @@ async fn list_no_delimiter_works(
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
-        .list(None, ListingMode::NoDelimiter, None, &cancel)
+        .list_files(None, None, &cancel)
        .await
        .context("client list root files failure")?
-        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    assert_eq!(
        root_files,
        ctx.remote_blobs.clone(),
-        "remote storage list on root mismatches with the uploads."
+        "remote storage list_files on root mismatches with the uploads."
    );

    // Test that max_keys limit works. In total there are about 21 files (see
    // upload_simple_remote_data call in test_real_s3.rs).
    let limited_root_files = test_client
-        .list(
-            None,
-            ListingMode::NoDelimiter,
-            Some(NonZeroU32::new(2).unwrap()),
-            &cancel,
-        )
+        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
        .await
        .context("client list root files failure")?;
-    assert_eq!(limited_root_files.keys.len(), 2);
+    assert_eq!(limited_root_files.len(), 2);

    let nested_remote_files = test_client
-        .list(Some(&base_prefix), ListingMode::NoDelimiter, None, &cancel)
+        .list_files(Some(&base_prefix), None, &cancel)
        .await
        .context("client list nested files failure")?
-        .keys
        .into_iter()
        .collect::<HashSet<_>>();
    let trim_remote_blobs: HashSet<_> = ctx
@@ -156,7 +141,7 @@ async fn list_no_delimiter_works(
        .collect();
    assert_eq!(
        nested_remote_files, trim_remote_blobs,
-        "remote storage list on subdirrectory mismatches with the uploads."
+        "remote storage list_files on subdirrectory mismatches with the uploads."
    );
    Ok(())
 }
@@ -214,11 +199,7 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(

    ctx.client.delete_objects(&[path1, path2], &cancel).await?;

-    let prefixes = ctx
-        .client
-        .list(None, ListingMode::WithDelimiter, None, &cancel)
-        .await?
-        .prefixes;
+    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;

    assert_eq!(prefixes.len(), 1);

--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -57,6 +57,7 @@ enum MaybeEnabledStorage {
    Disabled,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -85,6 +86,7 @@ struct AzureWithTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -132,6 +134,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }

+// NOTE: the setups for the list_prefixes test and the list_files test are very similar
+// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
+// whereas the list_files function is concerned with listing files.
+// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(AzureWithSimpleTestBlobs),
    Disabled,
@@ -142,6 +148,7 @@ struct AzureWithSimpleTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -12,8 +12,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use futures_util::StreamExt;
 use remote_storage::{
-    DownloadError, GenericRemoteStorage, ListingMode, RemotePath, RemoteStorageConfig,
-    RemoteStorageKind, S3Config,
+    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+    S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -75,14 +75,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
        client: &Arc<GenericRemoteStorage>,
        cancel: &CancellationToken,
    ) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(
-            retry(|| client.list(None, ListingMode::NoDelimiter, None, cancel))
-                .await
-                .context("list root files failure")?
-                .keys
-                .into_iter()
-                .collect::<HashSet<_>>(),
-        )
+        Ok(retry(|| client.list_files(None, None, cancel))
+            .await
+            .context("list root files failure")?
+            .into_iter()
+            .collect::<HashSet<_>>())
    }

    let cancel = CancellationToken::new();
@@ -222,6 +219,7 @@ enum MaybeEnabledStorage {
    Disabled,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -250,6 +248,7 @@ struct S3WithTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -297,6 +296,10 @@ impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    }
 }

+// NOTE: the setups for the list_prefixes test and the list_files test are very similar
+// However, they are not identical. The list_prefixes function is concerned with listing prefixes,
+// whereas the list_files function is concerned with listing files.
+// See `RemoteStorage::list_files` documentation for more details
 enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
@@ -307,6 +310,7 @@ struct S3WithSimpleTestBlobs {
    remote_blobs: HashSet<RemotePath>,
 }

+#[async_trait::async_trait]
 impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
@@ -380,7 +384,6 @@ fn create_s3_client(
            endpoint: None,
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
-            upload_storage_class: None,
        }),
        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -22,7 +22,6 @@ camino.workspace = true
 chrono.workspace = true
 heapless.workspace = true
 hex = { workspace = true, features = ["serde"] }
-humantime.workspace = true
 hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
--- a/libs/utils/src/env.rs
+++ b/libs/utils/src/env.rs
@@ -1,21 +0,0 @@
-//! Wrapper around `std::env::var` for parsing environment variables.
-
-use std::{fmt::Display, str::FromStr};
-
-pub fn var<V, E>(varname: &str) -> Option<V>
-where
-    V: FromStr<Err = E>,
-    E: Display,
-{
-    match std::env::var(varname) {
-        Ok(s) => Some(
-            s.parse()
-                .map_err(|e| format!("failed to parse env var {varname}: {e:#}"))
-                .unwrap(),
-        ),
-        Err(std::env::VarError::NotPresent) => None,
-        Err(std::env::VarError::NotUnicode(_)) => {
-            panic!("env var {varname} is not unicode")
-        }
-    }
-}
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -34,8 +34,6 @@ pub enum Generation {
 /// scenarios where pageservers might otherwise issue conflicting writes to
 /// remote storage
 impl Generation {
-    pub const MAX: Self = Self::Valid(u32::MAX);
-
    /// Create a new Generation that represents a legacy key format with
    /// no generation suffix
    pub fn none() -> Self {
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -63,7 +63,6 @@ pub mod measured_stream;

 pub mod serde_percent;
 pub mod serde_regex;
-pub mod serde_system_time;

 pub mod pageserver_feedback;

@@ -90,10 +89,6 @@ pub mod yielding_loop;

 pub mod zstd;

-pub mod env;
-
-pub mod poison;
-
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/poison.rs
+++ b/libs/utils/src/poison.rs
@@ -1,121 +0,0 @@
-//!  Protect a piece of state from reuse after it is left in an inconsistent state.
-//!
-//!  # Example
-//!
-//!  ```
-//!  # tokio_test::block_on(async {
-//!  use utils::poison::Poison;
-//!  use std::time::Duration;
-//!
-//!  struct State {
-//!    clean: bool,
-//!  }
-//!  let state = tokio::sync::Mutex::new(Poison::new("mystate", State { clean: true }));
-//!
-//!  let mut mutex_guard = state.lock().await;
-//!  let mut poison_guard = mutex_guard.check_and_arm()?;
-//!  let state = poison_guard.data_mut();
-//!  state.clean = false;
-//!  // If we get cancelled at this await point, subsequent check_and_arm() calls will fail.
-//!  tokio::time::sleep(Duration::from_secs(10)).await;
-//!  state.clean = true;
-//!  poison_guard.disarm();
-//!  # Ok::<(), utils::poison::Error>(())
-//!  # });
-//!  ```
-
-use tracing::warn;
-
-pub struct Poison<T> {
-    what: &'static str,
-    state: State,
-    data: T,
-}
-
-#[derive(Clone, Copy)]
-enum State {
-    Clean,
-    Armed,
-    Poisoned { at: chrono::DateTime<chrono::Utc> },
-}
-
-impl<T> Poison<T> {
-    /// We log `what` `warning!` level if the [`Guard`] gets dropped without being [`Guard::disarm`]ed.
-    pub fn new(what: &'static str, data: T) -> Self {
-        Self {
-            what,
-            state: State::Clean,
-            data,
-        }
-    }
-
-    /// Check for poisoning and return a [`Guard`] that provides access to the wrapped state.
-    pub fn check_and_arm(&mut self) -> Result<Guard<T>, Error> {
-        match self.state {
-            State::Clean => {
-                self.state = State::Armed;
-                Ok(Guard(self))
-            }
-            State::Armed => unreachable!("transient state"),
-            State::Poisoned { at } => Err(Error::Poisoned {
-                what: self.what,
-                at,
-            }),
-        }
-    }
-}
-
-/// Use [`Self::data`] and [`Self::data_mut`] to access the wrapped state.
-/// Once modifications are done, use [`Self::disarm`].
-/// If [`Guard`] gets dropped instead of calling [`Self::disarm`], the state is poisoned
-/// and subsequent calls to [`Poison::check_and_arm`] will fail with an error.
-pub struct Guard<'a, T>(&'a mut Poison<T>);
-
-impl<'a, T> Guard<'a, T> {
-    pub fn data(&self) -> &T {
-        &self.0.data
-    }
-    pub fn data_mut(&mut self) -> &mut T {
-        &mut self.0.data
-    }
-
-    pub fn disarm(self) {
-        match self.0.state {
-            State::Clean => unreachable!("we set it to Armed in check_and_arm()"),
-            State::Armed => {
-                self.0.state = State::Clean;
-            }
-            State::Poisoned { at } => {
-                unreachable!("we fail check_and_arm() if it's in that state: {at}")
-            }
-        }
-    }
-}
-
-impl<'a, T> Drop for Guard<'a, T> {
-    fn drop(&mut self) {
-        match self.0.state {
-            State::Clean => {
-                // set by disarm()
-            }
-            State::Armed => {
-                // still armed => poison it
-                let at = chrono::Utc::now();
-                self.0.state = State::Poisoned { at };
-                warn!(at=?at, "poisoning {}", self.0.what);
-            }
-            State::Poisoned { at } => {
-                unreachable!("we fail check_and_arm() if it's in that state: {at}")
-            }
-        }
-    }
-}
-
-#[derive(thiserror::Error, Debug)]
-pub enum Error {
-    #[error("poisoned at {at}: {what}")]
-    Poisoned {
-        what: &'static str,
-        at: chrono::DateTime<chrono::Utc>,
-    },
-}
--- a/libs/utils/src/seqwait.rs
+++ b/libs/utils/src/seqwait.rs
@@ -2,10 +2,11 @@

 use std::cmp::{Eq, Ordering};
 use std::collections::BinaryHeap;
+use std::fmt::Debug;
 use std::mem;
 use std::sync::Mutex;
 use std::time::Duration;
-use tokio::sync::watch::{self, channel};
+use tokio::sync::watch::{channel, Receiver, Sender};
 use tokio::time::timeout;

 /// An error happened while waiting for a number
@@ -34,73 +35,23 @@ pub trait MonotonicCounter<V> {
    fn cnt_value(&self) -> V;
 }

-/// Heap of waiters, lowest numbers pop first.
-struct Waiters<V>
+/// Internal components of a `SeqWait`
+struct SeqWaitInt<S, V>
 where
+    S: MonotonicCounter<V>,
    V: Ord,
 {
-    heap: BinaryHeap<Waiter<V>>,
-    /// Number of the first waiter in the heap, or None if there are no waiters.
-    status_channel: watch::Sender<Option<V>>,
-}
-
-impl<V> Waiters<V>
-where
-    V: Ord + Copy,
-{
-    fn new() -> Self {
-        Waiters {
-            heap: BinaryHeap::new(),
-            status_channel: channel(None).0,
-        }
-    }
-
-    /// `status_channel` contains the number of the first waiter in the heap.
-    /// This function should be called whenever waiters heap changes.
-    fn update_status(&self) {
-        let first_waiter = self.heap.peek().map(|w| w.wake_num);
-        let _ = self.status_channel.send_replace(first_waiter);
-    }
-
-    /// Add new waiter to the heap, return a channel that will be notified when the number arrives.
-    fn add(&mut self, num: V) -> watch::Receiver<()> {
-        let (tx, rx) = channel(());
-        self.heap.push(Waiter {
-            wake_num: num,
-            wake_channel: tx,
-        });
-        self.update_status();
-        rx
-    }
-
-    /// Pop all waiters <= num from the heap. Collect channels in a vector,
-    /// so that caller can wake them up.
-    fn pop_leq(&mut self, num: V) -> Vec<watch::Sender<()>> {
-        let mut wake_these = Vec::new();
-        while let Some(n) = self.heap.peek() {
-            if n.wake_num > num {
-                break;
-            }
-            wake_these.push(self.heap.pop().unwrap().wake_channel);
-        }
-        self.update_status();
-        wake_these
-    }
-
-    /// Used on shutdown to efficiently drop all waiters.
-    fn take_all(&mut self) -> BinaryHeap<Waiter<V>> {
-        let heap = mem::take(&mut self.heap);
-        self.update_status();
-        heap
-    }
+    waiters: BinaryHeap<Waiter<V>>,
+    current: S,
+    shutdown: bool,
 }

 struct Waiter<T>
 where
    T: Ord,
 {
-    wake_num: T,                     // wake me when this number arrives ...
-    wake_channel: watch::Sender<()>, // ... by sending a message to this channel
+    wake_num: T,              // wake me when this number arrives ...
+    wake_channel: Sender<()>, // ... by sending a message to this channel
 }

 // BinaryHeap is a max-heap, and we want a min-heap. Reverse the ordering here
@@ -125,17 +76,6 @@ impl<T: Ord> PartialEq for Waiter<T> {

 impl<T: Ord> Eq for Waiter<T> {}

-/// Internal components of a `SeqWait`
-struct SeqWaitInt<S, V>
-where
-    S: MonotonicCounter<V>,
-    V: Ord,
-{
-    waiters: Waiters<V>,
-    current: S,
-    shutdown: bool,
-}
-
 /// A tool for waiting on a sequence number
 ///
 /// This provides a way to wait the arrival of a number.
@@ -168,7 +108,7 @@ where
    /// Create a new `SeqWait`, initialized to a particular number
    pub fn new(starting_num: S) -> Self {
        let internal = SeqWaitInt {
-            waiters: Waiters::new(),
+            waiters: BinaryHeap::new(),
            current: starting_num,
            shutdown: false,
        };
@@ -188,8 +128,9 @@ where
            // Block any future waiters from starting
            internal.shutdown = true;

-            // Take all waiters to drop them later.
-            internal.waiters.take_all()
+            // This will steal the entire waiters heap.
+            // When we drop it all waiters will be woken.
+            mem::take(&mut internal.waiters)

            // Drop the lock as we exit this scope.
        };
@@ -241,21 +182,9 @@ where
        }
    }

-    /// Check if [`Self::wait_for`] or [`Self::wait_for_timeout`] would wait if called with `num`.
-    pub fn would_wait_for(&self, num: V) -> Result<(), V> {
-        let internal = self.internal.lock().unwrap();
-        let cnt = internal.current.cnt_value();
-        drop(internal);
-        if cnt >= num {
-            Ok(())
-        } else {
-            Err(cnt)
-        }
-    }
-
    /// Register and return a channel that will be notified when a number arrives,
    /// or None, if it has already arrived.
-    fn queue_for_wait(&self, num: V) -> Result<Option<watch::Receiver<()>>, SeqWaitError> {
+    fn queue_for_wait(&self, num: V) -> Result<Option<Receiver<()>>, SeqWaitError> {
        let mut internal = self.internal.lock().unwrap();
        if internal.current.cnt_value() >= num {
            return Ok(None);
@@ -264,8 +193,12 @@ where
            return Err(SeqWaitError::Shutdown);
        }

-        // Add waiter channel to the queue.
-        let rx = internal.waiters.add(num);
+        // Create a new channel.
+        let (tx, rx) = channel(());
+        internal.waiters.push(Waiter {
+            wake_num: num,
+            wake_channel: tx,
+        });
        // Drop the lock as we exit this scope.
        Ok(Some(rx))
    }
@@ -286,8 +219,16 @@ where
            }
            internal.current.cnt_advance(num);

-            // Pop all waiters <= num from the heap.
-            internal.waiters.pop_leq(num)
+            // Pop all waiters <= num from the heap. Collect them in a vector, and
+            // wake them up after releasing the lock.
+            let mut wake_these = Vec::new();
+            while let Some(n) = internal.waiters.peek() {
+                if n.wake_num > num {
+                    break;
+                }
+                wake_these.push(internal.waiters.pop().unwrap().wake_channel);
+            }
+            wake_these
        };

        for tx in wake_these {
@@ -302,23 +243,6 @@ where
    pub fn load(&self) -> S {
        self.internal.lock().unwrap().current
    }
-
-    /// Get a Receiver for the current status.
-    ///
-    /// The current status is the number of the first waiter in the queue,
-    /// or None if there are no waiters.
-    ///
-    /// This receiver will be notified whenever the status changes.
-    /// It is useful for receiving notifications when the first waiter
-    /// starts waiting for a number, or when there are no more waiters left.
-    pub fn status_receiver(&self) -> watch::Receiver<Option<V>> {
-        self.internal
-            .lock()
-            .unwrap()
-            .waiters
-            .status_channel
-            .subscribe()
-    }
 }

 #[cfg(test)]
--- a/libs/utils/src/serde_system_time.rs
+++ b/libs/utils/src/serde_system_time.rs
@@ -1,55 +0,0 @@
-//! A `serde::{Deserialize,Serialize}` type for SystemTime with RFC3339 format and millisecond precision.
-
-#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Ord, serde::Serialize, serde::Deserialize)]
-#[serde(transparent)]
-pub struct SystemTime(
-    #[serde(
-        deserialize_with = "deser_rfc3339_millis",
-        serialize_with = "ser_rfc3339_millis"
-    )]
-    pub std::time::SystemTime,
-);
-
-fn ser_rfc3339_millis<S: serde::ser::Serializer>(
-    ts: &std::time::SystemTime,
-    serializer: S,
-) -> Result<S::Ok, S::Error> {
-    serializer.collect_str(&humantime::format_rfc3339_millis(*ts))
-}
-
-fn deser_rfc3339_millis<'de, D>(deserializer: D) -> Result<std::time::SystemTime, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-{
-    let s: String = serde::de::Deserialize::deserialize(deserializer)?;
-    humantime::parse_rfc3339(&s).map_err(serde::de::Error::custom)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    /// Helper function to make a SystemTime have millisecond precision by truncating additional nanoseconds.
-    fn to_millisecond_precision(time: SystemTime) -> SystemTime {
-        match time.0.duration_since(std::time::SystemTime::UNIX_EPOCH) {
-            Ok(duration) => {
-                let total_millis = duration.as_secs() * 1_000 + u64::from(duration.subsec_millis());
-                SystemTime(
-                    std::time::SystemTime::UNIX_EPOCH
-                        + std::time::Duration::from_millis(total_millis),
-                )
-            }
-            Err(_) => time,
-        }
-    }
-
-    #[test]
-    fn test_serialize_deserialize() {
-        let input = SystemTime(std::time::SystemTime::now());
-        let expected_serialized = format!("\"{}\"", humantime::format_rfc3339_millis(input.0));
-        let serialized = serde_json::to_string(&input).unwrap();
-        assert_eq!(expected_serialized, serialized);
-        let deserialized: SystemTime = serde_json::from_str(&expected_serialized).unwrap();
-        assert_eq!(to_millisecond_precision(input), deserialized);
-    }
-}
--- a/Show More
+++ b/Show More