Add safekeeper option to patch control file.

https://github.com/neondatabase/neon/issues/6397
Remove rename in parameters (#6411)
2026-07-02 03:30:37 +00:00 · 2024-01-21 00:22:30 +03:00 · 2024-01-20 10:20:53 +00:00 · 2024-01-20 09:38:11 +00:00 · 2024-01-19 19:16:01 +00:00 · 2024-01-19 20:11:24 +04:00
215 changed files with 15821 additions and 5638 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -105,11 +105,11 @@ jobs:
      - name: Install Python deps
        run: ./scripts/pysync

-      - name: Run ruff to ensure code format
-        run: poetry run ruff .
+      - name: Run `ruff check` to ensure code format
+        run: poetry run ruff check .

-      - name: Run black to ensure code format
-        run: poetry run black --diff --check .
+      - name: Run `ruff format` to ensure code format
+        run: poetry run ruff format --check .

      - name: Run mypy to check types
        run: poetry run mypy .
@@ -1131,7 +1131,7 @@ jobs:
            # TODO: move deployPreprodRegion to release (`"$GITHUB_REF_NAME" == "release"` block), once Staging support different compute tag prefixes for different regions
            gh workflow --repo neondatabase/aws run deploy-dev.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f deployPreprodRegion=true
          elif [[ "$GITHUB_REF_NAME" == "release" ]]; then
-            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}} -f disclamerAcknowledged=true
+            gh workflow --repo neondatabase/aws run deploy-prod.yml --ref main -f branch=main -f dockerTag=${{needs.tag.outputs.build-tag}}
          else
            echo "GITHUB_REF_NAME (value '$GITHUB_REF_NAME') is not set to either 'main' or 'release'"
            exit 1
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -30,6 +30,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cd7d5a2cecb58716e47d67d5703a249964b14c7be1ec3cad3affc295b2d1c35d"
 dependencies = [
 "cfg-if",
+ "const-random",
+ "getrandom 0.2.11",
 "once_cell",
 "version_check",
 "zerocopy",
@@ -50,6 +52,12 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "0942ffc6dcaadf03badf6e6a2d0228460359d5e34b57ccdc720b7382dfbd5ec5"

+[[package]]
+name = "android-tzdata"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0"
+
 [[package]]
 name = "android_system_properties"
 version = "0.1.5"
@@ -247,6 +255,12 @@ dependencies = [
 "syn 2.0.32",
 ]

+[[package]]
+name = "atomic"
+version = "0.5.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c59bdb34bc650a32731b31bd8f0829cc15d24a708ee31559e0bb34f2bc320cba"
+
 [[package]]
 name = "atomic-polyfill"
 version = "1.0.2"
@@ -256,6 +270,32 @@ dependencies = [
 "critical-section",
 ]

+[[package]]
+name = "attachment_service"
+version = "0.1.0"
+dependencies = [
+ "anyhow",
+ "camino",
+ "clap",
+ "control_plane",
+ "futures",
+ "git-version",
+ "hyper",
+ "metrics",
+ "pageserver_api",
+ "pageserver_client",
+ "postgres_backend",
+ "postgres_connection",
+ "serde",
+ "serde_json",
+ "thiserror",
+ "tokio",
+ "tokio-util",
+ "tracing",
+ "utils",
+ "workspace_hack",
+]
+
 [[package]]
 name = "autocfg"
 version = "1.1.0"
@@ -1011,17 +1051,17 @@ dependencies = [

 [[package]]
 name = "chrono"
-version = "0.4.24"
+version = "0.4.31"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4e3c5919066adf22df73762e50cffcde3a758f2a848b113b586d1f86728b673b"
+checksum = "7f2c685bad3eb3d45a01354cedb7d5faa66194d1d58ba6e267a8de788f79db38"
 dependencies = [
+ "android-tzdata",
 "iana-time-zone",
 "js-sys",
- "num-integer",
 "num-traits",
 "serde",
 "wasm-bindgen",
- "winapi",
+ "windows-targets 0.48.0",
 ]

 [[package]]
@@ -1120,6 +1160,20 @@ version = "1.0.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "acbf1af155f9b9ef647e42cdc158db4b64a1b61f743629225fde6f3e0be2a7c7"

+[[package]]
+name = "combine"
+version = "4.6.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
+dependencies = [
+ "bytes",
+ "futures-core",
+ "memchr",
+ "pin-project-lite",
+ "tokio",
+ "tokio-util",
+]
+
 [[package]]
 name = "comfy-table"
 version = "6.1.4"
@@ -1720,6 +1774,12 @@ dependencies = [
 "termcolor",
 ]

+[[package]]
+name = "equivalent"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5"
+
 [[package]]
 name = "errno"
 version = "0.3.1"
@@ -2078,9 +2138,9 @@ dependencies = [

 [[package]]
 name = "h2"
-version = "0.3.19"
+version = "0.3.24"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "d357c7ae988e7d2182f7d7871d0b963962420b0678b0997ce7de72001aeab782"
+checksum = "bb2c4422095b67ee78da96fbb51a4cc413b3b25883c7717ff7ca1ab31022c9c9"
 dependencies = [
 "bytes",
 "fnv",
@@ -2088,7 +2148,7 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "http",
- "indexmap",
+ "indexmap 2.0.1",
 "slab",
 "tokio",
 "tokio-util",
@@ -2424,6 +2484,16 @@ dependencies = [
 "serde",
 ]

+[[package]]
+name = "indexmap"
+version = "2.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ad227c3af19d4914570ad36d30409928b75967c298feb9ea1969db3a610bb14e"
+dependencies = [
+ "equivalent",
+ "hashbrown 0.14.0",
+]
+
 [[package]]
 name = "infer"
 version = "0.2.3"
@@ -2475,6 +2545,12 @@ dependencies = [
 "web-sys",
 ]

+[[package]]
+name = "integer-encoding"
+version = "3.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8bb03732005da905c88227371639bf1ad885cc712789c011c31c5fb3ab3ccf02"
+
 [[package]]
 name = "io-lifetimes"
 version = "1.0.11"
@@ -2838,6 +2914,19 @@ dependencies = [
 "winapi",
 ]

+[[package]]
+name = "num"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b05180d69e3da0e530ba2a1dae5110317e49e3b7f3d41be227dc5f92e49ee7af"
+dependencies = [
+ "num-complex",
+ "num-integer",
+ "num-iter",
+ "num-rational",
+ "num-traits",
+]
+
 [[package]]
 name = "num-bigint"
 version = "0.4.3"
@@ -2849,6 +2938,15 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "num-complex"
+version = "0.4.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "1ba157ca0885411de85d6ca030ba7e2a83a28636056c7c699b07c8b6f7383214"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "num-integer"
 version = "0.1.45"
@@ -2859,6 +2957,28 @@ dependencies = [
 "num-traits",
 ]

+[[package]]
+name = "num-iter"
+version = "0.1.43"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7d03e6c028c5dc5cac6e2dec0efda81fc887605bb3d884578bb6d6bf7514e252"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
+[[package]]
+name = "num-rational"
+version = "0.4.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "0638a1c9d0a3c0914158145bc76cff373a75a627e6ecbfb71cbe6f453a5a19b0"
+dependencies = [
+ "autocfg",
+ "num-integer",
+ "num-traits",
+]
+
 [[package]]
 name = "num-traits"
 version = "0.2.15"
@@ -3052,7 +3172,7 @@ dependencies = [
 "fnv",
 "futures-channel",
 "futures-util",
- "indexmap",
+ "indexmap 1.9.3",
 "once_cell",
 "pin-project-lite",
 "thiserror",
@@ -3081,6 +3201,15 @@ dependencies = [
 "tokio-stream",
 ]

+[[package]]
+name = "ordered-float"
+version = "2.10.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "68f19d67e5a2795c94e73e0bb1cc1a7edeb2e28efd39e2e1c9b7a40c1108b11c"
+dependencies = [
+ "num-traits",
+]
+
 [[package]]
 name = "ordered-multimap"
 version = "0.7.1"
@@ -3124,18 +3253,19 @@ name = "pagebench"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "camino",
 "clap",
 "futures",
 "hdrhistogram",
 "humantime",
 "humantime-serde",
- "pageserver",
 "pageserver_api",
 "pageserver_client",
 "rand 0.8.5",
 "serde",
 "serde_json",
 "tokio",
+ "tokio-util",
 "tracing",
 "utils",
 "workspace_hack",
@@ -3252,6 +3382,7 @@ dependencies = [
 "const_format",
 "enum-map",
 "hex",
+ "humantime-serde",
 "postgres_ffi",
 "rand 0.8.5",
 "serde",
@@ -3339,6 +3470,35 @@ dependencies = [
 "windows-targets 0.48.0",
 ]

+[[package]]
+name = "parquet"
+version = "49.0.0"
+source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+dependencies = [
+ "ahash",
+ "bytes",
+ "chrono",
+ "hashbrown 0.14.0",
+ "num",
+ "num-bigint",
+ "paste",
+ "seq-macro",
+ "thrift",
+ "twox-hash",
+ "zstd",
+]
+
+[[package]]
+name = "parquet_derive"
+version = "49.0.0"
+source = "git+https://github.com/neondatabase/arrow-rs?branch=neon-fix-bugs#8a0bc58aa67b98aabbd8eee7c6ca4281967ff9e9"
+dependencies = [
+ "parquet",
+ "proc-macro2",
+ "quote",
+ "syn 2.0.32",
+]
+
 [[package]]
 name = "password-hash"
 version = "0.5.0"
@@ -3407,7 +3567,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "4dd7d28ee937e54fe3080c91faa1c3a46c06de6252988a7f4592ba2310ef22a4"
 dependencies = [
 "fixedbitset",
- "indexmap",
+ "indexmap 1.9.3",
 ]

 [[package]]
@@ -3762,6 +3922,8 @@ dependencies = [
 "base64 0.13.1",
 "bstr",
 "bytes",
+ "camino",
+ "camino-tempfile",
 "chrono",
 "clap",
 "consumption_metrics",
@@ -3784,6 +3946,8 @@ dependencies = [
 "once_cell",
 "opentelemetry",
 "parking_lot 0.12.1",
+ "parquet",
+ "parquet_derive",
 "pbkdf2",
 "pin-project-lite",
 "postgres-native-tls",
@@ -3793,7 +3957,9 @@ dependencies = [
 "prometheus",
 "rand 0.8.5",
 "rcgen",
+ "redis",
 "regex",
+ "remote_storage",
 "reqwest",
 "reqwest-middleware",
 "reqwest-retry",
@@ -3825,6 +3991,7 @@ dependencies = [
 "url",
 "utils",
 "uuid",
+ "walkdir",
 "webpki-roots 0.25.2",
 "workspace_hack",
 "x509-parser",
@@ -3954,6 +4121,32 @@ dependencies = [
 "yasna",
 ]

+[[package]]
+name = "redis"
+version = "0.24.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "c580d9cbbe1d1b479e8d67cf9daf6a62c957e6846048408b80b43ac3f6af84cd"
+dependencies = [
+ "async-trait",
+ "bytes",
+ "combine",
+ "futures-util",
+ "itoa",
+ "percent-encoding",
+ "pin-project-lite",
+ "rustls",
+ "rustls-native-certs",
+ "rustls-pemfile",
+ "rustls-webpki 0.101.7",
+ "ryu",
+ "sha1_smol",
+ "socket2 0.4.9",
+ "tokio",
+ "tokio-rustls",
+ "tokio-util",
+ "url",
+]
+
 [[package]]
 name = "redox_syscall"
 version = "0.2.16"
@@ -4405,12 +4598,14 @@ dependencies = [
 "async-stream",
 "aws-config",
 "aws-sdk-s3",
+ "aws-smithy-async",
 "bincode",
 "bytes",
 "chrono",
 "clap",
 "crc32c",
 "either",
+ "futures",
 "futures-util",
 "hex",
 "histogram",
@@ -4473,6 +4668,7 @@ dependencies = [
 "serde",
 "serde_json",
 "serde_with",
+ "sha2",
 "signal-hook",
 "storage_broker",
 "thiserror",
@@ -4679,6 +4875,12 @@ dependencies = [
 "uuid",
 ]

+[[package]]
+name = "seq-macro"
+version = "0.3.5"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
+
 [[package]]
 name = "serde"
 version = "1.0.183"
@@ -4771,7 +4973,7 @@ dependencies = [
 "base64 0.13.1",
 "chrono",
 "hex",
- "indexmap",
+ "indexmap 1.9.3",
 "serde",
 "serde_json",
 "serde_with_macros",
@@ -4801,6 +5003,12 @@ dependencies = [
 "digest",
 ]

+[[package]]
+name = "sha1_smol"
+version = "1.0.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
+
 [[package]]
 name = "sha2"
 version = "0.10.6"
@@ -5199,6 +5407,17 @@ dependencies = [
 "once_cell",
 ]

+[[package]]
+name = "thrift"
+version = "0.17.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09"
+dependencies = [
+ "byteorder",
+ "integer-encoding",
+ "ordered-float",
+]
+
 [[package]]
 name = "time"
 version = "0.3.21"
@@ -5455,7 +5674,7 @@ version = "0.19.10"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2380d56e8670370eee6566b0bfd4265f65b3f432e8c6d85623f728d4fa31f739"
 dependencies = [
- "indexmap",
+ "indexmap 1.9.3",
 "serde",
 "serde_spanned",
 "toml_datetime",
@@ -5547,7 +5766,7 @@ checksum = "b8fa9be0de6cf49e536ce1851f987bd21a43b771b09473c3549a6c853db37c1c"
 dependencies = [
 "futures-core",
 "futures-util",
- "indexmap",
+ "indexmap 1.9.3",
 "pin-project",
 "pin-project-lite",
 "rand 0.8.5",
@@ -5743,6 +5962,16 @@ dependencies = [
 "utf-8",
 ]

+[[package]]
+name = "twox-hash"
+version = "1.6.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675"
+dependencies = [
+ "cfg-if",
+ "static_assertions",
+]
+
 [[package]]
 name = "typenum"
 version = "1.16.0"
@@ -5920,10 +6149,11 @@ dependencies = [

 [[package]]
 name = "uuid"
-version = "1.3.3"
+version = "1.6.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "345444e32442451b267fc254ae85a209c64be56d2890e601a0c37ff0c3c5ecd2"
+checksum = "5e395fcf16a7a3d8127ec99782007af141946b4795001f876d54fb0d55978560"
 dependencies = [
+ "atomic",
 "getrandom 0.2.11",
 "serde",
 ]
@@ -6407,9 +6637,11 @@ dependencies = [
 "futures-sink",
 "futures-util",
 "getrandom 0.2.11",
+ "hashbrown 0.14.0",
 "hex",
 "hmac",
 "hyper",
+ "indexmap 1.9.3",
 "itertools",
 "libc",
 "log",
@@ -6419,6 +6651,7 @@ dependencies = [
 "num-integer",
 "num-traits",
 "once_cell",
+ "parquet",
 "prost",
 "rand 0.8.5",
 "regex",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -3,6 +3,7 @@ resolver = "2"
 members = [
    "compute_tools",
    "control_plane",
+    "control_plane/attachment_service",
    "pageserver",
    "pageserver/ctl",
    "pageserver/client",
@@ -107,11 +108,14 @@ opentelemetry = "0.19.0"
 opentelemetry-otlp = { version = "0.12.0", default_features=false, features = ["http-proto", "trace", "http", "reqwest-client"] }
 opentelemetry-semantic-conventions = "0.11.0"
 parking_lot = "0.12"
+parquet = { version = "49.0.0", default-features = false, features = ["zstd"] }
+parquet_derive = "49.0.0"
 pbkdf2 = { version = "0.12.1", features = ["simple", "std"] }
 pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
+redis = { version = "0.24.0", features = ["tokio-rustls-comp", "keep-alive"] }
 regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
@@ -161,7 +165,7 @@ tracing-error = "0.2.0"
 tracing-opentelemetry = "0.19.0"
 tracing-subscriber = { version = "0.3", default_features = false, features = ["smallvec", "fmt", "tracing-log", "std", "env-filter", "json"] }
 url = "2.2"
-uuid = { version = "1.2", features = ["v4", "serde"] }
+uuid = { version = "1.6.1", features = ["v4", "v7", "serde"] }
 walkdir = "2.3.2"
 webpki-roots = "0.25"
 x509-parser = "0.15"
@@ -215,6 +219,10 @@ tonic-build = "0.9"
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
 tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }

+# bug fixes for UUID
+parquet = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+parquet_derive = { git = "https://github.com/neondatabase/arrow-rs", branch = "neon-fix-bugs" }
+
 ################# Binary contents sections

 [profile.release]
--- a/Dockerfile.buildtools
+++ b/Dockerfile.buildtools
@@ -135,7 +135,7 @@ WORKDIR /home/nonroot

 # Rust
 # Please keep the version of llvm (installed above) in sync with rust llvm (`rustc --version --verbose | grep LLVM`)
-ENV RUSTC_VERSION=1.74.0
+ENV RUSTC_VERSION=1.75.0
 ENV RUSTUP_HOME="/home/nonroot/.rustup"
 ENV PATH="/home/nonroot/.cargo/bin:${PATH}"
 RUN curl -sSO https://static.rust-lang.org/rustup/dist/$(uname -m)-unknown-linux-gnu/rustup-init && whoami && \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -883,8 +883,10 @@ FROM debian:bullseye-slim
 RUN mkdir /var/db && useradd -m -d /var/db/postgres postgres && \
    echo "postgres:test_console_pass" | chpasswd && \
    mkdir /var/db/postgres/compute && mkdir /var/db/postgres/specs && \
+    mkdir /var/db/postgres/pgbouncer && \
    chown -R postgres:postgres /var/db/postgres && \
    chmod 0750 /var/db/postgres/compute && \
+    chmod 0750 /var/db/postgres/pgbouncer && \
    echo '/usr/local/lib' >> /etc/ld.so.conf && /sbin/ldconfig && \
    # create folder for file cache
    mkdir -p -m 777 /neon/cache
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -32,8 +32,6 @@
 //!             -S /var/db/postgres/specs/current.json \
 //!             -b /usr/local/bin/postgres \
 //!             -r http://pg-ext-s3-gateway \
-//!             --pgbouncer-connstr 'host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable'
-//!             --pgbouncer-ini-path /etc/pgbouncer.ini \
 //! ```
 //!
 use std::collections::HashMap;
@@ -112,9 +110,6 @@ fn main() -> Result<()> {
    let spec_json = matches.get_one::<String>("spec");
    let spec_path = matches.get_one::<String>("spec-path");

-    let pgbouncer_connstr = matches.get_one::<String>("pgbouncer-connstr");
-    let pgbouncer_ini_path = matches.get_one::<String>("pgbouncer-ini-path");
-
    // Extract OpenTelemetry context for the startup actions from the
    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
    // tracing context.
@@ -225,15 +220,13 @@ fn main() -> Result<()> {
        ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
        ext_download_progress: RwLock::new(HashMap::new()),
        build_tag,
-        pgbouncer_connstr: pgbouncer_connstr.map(|s| s.to_string()),
-        pgbouncer_ini_path: pgbouncer_ini_path.map(|s| s.to_string()),
    };
    let compute = Arc::new(compute_node);

    // If this is a pooled VM, prewarm before starting HTTP server and becoming
-    // available for binding. Prewarming helps postgres start quicker later,
+    // available for binding. Prewarming helps Postgres start quicker later,
    // because QEMU will already have it's memory allocated from the host, and
-    // the necessary binaries will alreaady be cached.
+    // the necessary binaries will already be cached.
    if !spec_set {
        compute.prewarm_postgres()?;
    }
@@ -276,6 +269,11 @@ fn main() -> Result<()> {

    state.status = ComputeStatus::Init;
    compute.state_changed.notify_all();
+
+    info!(
+        "running compute with features: {:?}",
+        state.pspec.as_ref().unwrap().spec.features
+    );
    drop(state);

    // Launch remaining service threads
@@ -288,7 +286,7 @@ fn main() -> Result<()> {
    let pg = match compute.start_compute(extension_server_port) {
        Ok(pg) => Some(pg),
        Err(err) => {
-            error!("could not start the compute node: {:?}", err);
+            error!("could not start the compute node: {:#}", err);
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
@@ -350,7 +348,7 @@ fn main() -> Result<()> {

    // Wait for the child Postgres process forever. In this state Ctrl+C will
    // propagate to Postgres and it will be shut down as well.
-    if let Some(mut pg) = pg {
+    if let Some((mut pg, logs_handle)) = pg {
        // Startup is finished, exit the startup tracing span
        drop(startup_context_guard);

@@ -358,6 +356,12 @@ fn main() -> Result<()> {
            .wait()
            .expect("failed to start waiting on Postgres process");
        PG_PID.store(0, Ordering::SeqCst);
+
+        // Process has exited, so we can join the logs thread.
+        let _ = logs_handle
+            .join()
+            .map_err(|e| tracing::error!("log thread panicked: {:?}", e));
+
        info!("Postgres exited with code {}, shutting down", ecode);
        exit_code = ecode.code()
    }
@@ -512,23 +516,6 @@ fn cli() -> clap::Command {
                )
                .value_name("FILECACHE_CONNSTR"),
        )
-        .arg(
-            Arg::new("pgbouncer-connstr")
-                .long("pgbouncer-connstr")
-                .default_value(
-                    "host=localhost port=6432 dbname=pgbouncer user=cloud_admin sslmode=disable",
-                )
-                .value_name("PGBOUNCER_CONNSTR"),
-        )
-        .arg(
-            Arg::new("pgbouncer-ini-path")
-                .long("pgbouncer-ini-path")
-                // Note: this doesn't match current path for pgbouncer.ini.
-                // Until we fix it, we need to pass the path explicitly
-                // or this will be effectively no-op.
-                .default_value("/etc/pgbouncer.ini")
-                .value_name("PGBOUNCER_INI_PATH"),
-        )
 }

 /// When compute_ctl is killed, send also termination signal to sync-safekeepers
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -20,7 +20,7 @@ use futures::StreamExt;
 use postgres::{Client, NoTls};
 use tokio;
 use tokio_postgres;
-use tracing::{error, info, instrument, warn};
+use tracing::{debug, error, info, instrument, warn};
 use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

@@ -31,6 +31,7 @@ use utils::measured_stream::MeasuredReader;
 use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
+use crate::logger::inlinify;
 use crate::pg_helpers::*;
 use crate::spec::*;
 use crate::sync_sk::{check_if_synced, ping_safekeeper};
@@ -70,10 +71,6 @@ pub struct ComputeNode {
    // key: ext_archive_name, value: started download time, download_completed?
    pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
    pub build_tag: String,
-    // connection string to pgbouncer to change settings
-    pub pgbouncer_connstr: Option<String>,
-    // path to pgbouncer.ini to change settings
-    pub pgbouncer_ini_path: Option<String>,
 }

 // store some metrics about download size that might impact startup time
@@ -279,7 +276,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
            $$;"#,
        roles_decl, database_decl,
    );
-    info!("Neon superuser created:\n{}", &query);
+    info!("Neon superuser created: {}", inlinify(&query));
    client
        .simple_query(&query)
        .map_err(|e| anyhow::anyhow!(e).context(query))?;
@@ -495,7 +492,7 @@ impl ComputeNode {
    pub fn sync_safekeepers(&self, storage_auth_token: Option<String>) -> Result<Lsn> {
        let start_time = Utc::now();

-        let sync_handle = maybe_cgexec(&self.pgbin)
+        let mut sync_handle = maybe_cgexec(&self.pgbin)
            .args(["--sync-safekeepers"])
            .env("PGDATA", &self.pgdata) // we cannot use -D in this mode
            .envs(if let Some(storage_auth_token) = &storage_auth_token {
@@ -504,18 +501,30 @@ impl ComputeNode {
                vec![]
            })
            .stdout(Stdio::piped())
+            .stderr(Stdio::piped())
            .spawn()
            .expect("postgres --sync-safekeepers failed to start");
        SYNC_SAFEKEEPERS_PID.store(sync_handle.id(), Ordering::SeqCst);

        // `postgres --sync-safekeepers` will print all log output to stderr and
-        // final LSN to stdout. So we pipe only stdout, while stderr will be automatically
-        // redirected to the caller output.
+        // final LSN to stdout. So we leave stdout to collect LSN, while stderr logs
+        // will be collected in a child thread.
+        let stderr = sync_handle
+            .stderr
+            .take()
+            .expect("stderr should be captured");
+        let logs_handle = handle_postgres_logs(stderr);
+
        let sync_output = sync_handle
            .wait_with_output()
            .expect("postgres --sync-safekeepers failed");
        SYNC_SAFEKEEPERS_PID.store(0, Ordering::SeqCst);

+        // Process has exited, so we can join the logs thread.
+        let _ = logs_handle
+            .join()
+            .map_err(|e| tracing::error!("log thread panicked: {:?}", e));
+
        if !sync_output.status.success() {
            anyhow::bail!(
                "postgres --sync-safekeepers exited with non-zero status: {}. stdout: {}",
@@ -652,11 +661,12 @@ impl ComputeNode {

    /// Start Postgres as a child process and manage DBs/roles.
    /// After that this will hang waiting on the postmaster process to exit.
+    /// Returns a handle to the child process and a handle to the logs thread.
    #[instrument(skip_all)]
    pub fn start_postgres(
        &self,
        storage_auth_token: Option<String>,
-    ) -> Result<std::process::Child> {
+    ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
        let pgdata_path = Path::new(&self.pgdata);

        // Run postgres as a child process.
@@ -667,13 +677,18 @@ impl ComputeNode {
            } else {
                vec![]
            })
+            .stderr(Stdio::piped())
            .spawn()
            .expect("cannot start postgres process");
        PG_PID.store(pg.id(), Ordering::SeqCst);

+        // Start a thread to collect logs from stderr.
+        let stderr = pg.stderr.take().expect("stderr should be captured");
+        let logs_handle = handle_postgres_logs(stderr);
+
        wait_for_postgres(&mut pg, pgdata_path)?;

-        Ok(pg)
+        Ok((pg, logs_handle))
    }

    /// Do initial configuration of the already started Postgres.
@@ -750,8 +765,8 @@ impl ComputeNode {
    pub fn reconfigure(&self) -> Result<()> {
        let spec = self.state.lock().unwrap().pspec.clone().unwrap().spec;

-        if let Some(connstr) = &self.pgbouncer_connstr {
-            info!("tuning pgbouncer with connstr: {:?}", connstr);
+        if let Some(ref pgbouncer_settings) = spec.pgbouncer_settings {
+            info!("tuning pgbouncer");

            let rt = tokio::runtime::Builder::new_current_thread()
                .enable_all()
@@ -760,15 +775,9 @@ impl ComputeNode {

            // Spawn a thread to do the tuning,
            // so that we don't block the main thread that starts Postgres.
-            let pgbouncer_settings = spec.pgbouncer_settings.clone();
-            let connstr_clone = connstr.clone();
-            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
+            let pgbouncer_settings = pgbouncer_settings.clone();
            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(
-                    pgbouncer_settings,
-                    &connstr_clone,
-                    pgbouncer_ini_path,
-                ));
+                let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
                if let Err(err) = res {
                    error!("error while tuning pgbouncer: {err:?}");
                }
@@ -818,7 +827,10 @@ impl ComputeNode {
    }

    #[instrument(skip_all)]
-    pub fn start_compute(&self, extension_server_port: u16) -> Result<std::process::Child> {
+    pub fn start_compute(
+        &self,
+        extension_server_port: u16,
+    ) -> Result<(std::process::Child, std::thread::JoinHandle<()>)> {
        let compute_state = self.state.lock().unwrap().clone();
        let pspec = compute_state.pspec.as_ref().expect("spec must be set");
        info!(
@@ -830,8 +842,8 @@ impl ComputeNode {
        );

        // tune pgbouncer
-        if let Some(connstr) = &self.pgbouncer_connstr {
-            info!("tuning pgbouncer with connstr: {:?}", connstr);
+        if let Some(pgbouncer_settings) = &pspec.spec.pgbouncer_settings {
+            info!("tuning pgbouncer");

            let rt = tokio::runtime::Builder::new_current_thread()
                .enable_all()
@@ -840,15 +852,9 @@ impl ComputeNode {

            // Spawn a thread to do the tuning,
            // so that we don't block the main thread that starts Postgres.
-            let pgbouncer_settings = pspec.spec.pgbouncer_settings.clone();
-            let connstr_clone = connstr.clone();
-            let pgbouncer_ini_path = self.pgbouncer_ini_path.clone();
+            let pgbouncer_settings = pgbouncer_settings.clone();
            let _handle = thread::spawn(move || {
-                let res = rt.block_on(tune_pgbouncer(
-                    pgbouncer_settings,
-                    &connstr_clone,
-                    pgbouncer_ini_path,
-                ));
+                let res = rt.block_on(tune_pgbouncer(pgbouncer_settings));
                if let Err(err) = res {
                    error!("error while tuning pgbouncer: {err:?}");
                }
@@ -889,7 +895,7 @@ impl ComputeNode {
        self.prepare_pgdata(&compute_state, extension_server_port)?;

        let start_time = Utc::now();
-        let pg = self.start_postgres(pspec.storage_auth_token.clone())?;
+        let pg_process = self.start_postgres(pspec.storage_auth_token.clone())?;

        let config_time = Utc::now();
        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
@@ -939,7 +945,17 @@ impl ComputeNode {
        };
        info!(?metrics, "compute start finished");

-        Ok(pg)
+        Ok(pg_process)
+    }
+
+    /// Update the `last_active` in the shared state, but ensure that it's a more recent one.
+    pub fn update_last_active(&self, last_active: Option<DateTime<Utc>>) {
+        let mut state = self.state.lock().unwrap();
+        // NB: `Some(<DateTime>)` is always greater than `None`.
+        if last_active > state.last_active {
+            state.last_active = last_active;
+            debug!("set the last compute activity time to: {:?}", last_active);
+        }
    }

    // Look for core dumps and collect backtraces.
--- a/compute_tools/src/logger.rs
+++ b/compute_tools/src/logger.rs
@@ -38,3 +38,9 @@ pub fn init_tracing_and_logging(default_log_level: &str) -> anyhow::Result<()> {

    Ok(())
 }
+
+/// Replace all newline characters with a special character to make it
+/// easier to grep for log messages.
+pub fn inlinify(s: &str) -> String {
+    s.replace('\n', "\u{200B}")
+}
--- a/compute_tools/src/monitor.rs
+++ b/compute_tools/src/monitor.rs
@@ -3,97 +3,165 @@ use std::{thread, time::Duration};

 use chrono::{DateTime, Utc};
 use postgres::{Client, NoTls};
-use tracing::{debug, info};
+use tracing::{debug, error, info, warn};

 use crate::compute::ComputeNode;
+use compute_api::responses::ComputeStatus;
+use compute_api::spec::ComputeFeature;

 const MONITOR_CHECK_INTERVAL: Duration = Duration::from_millis(500);

 // Spin in a loop and figure out the last activity time in the Postgres.
 // Then update it in the shared state. This function never errors out.
-// XXX: the only expected panic is at `RwLock` unwrap().
+// NB: the only expected panic is at `Mutex` unwrap(), all other errors
+// should be handled gracefully.
 fn watch_compute_activity(compute: &ComputeNode) {
    // Suppose that `connstr` doesn't change
    let connstr = compute.connstr.as_str();
+
+    // During startup and configuration we connect to every Postgres database,
+    // but we don't want to count this as some user activity. So wait until
+    // the compute fully started before monitoring activity.
+    wait_for_postgres_start(compute);
+
    // Define `client` outside of the loop to reuse existing connection if it's active.
    let mut client = Client::connect(connstr, NoTls);

-    info!("watching Postgres activity at {}", connstr);
+    let mut sleep = false;
+    let mut prev_active_time: Option<f64> = None;
+    let mut prev_sessions: Option<i64> = None;
+
+    if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
+        info!("starting experimental activity monitor for {}", connstr);
+    } else {
+        info!("starting activity monitor for {}", connstr);
+    }

    loop {
-        // Should be outside of the write lock to allow others to read while we sleep.
-        thread::sleep(MONITOR_CHECK_INTERVAL);
+        // We use `continue` a lot, so it's more convenient to sleep at the top of the loop.
+        // But skip the first sleep, so we can connect to Postgres immediately.
+        if sleep {
+            // Should be outside of the mutex lock to allow others to read while we sleep.
+            thread::sleep(MONITOR_CHECK_INTERVAL);
+        } else {
+            sleep = true;
+        }

        match &mut client {
            Ok(cli) => {
                if cli.is_closed() {
-                    info!("connection to postgres closed, trying to reconnect");
+                    info!("connection to Postgres is closed, trying to reconnect");

                    // Connection is closed, reconnect and try again.
                    client = Client::connect(connstr, NoTls);
                    continue;
                }

-                // Get all running client backends except ourself, use RFC3339 DateTime format.
-                let backends = cli
-                    .query(
-                        "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
-                         FROM pg_stat_activity
-                         WHERE backend_type = 'client backend'
-                            AND pid != pg_backend_pid()
-                            AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
-                        &[],
-                    );
-                let mut last_active = compute.state.lock().unwrap().last_active;
+                // This is a new logic, only enable if the feature flag is set.
+                // TODO: remove this once we are sure that it works OR drop it altogether.
+                if compute.has_feature(ComputeFeature::ActivityMonitorExperimental) {
+                    // First, check if the total active time or sessions across all databases has changed.
+                    // If it did, it means that user executed some queries. In theory, it can even go down if
+                    // some databases were dropped, but it's still a user activity.
+                    match get_database_stats(cli) {
+                        Ok((active_time, sessions)) => {
+                            let mut detected_activity = false;

-                if let Ok(backs) = backends {
-                    let mut idle_backs: Vec<DateTime<Utc>> = vec![];
-
-                    for b in backs.into_iter() {
-                        let state: String = match b.try_get("state") {
-                            Ok(state) => state,
-                            Err(_) => continue,
-                        };
-
-                        if state == "idle" {
-                            let change: String = match b.try_get("state_change") {
-                                Ok(state_change) => state_change,
-                                Err(_) => continue,
-                            };
-                            let change = DateTime::parse_from_rfc3339(&change);
-                            match change {
-                                Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
-                                Err(e) => {
-                                    info!("cannot parse backend state_change DateTime: {}", e);
-                                    continue;
+                            prev_active_time = match prev_active_time {
+                                Some(prev_active_time) => {
+                                    if active_time != prev_active_time {
+                                        detected_activity = true;
+                                    }
+                                    Some(active_time)
                                }
-                            }
-                        } else {
-                            // Found non-idle backend, so the last activity is NOW.
-                            // Save it and exit the for loop. Also clear the idle backend
-                            // `state_change` timestamps array as it doesn't matter now.
-                            last_active = Some(Utc::now());
-                            idle_backs.clear();
-                            break;
-                        }
-                    }
+                                None => Some(active_time),
+                            };
+                            prev_sessions = match prev_sessions {
+                                Some(prev_sessions) => {
+                                    if sessions != prev_sessions {
+                                        detected_activity = true;
+                                    }
+                                    Some(sessions)
+                                }
+                                None => Some(sessions),
+                            };

-                    // Get idle backend `state_change` with the max timestamp.
-                    if let Some(last) = idle_backs.iter().max() {
-                        last_active = Some(*last);
+                            if detected_activity {
+                                // Update the last active time and continue, we don't need to
+                                // check backends state change.
+                                compute.update_last_active(Some(Utc::now()));
+                                continue;
+                            }
+                        }
+                        Err(e) => {
+                            error!("could not get database statistics: {}", e);
+                            continue;
+                        }
                    }
                }

-                // Update the last activity in the shared state if we got a more recent one.
-                let mut state = compute.state.lock().unwrap();
-                // NB: `Some(<DateTime>)` is always greater than `None`.
-                if last_active > state.last_active {
-                    state.last_active = last_active;
-                    debug!("set the last compute activity time to: {:?}", last_active);
+                // Second, if database statistics are the same, check all backends state change,
+                // maybe some of them have more recent activity. `get_backends_state_change()`
+                // can return None or a stale timestamp, so it's the responsibility of
+                // `compute.update_last_active()` to check that the new timestamp is more recent
+                // than the current one. This helps us to discover new sessions that did nothing yet.
+                match get_backends_state_change(cli) {
+                    Ok(last_active) => {
+                        compute.update_last_active(last_active);
+                    }
+                    Err(e) => {
+                        error!("could not get backends state change: {}", e);
+                    }
+                }
+
+                // Finally, if there are existing (logical) walsenders, do not suspend.
+                //
+                // walproposer doesn't currently show up in pg_stat_replication,
+                // but exclude it explicitly in case it ever does.
+                let ws_count_query = "select count(*) from pg_stat_replication where application_name != 'walproposer';";
+                match cli.query_one(ws_count_query, &[]) {
+                    Ok(r) => match r.try_get::<&str, i64>("count") {
+                        Ok(num_ws) => {
+                            if num_ws > 0 {
+                                compute.update_last_active(Some(Utc::now()));
+                                continue;
+                            }
+                        }
+                        Err(e) => {
+                            warn!("failed to parse walsenders count: {:?}", e);
+                            continue;
+                        }
+                    },
+                    Err(e) => {
+                        warn!("failed to get list of walsenders: {:?}", e);
+                        continue;
+                    }
+                }
+                //
+                // Do not suspend compute if autovacuum is running
+                //
+                let autovacuum_count_query = "select count(*) from pg_stat_activity where backend_type = 'autovacuum worker'";
+                match cli.query_one(autovacuum_count_query, &[]) {
+                    Ok(r) => match r.try_get::<&str, i64>("count") {
+                        Ok(num_workers) => {
+                            if num_workers > 0 {
+                                compute.update_last_active(Some(Utc::now()));
+                                continue;
+                            }
+                        }
+                        Err(e) => {
+                            warn!("failed to parse autovacuum workers count: {:?}", e);
+                            continue;
+                        }
+                    },
+                    Err(e) => {
+                        warn!("failed to get list of autovacuum workers: {:?}", e);
+                        continue;
+                    }
                }
            }
            Err(e) => {
-                debug!("cannot connect to postgres: {}, retrying", e);
+                debug!("could not connect to Postgres: {}, retrying", e);

                // Establish a new connection and try again.
                client = Client::connect(connstr, NoTls);
@@ -102,12 +170,124 @@ fn watch_compute_activity(compute: &ComputeNode) {
    }
 }

+// Hang on condition variable waiting until the compute status is `Running`.
+fn wait_for_postgres_start(compute: &ComputeNode) {
+    let mut state = compute.state.lock().unwrap();
+    while state.status != ComputeStatus::Running {
+        info!("compute is not running, waiting before monitoring activity");
+        state = compute.state_changed.wait(state).unwrap();
+
+        if state.status == ComputeStatus::Running {
+            break;
+        }
+    }
+}
+
+// Figure out the total active time and sessions across all non-system databases.
+// Returned tuple is `(active_time, sessions)`.
+// It can return `0.0` active time or `0` sessions, which means no user databases exist OR
+// it was a start with skipped `pg_catalog` updates and user didn't do any queries
+// (or open any sessions) yet.
+fn get_database_stats(cli: &mut Client) -> anyhow::Result<(f64, i64)> {
+    // Filter out `postgres` database as `compute_ctl` and other monitoring tools
+    // like `postgres_exporter` use it to query Postgres statistics.
+    // Use explicit 8 bytes type casts to match Rust types.
+    let stats = cli.query_one(
+        "SELECT coalesce(sum(active_time), 0.0)::float8 AS total_active_time,
+            coalesce(sum(sessions), 0)::bigint AS total_sessions
+        FROM pg_stat_database
+        WHERE datname NOT IN (
+                'postgres',
+                'template0',
+                'template1'
+            );",
+        &[],
+    );
+    let stats = match stats {
+        Ok(stats) => stats,
+        Err(e) => {
+            return Err(anyhow::anyhow!("could not query active_time: {}", e));
+        }
+    };
+
+    let active_time: f64 = match stats.try_get("total_active_time") {
+        Ok(active_time) => active_time,
+        Err(e) => return Err(anyhow::anyhow!("could not get total_active_time: {}", e)),
+    };
+
+    let sessions: i64 = match stats.try_get("total_sessions") {
+        Ok(sessions) => sessions,
+        Err(e) => return Err(anyhow::anyhow!("could not get total_sessions: {}", e)),
+    };
+
+    Ok((active_time, sessions))
+}
+
+// Figure out the most recent state change time across all client backends.
+// If there is a currently active (non-idle) backend, the timestamp will be `Utc::now()`.
+// It can return `None`, which means no client backends exist or we were
+// unable to parse the timestamp.
+fn get_backends_state_change(cli: &mut Client) -> anyhow::Result<Option<DateTime<Utc>>> {
+    let mut last_active: Option<DateTime<Utc>> = None;
+    // Get all running client backends except ourself, use RFC3339 DateTime format.
+    let backends = cli.query(
+        "SELECT state, to_char(state_change, 'YYYY-MM-DD\"T\"HH24:MI:SS.US\"Z\"') AS state_change
+                FROM pg_stat_activity
+                    WHERE backend_type = 'client backend'
+                    AND pid != pg_backend_pid()
+                    AND usename != 'cloud_admin';", // XXX: find a better way to filter other monitors?
+        &[],
+    );
+
+    match backends {
+        Ok(backs) => {
+            let mut idle_backs: Vec<DateTime<Utc>> = vec![];
+
+            for b in backs.into_iter() {
+                let state: String = match b.try_get("state") {
+                    Ok(state) => state,
+                    Err(_) => continue,
+                };
+
+                if state == "idle" {
+                    let change: String = match b.try_get("state_change") {
+                        Ok(state_change) => state_change,
+                        Err(_) => continue,
+                    };
+                    let change = DateTime::parse_from_rfc3339(&change);
+                    match change {
+                        Ok(t) => idle_backs.push(t.with_timezone(&Utc)),
+                        Err(e) => {
+                            info!("cannot parse backend state_change DateTime: {}", e);
+                            continue;
+                        }
+                    }
+                } else {
+                    // Found non-idle backend, so the last activity is NOW.
+                    // Return immediately, no need to check other backends.
+                    return Ok(Some(Utc::now()));
+                }
+            }
+
+            // Get idle backend `state_change` with the max timestamp.
+            if let Some(last) = idle_backs.iter().max() {
+                last_active = Some(*last);
+            }
+        }
+        Err(e) => {
+            return Err(anyhow::anyhow!("could not query backends: {}", e));
+        }
+    }
+
+    Ok(last_active)
+}
+
 /// Launch a separate compute monitor thread and return its `JoinHandle`.
-pub fn launch_monitor(state: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
-    let state = Arc::clone(state);
+pub fn launch_monitor(compute: &Arc<ComputeNode>) -> thread::JoinHandle<()> {
+    let compute = Arc::clone(compute);

    thread::Builder::new()
        .name("compute-monitor".into())
-        .spawn(move || watch_compute_activity(&state))
+        .spawn(move || watch_compute_activity(&compute))
        .expect("cannot launch compute monitor thread")
 }
--- a/compute_tools/src/pg_helpers.rs
+++ b/compute_tools/src/pg_helpers.rs
@@ -6,12 +6,15 @@ use std::io::{BufRead, BufReader};
 use std::os::unix::fs::PermissionsExt;
 use std::path::Path;
 use std::process::Child;
+use std::thread::JoinHandle;
 use std::time::{Duration, Instant};

 use anyhow::{bail, Result};
 use ini::Ini;
 use notify::{RecursiveMode, Watcher};
 use postgres::{Client, Transaction};
+use tokio::io::AsyncBufReadExt;
+use tokio::time::timeout;
 use tokio_postgres::NoTls;
 use tracing::{debug, error, info, instrument};

@@ -363,7 +366,7 @@ pub fn create_pgdata(pgdata: &str) -> Result<()> {
 }

 /// Update pgbouncer.ini with provided options
-pub fn update_pgbouncer_ini(
+fn update_pgbouncer_ini(
    pgbouncer_config: HashMap<String, String>,
    pgbouncer_ini_path: &str,
 ) -> Result<()> {
@@ -372,6 +375,10 @@ pub fn update_pgbouncer_ini(

    for (option_name, value) in pgbouncer_config.iter() {
        section.insert(option_name, value);
+        debug!(
+            "Updating pgbouncer.ini with new values {}={}",
+            option_name, value
+        );
    }

    conf.write_to_file(pgbouncer_ini_path)?;
@@ -381,46 +388,146 @@ pub fn update_pgbouncer_ini(
 /// Tune pgbouncer.
 /// 1. Apply new config using pgbouncer admin console
 /// 2. Add new values to pgbouncer.ini to preserve them after restart
-pub async fn tune_pgbouncer(
-    pgbouncer_settings: Option<HashMap<String, String>>,
-    pgbouncer_connstr: &str,
-    pgbouncer_ini_path: Option<String>,
-) -> Result<()> {
-    if let Some(pgbouncer_config) = pgbouncer_settings {
-        // Apply new config
-        let connect_result = tokio_postgres::connect(pgbouncer_connstr, NoTls).await;
-        let (client, connection) = connect_result.unwrap();
-        tokio::spawn(async move {
-            if let Err(e) = connection.await {
-                eprintln!("connection error: {}", e);
+pub async fn tune_pgbouncer(pgbouncer_config: HashMap<String, String>) -> Result<()> {
+    let pgbouncer_connstr = if std::env::var_os("AUTOSCALING").is_some() {
+        // for VMs use pgbouncer specific way to connect to
+        // pgbouncer admin console without password
+        // when pgbouncer is running under the same user.
+        "host=/tmp port=6432 dbname=pgbouncer user=pgbouncer".to_string()
+    } else {
+        // for k8s use normal connection string with password
+        // to connect to pgbouncer admin console
+        let mut pgbouncer_connstr =
+            "host=localhost port=6432 dbname=pgbouncer user=postgres sslmode=disable".to_string();
+        if let Ok(pass) = std::env::var("PGBOUNCER_PASSWORD") {
+            pgbouncer_connstr.push_str(format!(" password={}", pass).as_str());
+        }
+        pgbouncer_connstr
+    };
+
+    info!(
+        "Connecting to pgbouncer with connection string: {}",
+        pgbouncer_connstr
+    );
+
+    // connect to pgbouncer, retrying several times
+    // because pgbouncer may not be ready yet
+    let mut retries = 3;
+    let client = loop {
+        match tokio_postgres::connect(&pgbouncer_connstr, NoTls).await {
+            Ok((client, connection)) => {
+                tokio::spawn(async move {
+                    if let Err(e) = connection.await {
+                        eprintln!("connection error: {}", e);
+                    }
+                });
+                break client;
            }
-        });
+            Err(e) => {
+                if retries == 0 {
+                    return Err(e.into());
+                }
+                error!("Failed to connect to pgbouncer: pgbouncer_connstr {}", e);
+                retries -= 1;
+                tokio::time::sleep(Duration::from_secs(1)).await;
+            }
+        }
+    };

-        for (option_name, value) in pgbouncer_config.iter() {
-            info!(
-                "Applying pgbouncer setting change: {} = {}",
-                option_name, value
+    // Apply new config
+    for (option_name, value) in pgbouncer_config.iter() {
+        let query = format!("SET {}={}", option_name, value);
+        // keep this log line for debugging purposes
+        info!("Applying pgbouncer setting change: {}", query);
+
+        if let Err(err) = client.simple_query(&query).await {
+            // Don't fail on error, just print it into log
+            error!(
+                "Failed to apply pgbouncer setting change: {},  {}",
+                query, err
            );
-            let query = format!("SET {} = {}", option_name, value);
+        };
+    }

-            let result = client.simple_query(&query).await;
+    // save values to pgbouncer.ini
+    // so that they are preserved after pgbouncer restart
+    let pgbouncer_ini_path = if std::env::var_os("AUTOSCALING").is_some() {
+        // in VMs we use /etc/pgbouncer.ini
+        "/etc/pgbouncer.ini".to_string()
+    } else {
+        // in pods we use /var/db/postgres/pgbouncer/pgbouncer.ini
+        // this is a shared volume between pgbouncer and postgres containers
+        // FIXME: fix permissions for this file
+        "/var/db/postgres/pgbouncer/pgbouncer.ini".to_string()
+    };
+    update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;

-            info!("Applying pgbouncer setting change: {}", query);
-            info!("pgbouncer setting change result: {:?}", result);
+    Ok(())
+}

-            if let Err(err) = result {
-                // Don't fail on error, just print it into log
-                error!(
-                    "Failed to apply pgbouncer setting change: {},  {}",
-                    query, err
-                );
-            };
+/// Spawn a thread that will read Postgres logs from `stderr`, join multiline logs
+/// and send them to the logger. In the future we may also want to add context to
+/// these logs.
+pub fn handle_postgres_logs(stderr: std::process::ChildStderr) -> JoinHandle<()> {
+    std::thread::spawn(move || {
+        let runtime = tokio::runtime::Builder::new_current_thread()
+            .enable_all()
+            .build()
+            .expect("failed to build tokio runtime");
+
+        let res = runtime.block_on(async move {
+            let stderr = tokio::process::ChildStderr::from_std(stderr)?;
+            handle_postgres_logs_async(stderr).await
+        });
+        if let Err(e) = res {
+            tracing::error!("error while processing postgres logs: {}", e);
+        }
+    })
+}
+
+/// Read Postgres logs from `stderr` until EOF. Buffer is flushed on one of the following conditions:
+/// - next line starts with timestamp
+/// - EOF
+/// - no new lines were written for the last 100 milliseconds
+async fn handle_postgres_logs_async(stderr: tokio::process::ChildStderr) -> Result<()> {
+    let mut lines = tokio::io::BufReader::new(stderr).lines();
+    let timeout_duration = Duration::from_millis(100);
+    let ts_regex =
+        regex::Regex::new(r"^\d+-\d{2}-\d{2} \d{2}:\d{2}:\d{2}").expect("regex is valid");
+
+    let mut buf = vec![];
+    loop {
+        let next_line = timeout(timeout_duration, lines.next_line()).await;
+
+        // we should flush lines from the buffer if we cannot continue reading multiline message
+        let should_flush_buf = match next_line {
+            // Flushing if new line starts with timestamp
+            Ok(Ok(Some(ref line))) => ts_regex.is_match(line),
+            // Flushing on EOF, timeout or error
+            _ => true,
+        };
+
+        if !buf.is_empty() && should_flush_buf {
+            // join multiline message into a single line, separated by unicode Zero Width Space.
+            // "PG:" prefix is used to distinguish postgres logs from other logs.
+            let combined = format!("PG:{}\n", buf.join("\u{200B}"));
+            buf.clear();
+
+            // sync write to stderr to avoid interleaving with other logs
+            use std::io::Write;
+            let res = std::io::stderr().lock().write_all(combined.as_bytes());
+            if let Err(e) = res {
+                tracing::error!("error while writing to stderr: {}", e);
+            }
        }

-        // save values to pgbouncer.ini
-        // so that they are preserved after pgbouncer restart
-        if let Some(pgbouncer_ini_path) = pgbouncer_ini_path {
-            update_pgbouncer_ini(pgbouncer_config, &pgbouncer_ini_path)?;
+        // if not timeout, append line to the buffer
+        if next_line.is_ok() {
+            match next_line?? {
+                Some(line) => buf.push(line),
+                // EOF
+                None => break,
+            };
        }
    }

--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -9,6 +9,7 @@ use reqwest::StatusCode;
 use tracing::{error, info, info_span, instrument, span_enabled, warn, Level};

 use crate::config;
+use crate::logger::inlinify;
 use crate::params::PG_HBA_ALL_MD5;
 use crate::pg_helpers::*;

@@ -189,18 +190,20 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

    // Print a list of existing Postgres roles (only in debug mode)
    if span_enabled!(Level::INFO) {
-        info!("postgres roles:");
+        let mut vec = Vec::new();
        for r in &existing_roles {
-            info!(
-                "    - {}:{}",
+            vec.push(format!(
+                "{}:{}",
                r.name,
                if r.encrypted_password.is_some() {
                    "[FILTERED]"
                } else {
                    "(null)"
                }
-            );
+            ));
        }
+
+        info!("postgres roles (total {}): {:?}", vec.len(), vec);
    }

    // Process delta operations first
@@ -238,7 +241,10 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    // Refresh Postgres roles info to handle possible roles renaming
    let existing_roles: Vec<Role> = get_existing_roles(&mut xact)?;

-    info!("cluster spec roles:");
+    info!(
+        "handling cluster spec roles (total {})",
+        spec.cluster.roles.len()
+    );
    for role in &spec.cluster.roles {
        let name = &role.name;
        // XXX: with a limited number of roles it is fine, but consider making it a HashMap
@@ -301,7 +307,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                    "CREATE ROLE {} INHERIT CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
                    name.pg_quote()
                );
-                info!("role create query: '{}'", &query);
+                info!("running role create query: '{}'", &query);
                query.push_str(&role.to_pg_options());
                xact.execute(query.as_str(), &[])?;
            }
@@ -318,7 +324,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                RoleAction::Create => " -> create",
                RoleAction::Update => " -> update",
            };
-            info!("   - {}:{}{}", name, pwd, action_str);
+            info!(" - {}:{}{}", name, pwd, action_str);
        }
    }

@@ -427,10 +433,11 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {

    // Print a list of existing Postgres databases (only in debug mode)
    if span_enabled!(Level::INFO) {
-        info!("postgres databases:");
+        let mut vec = Vec::new();
        for (dbname, db) in &existing_dbs {
-            info!("    {}:{}", dbname, db.owner);
+            vec.push(format!("{}:{}", dbname, db.owner));
        }
+        info!("postgres databases (total {}): {:?}", vec.len(), vec);
    }

    // Process delta operations first
@@ -502,7 +509,10 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
    // Refresh Postgres databases info to handle possible renames
    let existing_dbs = get_existing_dbs(client)?;

-    info!("cluster spec databases:");
+    info!(
+        "handling cluster spec databases (total {})",
+        spec.cluster.databases.len()
+    );
    for db in &spec.cluster.databases {
        let name = &db.name;
        let pg_db = existing_dbs.get(name);
@@ -561,7 +571,7 @@ pub fn handle_databases(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
                DatabaseAction::Create => " -> create",
                DatabaseAction::Update => " -> update",
            };
-            info!("   - {}:{}{}", db.name, db.owner, action_str);
+            info!(" - {}:{}{}", db.name, db.owner, action_str);
        }
    }

@@ -662,7 +672,11 @@ pub fn handle_grants(spec: &ComputeSpec, client: &mut Client, connstr: &str) ->
            $$;"
        .to_string();

-        info!("grant query for db {} : {}", &db.name, &grant_query);
+        info!(
+            "grant query for db {} : {}",
+            &db.name,
+            inlinify(&grant_query)
+        );
        db_client.simple_query(&grant_query)?;
    }

--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -0,0 +1,32 @@
+[package]
+name = "attachment_service"
+version = "0.1.0"
+edition.workspace = true
+license.workspace = true
+
+[dependencies]
+anyhow.workspace = true
+camino.workspace = true
+clap.workspace = true
+futures.workspace = true
+git-version.workspace = true
+hyper.workspace = true
+pageserver_api.workspace = true
+pageserver_client.workspace = true
+postgres_connection.workspace = true
+serde.workspace = true
+serde_json.workspace = true
+thiserror.workspace = true
+tokio.workspace = true
+tokio-util.workspace = true
+tracing.workspace = true
+
+# TODO: remove this after DB persistence is added, it is only used for
+# a parsing function when loading pageservers from neon_local LocalEnv
+postgres_backend.workspace = true
+
+utils = { path = "../../libs/utils/" }
+metrics = { path = "../../libs/metrics/" }
+control_plane = { path = ".." }
+workspace_hack = { version = "0.1", path = "../../workspace_hack" }
+
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -0,0 +1,116 @@
+use std::collections::HashMap;
+
+use control_plane::endpoint::ComputeControlPlane;
+use control_plane::local_env::LocalEnv;
+use pageserver_api::shard::{ShardCount, ShardIndex, TenantShardId};
+use postgres_connection::parse_host_port;
+use utils::id::{NodeId, TenantId};
+
+/// Per-tenant state held by [`ComputeHook`]: the `(shard, node)` pairs recorded so far
+/// through [`ComputeHook::notify`].
+pub(super) struct ComputeHookTenant {
+    shards: Vec<(ShardIndex, NodeId)>,
+}
+
+impl ComputeHookTenant {
+    /// Reconfigure this tenant's running endpoints if a complete set of shards is known.
+    ///
+    /// Shards that do not carry the highest shard count seen are dropped and the
+    /// remainder is ordered by shard number.  If the number of remaining shards
+    /// matches the shard count (or the shard count is `ShardCount(0)`), every endpoint
+    /// in the neon_local environment that belongs to `tenant_id` and reports status
+    /// `"running"` is reconfigured with the pageserver addresses of those shards.
+    ///
+    /// Returns `Ok(())` without doing anything when there are no shards, when the set
+    /// is incomplete, or when the neon_local config cannot be loaded.  Panics if the
+    /// compute control plane cannot be loaded, if a node is not a known pageserver,
+    /// or if a pageserver's `listen_pg_addr` cannot be parsed.
+    pub(super) async fn maybe_reconfigure(&mut self, tenant_id: TenantId) -> anyhow::Result<()> {
+        // The highest shard count wins; anything else is discarded below.
+        let Some(shard_count) = self.shards.iter().map(|(index, _)| index.shard_count).max()
+        else {
+            // No shards, nothing to do.
+            tracing::info!("ComputeHookTenant::maybe_reconfigure: no shards");
+            return Ok(());
+        };
+
+        self.shards
+            .retain(|(index, _)| index.shard_count == shard_count);
+        self.shards.sort_by_key(|(index, _)| index.shard_number);
+
+        let have_all_shards =
+            self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0);
+        if !have_all_shards {
+            tracing::info!(
+                "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
+                self.shards.len(),
+                shard_count.0
+            );
+            return Ok(());
+        }
+
+        // We have pageservers for all the shards: proceed to reconfigure compute
+        let env = match LocalEnv::load_config() {
+            Ok(loaded) => loaded,
+            Err(e) => {
+                tracing::warn!("Couldn't load neon_local config, skipping compute update ({e})");
+                return Ok(());
+            }
+        };
+        let cplane =
+            ComputeControlPlane::load(env.clone()).expect("Error loading compute control plane");
+
+        // One (host, port) pair per shard, in shard-number order.
+        let mut compute_pageservers = Vec::with_capacity(self.shards.len());
+        for (_, node_id) in &self.shards {
+            let ps_conf = env
+                .get_pageserver_conf(*node_id)
+                .expect("Unknown pageserver");
+            let (pg_host, pg_port) =
+                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+            compute_pageservers.push((pg_host, pg_port.unwrap_or(5432)));
+        }
+
+        for (endpoint_name, endpoint) in &cplane.endpoints {
+            if endpoint.tenant_id != tenant_id || endpoint.status() != "running" {
+                continue;
+            }
+            tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
+            endpoint.reconfigure(compute_pageservers.clone()).await?;
+        }
+
+        Ok(())
+    }
+}
+
+/// The compute hook is a destination for notifications about changes to tenant:pageserver
+/// mapping.  It aggregates updates for the shards in a tenant, and when appropriate reconfigures
+/// the compute connection string.
+pub(super) struct ComputeHook {
+    // Aggregated shard locations, keyed by tenant.  This is an async mutex: `notify`
+    // keeps it locked while awaiting `ComputeHookTenant::maybe_reconfigure`.
+    state: tokio::sync::Mutex<HashMap<TenantId, ComputeHookTenant>>,
+}
+
+impl ComputeHook {
+    /// Create a hook that has not yet been told about any tenant.
+    pub(super) fn new() -> Self {
+        Self {
+            state: tokio::sync::Mutex::new(HashMap::new()),
+        }
+    }
+
+    /// Record that `tenant_shard_id` is now served by `node_id`, then give the owning
+    /// tenant a chance to reconfigure its endpoints.
+    ///
+    /// An existing entry for the same shard index is overwritten; otherwise a new
+    /// entry is appended.  The state lock is held for the whole call, including the
+    /// reconfiguration.  Returns whatever `ComputeHookTenant::maybe_reconfigure` returns.
+    pub(super) async fn notify(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node_id: NodeId,
+    ) -> anyhow::Result<()> {
+        tracing::info!("ComputeHook::notify: {}->{}", tenant_shard_id, node_id);
+        let mut tenants = self.state.lock().await;
+        let tenant = tenants
+            .entry(tenant_shard_id.tenant_id)
+            .or_insert_with(|| ComputeHookTenant { shards: Vec::new() });
+
+        let shard_index = ShardIndex {
+            shard_count: tenant_shard_id.shard_count,
+            shard_number: tenant_shard_id.shard_number,
+        };
+
+        // Entries are only ever appended here when no match exists, so each shard
+        // index appears at most once and updating the first match is sufficient.
+        let known_at = tenant
+            .shards
+            .iter()
+            .position(|(existing, _)| *existing == shard_index);
+        match known_at {
+            Some(slot) => tenant.shards[slot].1 = node_id,
+            None => tenant.shards.push((shard_index, node_id)),
+        }
+
+        tenant.maybe_reconfigure(tenant_shard_id.tenant_id).await
+    }
+}
--- a/control_plane/attachment_service/src/http.rs
+++ b/control_plane/attachment_service/src/http.rs
@@ -0,0 +1,218 @@
+use crate::reconciler::ReconcileError;
+use crate::service::Service;
+use hyper::{Body, Request, Response};
+use hyper::{StatusCode, Uri};
+use pageserver_api::models::{TenantCreateRequest, TimelineCreateRequest};
+use pageserver_api::shard::TenantShardId;
+use std::sync::Arc;
+use utils::auth::SwappableJwtAuth;
+use utils::http::endpoint::{auth_middleware, request_span};
+use utils::http::request::parse_request_param;
+use utils::id::TenantId;
+
+use utils::{
+    http::{
+        endpoint::{self},
+        error::ApiError,
+        json::{json_request, json_response},
+        RequestExt, RouterBuilder,
+    },
+    id::NodeId,
+};
+
+use pageserver_api::control_api::{ReAttachRequest, ValidateRequest};
+
+use control_plane::attachment_service::{
+    AttachHookRequest, InspectRequest, NodeConfigureRequest, NodeRegisterRequest,
+    TenantShardMigrateRequest,
+};
+
+/// State available to HTTP request handlers
+#[derive(Clone)]
+pub struct HttpState {
+    // The service that handlers delegate to.
+    service: Arc<crate::service::Service>,
+    // JWT auth handed to the auth middleware; `make_router` installs that middleware
+    // only when an auth object was supplied.
+    auth: Option<Arc<SwappableJwtAuth>>,
+    // URIs for which the auth middleware callback returns `None` instead of `auth`
+    // (presumably: routes reachable without a token — confirm against `auth_middleware`).
+    allowlist_routes: Vec<Uri>,
+}
+
+impl HttpState {
+    /// Build the handler state for `service`, with `/status` as the only
+    /// allow-listed route.
+    pub fn new(service: Arc<crate::service::Service>, auth: Option<Arc<SwappableJwtAuth>>) -> Self {
+        // Parsing a fixed, known-good literal: a failure here is a programming error.
+        let allowlist_routes = vec!["/status".parse::<Uri>().unwrap()];
+        Self {
+            service,
+            auth,
+            allowlist_routes,
+        }
+    }
+}
+
+/// Fetch the shared [`HttpState`] attached to the router from `request`.
+///
+/// Panics if no `Arc<HttpState>` was registered as router data.
+#[inline(always)]
+fn get_state(request: &Request<Body>) -> &HttpState {
+    let shared = request
+        .data::<Arc<HttpState>>()
+        .expect("unknown state type");
+    shared.as_ref()
+}
+
+/// Pageserver calls into this on startup, to learn which tenants it should attach.
+///
+/// Service errors are reported as `ApiError::InternalServerError`.
+async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let body = json_request::<ReAttachRequest>(&mut req).await?;
+    let service = &get_state(&req).service;
+    let response = service
+        .re_attach(body)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    json_response(StatusCode::OK, response)
+}
+
+/// Pageserver calls into this before doing deletions, to confirm that it still
+/// holds the latest generation for the tenants with deletions enqueued
+async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let body = json_request::<ValidateRequest>(&mut req).await?;
+    let response = get_state(&req).service.validate(body);
+    json_response(StatusCode::OK, response)
+}
+
+/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
+/// (in the real control plane this is unnecessary, because the same program is managing
+///  generation numbers and doing attachments).
+///
+/// Service errors are reported as `ApiError::InternalServerError`.
+async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let body = json_request::<AttachHookRequest>(&mut req).await?;
+    let service = &get_state(&req).service;
+    let response = service
+        .attach_hook(body)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+    json_response(StatusCode::OK, response)
+}
+
+/// Decode an `InspectRequest` from the body and reply with the service's
+/// `inspect` result as JSON.
+async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let body = json_request::<InspectRequest>(&mut req).await?;
+    let response = get_state(&req).service.inspect(body);
+    json_response(StatusCode::OK, response)
+}
+
+/// Decode a `TenantCreateRequest` from the body, hand it to the service and reply
+/// with the service's result as JSON.
+async fn handle_tenant_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let body = json_request::<TenantCreateRequest>(&mut req).await?;
+    let service = &get_state(&req).service;
+    let response = service.tenant_create(body).await?;
+    json_response(StatusCode::OK, response)
+}
+
+/// Create a timeline for the tenant named by the `tenant_id` path parameter, using
+/// the `TimelineCreateRequest` in the body, and reply with the service's result.
+async fn handle_tenant_timeline_create(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    // The path parameter is validated before the body is read.
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let body = json_request::<TimelineCreateRequest>(&mut req).await?;
+
+    let service = &get_state(&req).service;
+    let response = service.tenant_timeline_create(tenant_id, body).await?;
+    json_response(StatusCode::OK, response)
+}
+
+/// Reply with the service's `tenant_locate` result for the tenant named by the
+/// `tenant_id` path parameter.
+async fn handle_tenant_locate(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_id: TenantId = parse_request_param(&req, "tenant_id")?;
+    let located = get_state(&req).service.tenant_locate(tenant_id)?;
+
+    json_response(StatusCode::OK, located)
+}
+
+/// Decode a `NodeRegisterRequest` from the body and register the node with the
+/// service; replies with an empty JSON value on success.
+async fn handle_node_register(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let body = json_request::<NodeRegisterRequest>(&mut req).await?;
+    let service = &get_state(&req).service;
+    service.node_register(body).await?;
+    json_response(StatusCode::OK, ())
+}
+
+/// Apply a `NodeConfigureRequest` to the node named by the `node_id` path parameter.
+///
+/// Fails with `ApiError::BadRequest` when the node id in the body does not match
+/// the one in the path.
+async fn handle_node_configure(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let node_id: NodeId = parse_request_param(&req, "node_id")?;
+    let body = json_request::<NodeConfigureRequest>(&mut req).await?;
+    if node_id != body.node_id {
+        let mismatch = anyhow::anyhow!("Path and body node_id differ");
+        return Err(ApiError::BadRequest(mismatch));
+    }
+
+    let configured = get_state(&req).service.node_configure(body)?;
+    json_response(StatusCode::OK, configured)
+}
+
+/// Pass the `TenantShardMigrateRequest` in the body to the service for the shard
+/// named by the `tenant_shard_id` path parameter, and reply with the service's result.
+async fn handle_tenant_shard_migrate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&req, "tenant_shard_id")?;
+    let body = json_request::<TenantShardMigrateRequest>(&mut req).await?;
+    let service = &get_state(&req).service;
+    let response = service.tenant_shard_migrate(tenant_shard_id, body).await?;
+    json_response(StatusCode::OK, response)
+}
+
+/// Status endpoint is just used for checking that our HTTP listener is up.
+/// It ignores the request and always replies 200 with an empty JSON value.
+async fn handle_status(_req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    json_response(StatusCode::OK, ())
+}
+
+impl From<ReconcileError> for ApiError {
+    fn from(value: ReconcileError) -> Self {
+        ApiError::Conflict(format!("Reconciliation error: {}", value))
+    }
+}
+
+/// Assemble the HTTP router for the attachment service.
+///
+/// When `auth` is provided, an auth middleware is installed whose callback returns
+/// no auth object for allow-listed URIs and the configured auth object for all
+/// others.  The shared [`HttpState`] is attached as router data and every route is
+/// wrapped in `request_span`.
+pub fn make_router(
+    service: Arc<Service>,
+    auth: Option<Arc<SwappableJwtAuth>>,
+) -> RouterBuilder<hyper::Body, ApiError> {
+    let base = endpoint::make_router();
+    let router = if auth.is_some() {
+        base.middleware(auth_middleware(|request| {
+            let state = get_state(request);
+            let allow_listed = state.allowlist_routes.contains(request.uri());
+            if allow_listed {
+                None
+            } else {
+                state.auth.as_deref()
+            }
+        }))
+    } else {
+        base
+    };
+
+    let shared_state = Arc::new(HttpState::new(service, auth));
+    router
+        .data(shared_state)
+        .get("/status", |r| request_span(r, handle_status))
+        .post("/re-attach", |r| request_span(r, handle_re_attach))
+        .post("/validate", |r| request_span(r, handle_validate))
+        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
+        .post("/inspect", |r| request_span(r, handle_inspect))
+        .post("/node", |r| request_span(r, handle_node_register))
+        .put("/node/:node_id/config", |r| {
+            request_span(r, handle_node_configure)
+        })
+        .post("/tenant", |r| request_span(r, handle_tenant_create))
+        .post("/tenant/:tenant_id/timeline", |r| {
+            request_span(r, handle_tenant_timeline_create)
+        })
+        .get("/tenant/:tenant_id/locate", |r| {
+            request_span(r, handle_tenant_locate)
+        })
+        .put("/tenant/:tenant_shard_id/migrate", |r| {
+            request_span(r, handle_tenant_shard_migrate)
+        })
+}
--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -0,0 +1,57 @@
+use serde::{Deserialize, Serialize};
+use utils::seqwait::MonotonicCounter;
+
+mod compute_hook;
+pub mod http;
+mod node;
+pub mod persistence;
+mod reconciler;
+mod scheduler;
+pub mod service;
+mod tenant_state;
+
+/// How a tenant should be placed on pageservers.  The `Default` impl in this
+/// file selects `Double(1)`.
+#[derive(Clone, Serialize, Deserialize)]
+enum PlacementPolicy {
+    /// Cheapest way to attach a tenant: just one pageserver, no secondary
+    Single,
+    /// Production-ready way to attach a tenant: one attached pageserver and
+    /// some number of secondaries.
+    Double(usize),
+}
+
+/// An ordered `u64` counter value.  It starts at 0 (`initial`), is advanced by one
+/// with `next`, and implements `MonotonicCounter` for use with `utils::seqwait`.
+#[derive(Ord, PartialOrd, Eq, PartialEq, Copy, Clone)]
+struct Sequence(u64);
+
+impl Sequence {
+    /// The first sequence value, zero.
+    fn initial() -> Self {
+        Sequence(0)
+    }
+}
+
+/// Displays the sequence as its bare number.
+impl std::fmt::Display for Sequence {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        let Sequence(number) = self;
+        f.write_fmt(format_args!("{}", number))
+    }
+}
+
+/// A `Sequence` acts as its own monotonic counter: the counter value is the
+/// sequence itself.
+impl MonotonicCounter<Sequence> for Sequence {
+    /// Move the counter to `v`.  Panics if `v` is lower than the current value.
+    fn cnt_advance(&mut self, v: Sequence) {
+        assert!(*self <= v);
+        *self = v;
+    }
+    /// Return the current value (a copy).
+    fn cnt_value(&self) -> Sequence {
+        *self
+    }
+}
+
+impl Sequence {
+    /// The sequence value immediately after this one; `self` is left unchanged.
+    fn next(&self) -> Sequence {
+        let Sequence(current) = *self;
+        Self(current + 1)
+    }
+}
+
+/// The default placement is one attached pageserver plus one secondary.
+impl Default for PlacementPolicy {
+    fn default() -> Self {
+        Self::Double(1)
+    }
+}
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -0,0 +1,100 @@
+//! The attachment service mimics the aspects of the control plane API
+//! that are required for a pageserver to operate.
+//!
+//! This enables running & testing pageservers without a full-blown
+//! deployment of the Neon cloud platform.
+//!
+use anyhow::anyhow;
+use attachment_service::http::make_router;
+use attachment_service::persistence::Persistence;
+use attachment_service::service::{Config, Service};
+use camino::Utf8PathBuf;
+use clap::Parser;
+use metrics::launch_timestamp::LaunchTimestamp;
+use std::sync::Arc;
+use utils::auth::{JwtAuth, SwappableJwtAuth};
+use utils::logging::{self, LogFormat};
+use utils::signals::{ShutdownSignals, Signal};
+
+use utils::{project_build_tag, project_git_version, tcp_listener};
+
+project_git_version!(GIT_VERSION);
+project_build_tag!(BUILD_TAG);
+
+// Command-line arguments of the attachment service binary.
+// NOTE: the `///` comments on the fields below are picked up by clap's derive as
+// the per-option help text, so editing them changes the `--help` output.
+#[derive(Parser)]
+#[command(author, version, about, long_about = None)]
+#[command(arg_required_else_help(true))]
+struct Cli {
+    /// Host and port to listen on, like `127.0.0.1:1234`
+    #[arg(short, long)]
+    listen: std::net::SocketAddr,
+
+    /// Path to public key for JWT authentication of clients
+    // When absent, `main` builds the router without an auth object.
+    #[arg(long)]
+    public_key: Option<camino::Utf8PathBuf>,
+
+    /// Token for authenticating this service with the pageservers it controls
+    #[arg(short, long)]
+    jwt_token: Option<String>,
+
+    /// Path to the .json file to store state (will be created if it doesn't exist)
+    #[arg(short, long)]
+    path: Utf8PathBuf,
+}
+
+/// Entry point: initialise logging, load persisted state, start the service and
+/// serve its HTTP API until a shutdown signal arrives.
+///
+/// Any failure during startup (logging init, service spawn, socket bind, key
+/// loading, router construction) is returned as an error instead of panicking.
+#[tokio::main]
+async fn main() -> anyhow::Result<()> {
+    // Leaked on purpose: the launch timestamp lives for the whole process.
+    let launch_ts = Box::leak(Box::new(LaunchTimestamp::generate()));
+
+    logging::init(
+        LogFormat::Plain,
+        logging::TracingErrorLayerEnablement::Disabled,
+        logging::Output::Stdout,
+    )?;
+
+    let args = Cli::parse();
+    tracing::info!(
+        "version: {}, launch_timestamp: {}, build_tag {}, state at {}, listening on {}",
+        GIT_VERSION,
+        launch_ts.to_string(),
+        BUILD_TAG,
+        args.path,
+        args.listen
+    );
+
+    let config = Config {
+        jwt_token: args.jwt_token,
+    };
+
+    let persistence = Arc::new(Persistence::new(&args.path).await);
+
+    let service = Service::spawn(config, persistence).await?;
+
+    let http_listener = tcp_listener::bind(args.listen)?;
+
+    // Client authentication is only enabled when a public key was supplied.
+    let auth = if let Some(public_key_path) = &args.public_key {
+        let jwt_auth = JwtAuth::from_key_path(public_key_path)?;
+        Some(Arc::new(SwappableJwtAuth::new(jwt_auth)))
+    } else {
+        None
+    };
+    let router = make_router(service, auth)
+        .build()
+        .map_err(|err| anyhow!(err))?;
+    // Propagate router-service construction failures the same way as `build()`
+    // above, rather than panicking.
+    let service = utils::http::RouterService::new(router).map_err(|err| anyhow!(err))?;
+    let server = hyper::Server::from_tcp(http_listener)?.serve(service);
+
+    tracing::info!("Serving on {0}", args.listen);
+
+    // Run the server in the background, but do not lose its exit status: if it
+    // stops with an error, say so in the log.
+    tokio::task::spawn(async move {
+        if let Err(err) = server.await {
+            tracing::error!("HTTP server exited with error: {err}");
+        }
+    });
+
+    ShutdownSignals::handle(|signal| match signal {
+        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
+            tracing::info!("Got {}. Terminating", signal.name());
+            // We're just a test helper: no graceful shutdown.
+            std::process::exit(0);
+        }
+    })?;
+
+    Ok(())
+}
--- a/control_plane/attachment_service/src/node.rs
+++ b/control_plane/attachment_service/src/node.rs
@@ -0,0 +1,37 @@
+use control_plane::attachment_service::{NodeAvailability, NodeSchedulingPolicy};
+use utils::id::NodeId;
+
+/// In-memory description of one node: its id, whether it may be used, and the
+/// addresses it listens on.
+#[derive(Clone)]
+pub(crate) struct Node {
+    pub(crate) id: NodeId,
+
+    // Both of these are consulted by `may_schedule`.
+    pub(crate) availability: NodeAvailability,
+    pub(crate) scheduling: NodeSchedulingPolicy,
+
+    // Host and port of the HTTP listener; `base_url` is built from these.
+    pub(crate) listen_http_addr: String,
+    pub(crate) listen_http_port: u16,
+
+    // Host and port of the postgres-protocol listener.
+    pub(crate) listen_pg_addr: String,
+    pub(crate) listen_pg_port: u16,
+}
+
+impl Node {
+    /// The `http://host:port` URL of this node's HTTP listener.
+    pub(crate) fn base_url(&self) -> String {
+        let host = &self.listen_http_addr;
+        let port = self.listen_http_port;
+        format!("http://{}:{}", host, port)
+    }
+
+    /// Is this node eligible to have work scheduled onto it?
+    ///
+    /// An offline node never is.  An available node is eligible when its scheduling
+    /// policy is `Active` or `Filling`, and not when it is `Draining` or `Pause`.
+    pub(crate) fn may_schedule(&self) -> bool {
+        let available = match self.availability {
+            NodeAvailability::Active => true,
+            NodeAvailability::Offline => false,
+        };
+        if !available {
+            return false;
+        }
+
+        match self.scheduling {
+            NodeSchedulingPolicy::Active | NodeSchedulingPolicy::Filling => true,
+            NodeSchedulingPolicy::Draining | NodeSchedulingPolicy::Pause => false,
+        }
+    }
+}
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -0,0 +1,272 @@
+use std::{collections::HashMap, str::FromStr};
+
+use camino::{Utf8Path, Utf8PathBuf};
+use control_plane::{
+    attachment_service::{NodeAvailability, NodeSchedulingPolicy},
+    local_env::LocalEnv,
+};
+use pageserver_api::{
+    models::TenantConfig,
+    shard::{ShardCount, ShardNumber, TenantShardId},
+};
+use postgres_connection::parse_host_port;
+use serde::{Deserialize, Serialize};
+use utils::{
+    generation::Generation,
+    id::{NodeId, TenantId},
+};
+
+use crate::{node::Node, PlacementPolicy};
+
+/// Placeholder for storage.  This will be replaced with a database client.
+///
+/// For now the state lives in memory behind a synchronous mutex and is written
+/// out to a JSON file after each mutation.
+pub struct Persistence {
+    state: std::sync::Mutex<PersistentState>,
+}
+
+// The state that is serialized to, and loaded from, the JSON state file.
+// NOTE(review): this comment previously read "Top level state available to all HTTP
+// handlers"; within this file the struct is only reached through `Persistence` — confirm.
+#[derive(Serialize, Deserialize)]
+struct PersistentState {
+    // One record per tenant shard.
+    tenants: HashMap<TenantShardId, TenantShardPersistence>,
+
+    // Where the state is stored on disk.  Not part of the serialized form: `load`
+    // and `load_or_new` fill it in from the path they were given.
+    #[serde(skip)]
+    path: Utf8PathBuf,
+}
+
+/// A convenience for serializing the state inside a sync lock, and then
+/// writing it to disk outside of the lock.  This will go away when switching
+/// to a database backend.
+struct PendingWrite {
+    // Already-serialized state, ready to be written as-is.
+    bytes: Vec<u8>,
+    // Destination file.
+    path: Utf8PathBuf,
+}
+
+impl PendingWrite {
+    /// Write the serialized bytes to the destination path, returning any I/O error.
+    async fn commit(&self) -> anyhow::Result<()> {
+        let Self { bytes, path } = self;
+        Ok(tokio::fs::write(path, bytes).await?)
+    }
+}
+
+impl PersistentState {
+    /// Serialize the current state to JSON and pair it with the destination path.
+    ///
+    /// Nothing is written here: the caller commits the returned [`PendingWrite`],
+    /// typically after releasing the lock that guards this state.
+    /// Panics if serialization fails.
+    fn save(&self) -> PendingWrite {
+        PendingWrite {
+            bytes: serde_json::to_vec(self).expect("Serialization error"),
+            path: self.path.clone(),
+        }
+    }
+
+    /// Read and decode the state file at `path`.
+    ///
+    /// Records whose `tenant_id` string is empty (files written before that field
+    /// existed) are upgraded in memory with values derived from the map key and
+    /// with default config and placement policy.
+    ///
+    /// Returns an error if the file cannot be read or decoded.
+    async fn load(path: &Utf8Path) -> anyhow::Result<Self> {
+        let bytes = tokio::fs::read(path).await?;
+        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
+        decoded.path = path.to_owned();
+
+        for (tenant_shard_id, tenant) in &mut decoded.tenants {
+            // Backward compat: an old attachments.json from before PR #6251, replace
+            // empty strings with proper defaults.
+            if tenant.tenant_id.is_empty() {
+                // Store only the tenant component of the key: `insert_tenant_shards`
+                // parses this field back with `TenantId::from_str`, so it must never
+                // contain the rendering of a whole shard id.
+                tenant.tenant_id = tenant_shard_id.tenant_id.to_string();
+                tenant.config = serde_json::to_string(&TenantConfig::default())?;
+                tenant.placement_policy = serde_json::to_string(&PlacementPolicy::default())?;
+            }
+        }
+
+        Ok(decoded)
+    }
+
+    /// Load the state file at `path`, or start with empty state if it does not exist.
+    ///
+    /// Panics on any other load failure (unreadable or undecodable file).
+    async fn load_or_new(path: &Utf8Path) -> Self {
+        match Self::load(path).await {
+            Ok(s) => {
+                tracing::info!("Loaded state file at {}", path);
+                s
+            }
+            // Only "file not found" counts as a fresh start.
+            Err(e)
+                if e.downcast_ref::<std::io::Error>()
+                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
+                    .unwrap_or(false) =>
+            {
+                tracing::info!("Will create state file at {}", path);
+                Self {
+                    tenants: HashMap::new(),
+                    path: path.to_owned(),
+                }
+            }
+            Err(e) => {
+                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path)
+            }
+        }
+    }
+}
+
+impl Persistence {
+    /// Open the state stored at `path`, starting empty if the file does not exist.
+    ///
+    /// Panics if an existing file cannot be loaded.
+    pub async fn new(path: &Utf8Path) -> Self {
+        Self {
+            state: std::sync::Mutex::new(PersistentState::load_or_new(path).await),
+        }
+    }
+
+    /// When registering a node, persist it so that on next start we will be able to
+    /// iterate over known nodes to synchronize their tenant shard states with our observed state.
+    ///
+    /// Currently a no-op that always returns `Ok(())`.
+    pub(crate) async fn insert_node(&self, _node: &Node) -> anyhow::Result<()> {
+        // TODO: node persistence will come with database backend
+        Ok(())
+    }
+
+    /// At startup, we populate the service's list of nodes, and use this list to call into
+    /// each node to do an initial reconciliation of the state of the world with our in-memory
+    /// observed state.
+    ///
+    /// Nodes are currently derived from the pageservers in the neon_local `LocalEnv`,
+    /// each reported as active and schedulable.  Missing ports default to 5432 for
+    /// postgres and 80 for HTTP.
+    ///
+    /// Returns an error if the `LocalEnv` config cannot be loaded or if a pageserver's
+    /// listen address cannot be parsed.
+    pub(crate) async fn list_nodes(&self) -> anyhow::Result<Vec<Node>> {
+        let env = LocalEnv::load_config()?;
+        // TODO: node persistence will come with database backend
+
+        // XXX hack: enable test_backward_compatibility to work by populating our list of
+        // nodes from LocalEnv when it is not present in persistent storage.  Otherwise at
+        // first startup in the compat test, we may have shards but no nodes.
+        tracing::info!(
+            "Loaded {} pageserver nodes from LocalEnv",
+            env.pageservers.len()
+        );
+        let mut result = Vec::with_capacity(env.pageservers.len());
+        for ps_conf in env.pageservers {
+            // A malformed address is a configuration problem: report it to the caller
+            // through the Result this function already returns, rather than panicking.
+            let (pg_host, pg_port) = parse_host_port(&ps_conf.listen_pg_addr)
+                .map_err(|e| anyhow::anyhow!("Unable to parse listen_pg_addr: {e:#}"))?;
+            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
+                .map_err(|e| anyhow::anyhow!("Unable to parse listen_http_addr: {e:#}"))?;
+            result.push(Node {
+                id: ps_conf.id,
+                listen_pg_addr: pg_host.to_string(),
+                listen_pg_port: pg_port.unwrap_or(5432),
+                listen_http_addr: http_host.to_string(),
+                listen_http_port: http_port.unwrap_or(80),
+                availability: NodeAvailability::Active,
+                scheduling: NodeSchedulingPolicy::Active,
+            });
+        }
+
+        Ok(result)
+    }
+
+    /// At startup, we populate our map of tenant shards from persistent storage.
+    ///
+    /// Returns a copy of every stored shard record, in no particular order.
+    pub(crate) async fn list_tenant_shards(&self) -> anyhow::Result<Vec<TenantShardPersistence>> {
+        let guard = self.state.lock().unwrap();
+        let shards: Vec<TenantShardPersistence> = guard.tenants.values().cloned().collect();
+        Ok(shards)
+    }
+
+    /// Tenants must be persisted before we schedule them for the first time.  This enables us
+    /// to correctly retain generation monotonicity, and the externally provided placement policy & config.
+    ///
+    /// The whole batch is validated before anything is stored: if any record has an
+    /// unparseable `tenant_id`, or a `shard_number`/`shard_count` outside `0..=255`,
+    /// an error is returned and neither the in-memory state nor the file is changed.
+    /// A record whose shard id is already present replaces the existing one.
+    pub(crate) async fn insert_tenant_shards(
+        &self,
+        shards: Vec<TenantShardPersistence>,
+    ) -> anyhow::Result<()> {
+        // Convert up front, outside the lock.  Doing this lazily while inserting would
+        // let a bad record abort half-way, leaving earlier shards in memory but not on
+        // disk.  Checked conversions also reject values that `as u8` would truncate.
+        let keyed = shards
+            .into_iter()
+            .map(|shard| -> anyhow::Result<(TenantShardId, TenantShardPersistence)> {
+                let tenant_shard_id = TenantShardId {
+                    tenant_id: TenantId::from_str(shard.tenant_id.as_str())?,
+                    shard_number: ShardNumber(u8::try_from(shard.shard_number)?),
+                    shard_count: ShardCount(u8::try_from(shard.shard_count)?),
+                };
+                Ok((tenant_shard_id, shard))
+            })
+            .collect::<anyhow::Result<Vec<_>>>()?;
+
+        let write = {
+            let mut locked = self.state.lock().unwrap();
+            locked.tenants.extend(keyed);
+            locked.save()
+        };
+
+        write.commit().await
+    }
+
+    /// Reconciler calls this immediately before attaching to a new pageserver, to acquire a unique, monotonically
+    /// advancing generation number.  We also store the NodeId for which the generation was issued, so that in
+    /// [`Self::re_attach`] we can do a bulk UPDATE on the generations for that node.
+    ///
+    /// With `node_id == None` the stored generation is returned unchanged and only the
+    /// node association is cleared.  Fails if the shard is unknown or the state file
+    /// cannot be written.
+    pub(crate) async fn increment_generation(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node_id: Option<NodeId>,
+    ) -> anyhow::Result<Generation> {
+        let (pending, issued) = {
+            let mut guard = self.state.lock().unwrap();
+            let issued = match guard.tenants.get_mut(&tenant_shard_id) {
+                None => anyhow::bail!("Tried to increment generation of unknown shard"),
+                Some(shard) => {
+                    // If we're called with a None pageserver, we need only update the generation
+                    // record to disassociate it with this pageserver, not actually increment the number, as
+                    // the increment is guaranteed to happen the next time this tenant is attached.
+                    if node_id.is_some() {
+                        shard.generation += 1;
+                    }
+                    shard.generation_pageserver = node_id;
+                    Generation::new(shard.generation)
+                }
+            };
+            (guard.save(), issued)
+        };
+
+        // The lock is released by now; only the file write happens here.
+        pending.commit().await?;
+        Ok(issued)
+    }
+
+    /// Bump the generation of every shard whose generation was last issued to
+    /// `node_id`, persist the result, and return the new generation per shard.
+    ///
+    /// Shards associated with other nodes (or with none) are left untouched.
+    pub(crate) async fn re_attach(
+        &self,
+        node_id: NodeId,
+    ) -> anyhow::Result<HashMap<TenantShardId, Generation>> {
+        let (pending, bumped) = {
+            let mut guard = self.state.lock().unwrap();
+            let bumped: HashMap<TenantShardId, Generation> = guard
+                .tenants
+                .iter_mut()
+                .filter(|(_, shard)| shard.generation_pageserver == Some(node_id))
+                .map(|(tenant_shard_id, shard)| {
+                    shard.generation += 1;
+                    (*tenant_shard_id, Generation::new(shard.generation))
+                })
+                .collect();
+
+            (guard.save(), bumped)
+        };
+
+        pending.commit().await?;
+        Ok(bumped)
+    }
+
+    // TODO: when we start shard splitting, we must durably mark the tenant so that
+    // on restart, we know that we must go through recovery (list shards that exist
+    // and pick up where we left off and/or revert to parent shards).
+    //
+    // Not implemented yet: calling this panics via `todo!()`.
+    #[allow(dead_code)]
+    pub(crate) async fn begin_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
+        todo!();
+    }
+
+    // TODO: when we finish shard splitting, we must atomically clean up the old shards
+    // and insert the new shards, and clear the splitting marker.
+    //
+    // Not implemented yet: calling this panics via `todo!()`.
+    #[allow(dead_code)]
+    pub(crate) async fn complete_shard_split(&self, _tenant_id: TenantId) -> anyhow::Result<()> {
+        todo!();
+    }
+}
+
+/// Parts of [`crate::tenant_state::TenantState`] that are stored durably
+///
+/// Fields marked `#[serde(default)]` may be missing from older state files;
+/// `PersistentState::load` fills in `tenant_id`, `config` and `placement_policy`
+/// when it finds `tenant_id` empty.
+#[derive(Serialize, Deserialize, Clone)]
+pub(crate) struct TenantShardPersistence {
+    // Textual tenant id; parsed back with `TenantId::from_str` when inserting.
+    #[serde(default)]
+    pub(crate) tenant_id: String,
+    // Stored as i32, converted to the u8-based `ShardNumber`/`ShardCount` when inserting.
+    #[serde(default)]
+    pub(crate) shard_number: i32,
+    #[serde(default)]
+    pub(crate) shard_count: i32,
+    #[serde(default)]
+    pub(crate) shard_stripe_size: i32,
+
+    // Currently attached pageserver
+    // (serialized under the key "pageserver")
+    #[serde(rename = "pageserver")]
+    pub(crate) generation_pageserver: Option<NodeId>,
+
+    // Latest generation number: next time we attach, increment this
+    // and use the incremented number when attaching
+    pub(crate) generation: u32,
+
+    // JSON-encoded `PlacementPolicy` (as written by `PersistentState::load`'s defaults).
+    #[serde(default)]
+    pub(crate) placement_policy: String,
+    // JSON-encoded `TenantConfig` (as written by `PersistentState::load`'s defaults).
+    #[serde(default)]
+    pub(crate) config: String,
+}
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -0,0 +1,495 @@
+use crate::persistence::Persistence;
+use crate::service;
+use control_plane::attachment_service::NodeAvailability;
+use pageserver_api::models::{
+    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
+};
+use pageserver_api::shard::{ShardIdentity, TenantShardId};
+use pageserver_client::mgmt_api;
+use std::collections::HashMap;
+use std::sync::Arc;
+use std::time::Duration;
+use tokio_util::sync::CancellationToken;
+use utils::generation::Generation;
+use utils::id::{NodeId, TimelineId};
+use utils::lsn::Lsn;
+
+use crate::compute_hook::ComputeHook;
+use crate::node::Node;
+use crate::tenant_state::{IntentState, ObservedState, ObservedStateLocation};
+
+/// Object with the lifetime of the background reconcile task that is created
+/// for tenants which have a difference between their intent and observed states.
+pub(super) struct Reconciler {
+    /// See [`crate::tenant_state::TenantState`] for the meanings of these fields: they are a snapshot
+    /// of a tenant's state from when we spawned a reconcile task.
+    pub(super) tenant_shard_id: TenantShardId,
+    pub(crate) shard: ShardIdentity,
+    pub(crate) generation: Generation,
+    pub(crate) intent: IntentState,
+    pub(crate) config: TenantConfig,
+    pub(crate) observed: ObservedState,
+
+    /// Service-level configuration: its `jwt_token` is passed to the pageserver
+    /// management API clients built by this reconciler.
+    pub(crate) service_config: service::Config,
+
+    /// A snapshot of the pageservers as they were when we were asked
+    /// to reconcile.
+    pub(crate) pageservers: Arc<HashMap<NodeId, Node>>,
+
+    /// A hook to notify the running postgres instances when we change the location
+    /// of a tenant
+    pub(crate) compute_hook: Arc<ComputeHook>,
+
+    /// A means to abort background reconciliation: it is essential to
+    /// call this when something changes in the original TenantState that
+    /// will make this reconciliation impossible or unnecessary, for
+    /// example when a pageserver node goes offline, or the PlacementPolicy for
+    /// the tenant is changed.
+    pub(crate) cancel: CancellationToken,
+
+    /// Access to persistent storage for updating generation numbers
+    pub(crate) persistence: Arc<Persistence>,
+}
+
+/// Error returned by [`Reconciler::reconcile`].
+#[derive(thiserror::Error, Debug)]
+pub enum ReconcileError {
+    /// Any other failure; displayed transparently as the wrapped [`anyhow::Error`],
+    /// and convertible from it via `?`.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl Reconciler {
+    /// Apply `config` to this shard's location on pageserver `node_id`, keeping
+    /// `self.observed` in step with what we know about that node.
+    ///
+    /// The observed entry is set to `conf: None` (unknown) before the request is sent and
+    /// only replaced with `Some(config)` once the pageserver call has returned successfully,
+    /// so a failed call leaves the location marked as uncertain.
+    ///
+    /// # Panics
+    /// Panics if `node_id` is not present in the pageserver snapshot.
+    async fn location_config(
+        &mut self,
+        node_id: NodeId,
+        config: LocationConfig,
+        flush_ms: Option<Duration>,
+    ) -> anyhow::Result<()> {
+        let pageserver = self
+            .pageservers
+            .get(&node_id)
+            .expect("Pageserver may not be removed while referenced");
+        let observed_key = pageserver.id;
+
+        // Until the call completes we cannot be sure what the node holds.
+        let uncertain = ObservedStateLocation { conf: None };
+        self.observed.locations.insert(observed_key, uncertain);
+
+        tracing::info!("location_config({}) calling: {:?}", node_id, config);
+        let jwt = self.service_config.jwt_token.as_deref();
+        let client = mgmt_api::Client::new(pageserver.base_url(), jwt);
+        client
+            .location_config(self.tenant_shard_id, config.clone(), flush_ms)
+            .await?;
+        tracing::info!("location_config({}) complete: {:?}", node_id, config);
+
+        // Success: remember exactly what we wrote.
+        let confirmed = ObservedStateLocation { conf: Some(config) };
+        self.observed.locations.insert(observed_key, confirmed);
+
+        Ok(())
+    }
+
+    /// Decide whether a live migration applies and, if so, run it.
+    ///
+    /// A live migration is attempted only when:
+    /// * there is an intended attached node (the destination),
+    /// * the destination is either unobserved or observed in `Secondary` mode, and
+    /// * some other observed location is `AttachedSingle` on a node that is not offline
+    ///   (the origin).
+    ///
+    /// In every other case this returns `Ok(())` without doing anything, leaving the work
+    /// to the general-case reconciliation.
+    async fn maybe_live_migrate(&mut self) -> Result<(), ReconcileError> {
+        let Some(destination) = self.intent.attached else {
+            // No intent to be attached
+            tracing::info!("maybe_live_migrate: no attached intent");
+            return Ok(());
+        };
+
+        // An unobserved destination is fine (nothing attached there).  An observed one
+        // must be a known secondary; attached or uncertain state means no live migration.
+        if let Some(location) = self.observed.locations.get(&destination) {
+            let is_secondary = matches!(
+                &location.conf,
+                Some(conf) if conf.mode == LocationConfigMode::Secondary
+            );
+            if !is_secondary {
+                tracing::info!("maybe_live_migrate: destination is None or attached");
+                return Ok(());
+            }
+        }
+
+        // Pick the first AttachedSingle location whose node is not offline: skipping
+        // offline nodes avoids live-migrating while reconciling after an HA failover.
+        let origin = self
+            .observed
+            .locations
+            .iter()
+            .find_map(|(node_id, state)| {
+                let observed_conf = state.conf.as_ref()?;
+                if observed_conf.mode != LocationConfigMode::AttachedSingle {
+                    return None;
+                }
+                let node = self
+                    .pageservers
+                    .get(node_id)
+                    .expect("Nodes may not be removed while referenced");
+                if matches!(node.availability, NodeAvailability::Offline) {
+                    None
+                } else {
+                    Some(*node_id)
+                }
+            });
+
+        let Some(origin) = origin else {
+            tracing::info!("maybe_live_migrate: no origin found");
+            return Ok(());
+        };
+
+        // Both ends are known: carry out the migration.
+        tracing::info!("Live migrating {}->{}", origin, destination);
+        self.live_migrate(origin, destination).await?;
+
+        Ok(())
+    }
+
+    /// Ask pageserver `node_id` for its timelines of `tenant_shard_id` and return each
+    /// timeline's `last_record_lsn`, keyed by timeline ID.
+    ///
+    /// # Panics
+    /// Panics if `node_id` is not present in the pageserver snapshot.
+    async fn get_lsns(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node_id: &NodeId,
+    ) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
+        let pageserver = self
+            .pageservers
+            .get(node_id)
+            .expect("Pageserver may not be removed while referenced");
+
+        let jwt = self.service_config.jwt_token.as_deref();
+        let client = mgmt_api::Client::new(pageserver.base_url(), jwt);
+
+        let mut lsn_by_timeline = HashMap::new();
+        for timeline in client.timeline_list(&tenant_shard_id).await? {
+            lsn_by_timeline.insert(timeline.timeline_id, timeline.last_record_lsn);
+        }
+        Ok(lsn_by_timeline)
+    }
+
+    /// Best-effort request for pageserver `node_id` to run a secondary download of
+    /// `tenant_shard_id`.
+    ///
+    /// Failures are not propagated: the download is only a warm-up step, so an error is
+    /// logged (including the error itself, so that unexpected failures are not mistaken
+    /// for the benign "not in secondary mode" case) and the caller carries on.
+    ///
+    /// # Panics
+    /// Panics if `node_id` is not present in the pageserver snapshot.
+    async fn secondary_download(&self, tenant_shard_id: TenantShardId, node_id: &NodeId) {
+        let node = self
+            .pageservers
+            .get(node_id)
+            .expect("Pageserver may not be removed while referenced");
+
+        let client =
+            mgmt_api::Client::new(node.base_url(), self.service_config.jwt_token.as_deref());
+
+        match client.tenant_secondary_download(tenant_shard_id).await {
+            Ok(()) => {}
+            Err(e) => {
+                tracing::info!("  (skipping, destination wasn't in secondary mode): {e}")
+            }
+        }
+    }
+
+    /// Poll pageserver `pageserver_id` until every timeline in `baseline` is visible there
+    /// with an LSN that is not behind its baseline value.
+    ///
+    /// Polls every 500ms.  Errors from fetching LSNs are logged and retried rather than
+    /// returned, and a timeline missing on the destination counts as "behind", so this
+    /// loops until the destination has caught up.
+    ///
+    /// The waits use `tokio::time::sleep` so that the async executor thread is yielded
+    /// while waiting instead of being blocked.
+    async fn await_lsn(
+        &self,
+        tenant_shard_id: TenantShardId,
+        pageserver_id: &NodeId,
+        baseline: HashMap<TimelineId, Lsn>,
+    ) -> anyhow::Result<()> {
+        const POLL_INTERVAL: Duration = Duration::from_millis(500);
+
+        loop {
+            let latest = match self.get_lsns(tenant_shard_id, pageserver_id).await {
+                Ok(l) => l,
+                Err(e) => {
+                    tracing::info!(
+                        "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
+                        pageserver_id
+                    );
+                    tokio::time::sleep(POLL_INTERVAL).await;
+                    continue;
+                }
+            };
+
+            let mut any_behind: bool = false;
+            for (timeline_id, baseline_lsn) in &baseline {
+                match latest.get(timeline_id) {
+                    Some(latest_lsn) => {
+                        tracing::info!(
+                            "🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}"
+                        );
+                        if latest_lsn < baseline_lsn {
+                            any_behind = true;
+                        }
+                    }
+                    None => {
+                        // Expected timeline isn't yet visible on migration destination.
+                        // (IRL we would have to account for timeline deletion, but this
+                        //  is just test helper)
+                        any_behind = true;
+                    }
+                }
+            }
+
+            if !any_behind {
+                tracing::info!("✅ LSN caught up.  Proceeding...");
+                break;
+            } else {
+                tokio::time::sleep(POLL_INTERVAL).await;
+            }
+        }
+
+        Ok(())
+    }
+
+    /// Live-migrate this shard's attachment from `origin_ps_id` to `dest_ps_id`.
+    ///
+    /// Steps, in order:
+    /// 1. Switch the origin to `AttachedStale` (passing a 10 second `flush_ms`) and record
+    ///    its per-timeline LSNs as the baseline.
+    /// 2. If the destination is observed in `Secondary` mode, ask it to download the
+    ///    latest layers (best effort).
+    /// 3. Increment the persisted generation and attach the destination as `AttachedMulti`.
+    /// 4. Wait until the destination's LSNs have caught up with the baseline.
+    /// 5. Notify compute to use the destination.
+    /// 6. Downgrade the origin to `Secondary`, then switch the destination to
+    ///    `AttachedSingle`.
+    ///
+    /// Any failing step returns its error immediately, leaving later steps undone.
+    ///
+    /// # Panics
+    /// Panics if `origin_ps_id == dest_ps_id`.
+    pub async fn live_migrate(
+        &mut self,
+        origin_ps_id: NodeId,
+        dest_ps_id: NodeId,
+    ) -> anyhow::Result<()> {
+        // `maybe_live_migrate` is responsible for sanity of inputs
+        assert!(origin_ps_id != dest_ps_id);
+
+        fn build_location_config(
+            shard: &ShardIdentity,
+            config: &TenantConfig,
+            mode: LocationConfigMode,
+            generation: Option<Generation>,
+            secondary_conf: Option<LocationConfigSecondary>,
+        ) -> LocationConfig {
+            LocationConfig {
+                mode,
+                generation: generation.map(|g| g.into().unwrap()),
+                secondary_conf,
+                tenant_conf: config.clone(),
+                shard_number: shard.number.0,
+                shard_count: shard.count.0,
+                shard_stripe_size: shard.stripe_size.0,
+            }
+        }
+
+        tracing::info!(
+            "🔁 Switching origin pageserver {} to stale mode",
+            origin_ps_id
+        );
+
+        // FIXME: it is incorrect to use self.generation here, we should use the generation
+        // from the ObservedState of the origin pageserver (it might be older than self.generation)
+        let stale_conf = build_location_config(
+            &self.shard,
+            &self.config,
+            LocationConfigMode::AttachedStale,
+            Some(self.generation),
+            None,
+        );
+        self.location_config(origin_ps_id, stale_conf, Some(Duration::from_secs(10)))
+            .await?;
+
+        // LSNs the destination must reach before we cut over to it.
+        let baseline_lsns = self.get_lsns(self.tenant_shard_id, &origin_ps_id).await?;
+
+        // If we are migrating to a destination that has a secondary location, warm it up first
+        if let Some(destination_conf) = self.observed.locations.get(&dest_ps_id) {
+            if let Some(destination_conf) = &destination_conf.conf {
+                if destination_conf.mode == LocationConfigMode::Secondary {
+                    tracing::info!(
+                        "🔁 Downloading latest layers to destination pageserver {}",
+                        dest_ps_id,
+                    );
+                    self.secondary_download(self.tenant_shard_id, &dest_ps_id)
+                        .await;
+                }
+            }
+        }
+
+        // Increment generation before attaching to new pageserver
+        self.generation = self
+            .persistence
+            .increment_generation(self.tenant_shard_id, Some(dest_ps_id))
+            .await?;
+
+        let dest_conf = build_location_config(
+            &self.shard,
+            &self.config,
+            LocationConfigMode::AttachedMulti,
+            Some(self.generation),
+            None,
+        );
+
+        tracing::info!("🔁 Attaching to pageserver {}", dest_ps_id);
+        self.location_config(dest_ps_id, dest_conf, None).await?;
+
+        tracing::info!("🕑 Waiting for LSN to catch up...");
+        self.await_lsn(self.tenant_shard_id, &dest_ps_id, baseline_lsns)
+            .await?;
+
+        tracing::info!("🔁 Notifying compute to use pageserver {}", dest_ps_id);
+        self.compute_hook
+            .notify(self.tenant_shard_id, dest_ps_id)
+            .await?;
+
+        // Downgrade the origin to secondary.  If the tenant's policy is PlacementPolicy::Single, then
+        // this location will be deleted in the general case reconciliation that runs after this.
+        let origin_secondary_conf = build_location_config(
+            &self.shard,
+            &self.config,
+            LocationConfigMode::Secondary,
+            None,
+            Some(LocationConfigSecondary { warm: true }),
+        );
+        self.location_config(origin_ps_id, origin_secondary_conf.clone(), None)
+            .await?;
+        // TODO: we should also be setting the ObservedState on earlier API calls, in case we fail
+        // partway through.  In fact, all location conf API calls should be in a wrapper that sets
+        // the observed state to None, then runs, then sets it to what we wrote.
+        self.observed.locations.insert(
+            origin_ps_id,
+            ObservedStateLocation {
+                conf: Some(origin_secondary_conf),
+            },
+        );
+
+        tracing::info!(
+            "🔁 Switching to AttachedSingle mode on pageserver {}",
+            dest_ps_id
+        );
+        let dest_final_conf = build_location_config(
+            &self.shard,
+            &self.config,
+            LocationConfigMode::AttachedSingle,
+            Some(self.generation),
+            None,
+        );
+        self.location_config(dest_ps_id, dest_final_conf.clone(), None)
+            .await?;
+        self.observed.locations.insert(
+            dest_ps_id,
+            ObservedStateLocation {
+                conf: Some(dest_final_conf),
+            },
+        );
+
+        tracing::info!("✅ Migration complete");
+
+        Ok(())
+    }
+
+    /// Drive the pageservers towards the intended state for this shard.
+    ///
+    /// Runs the live-migration special case first, then a general pass:
+    /// 1. make sure the intended attached node carries the wanted attached configuration,
+    ///    bumping the generation and notifying compute when it has to be (re)applied;
+    /// 2. make sure each intended secondary carries the wanted secondary configuration;
+    /// 3. send a `Detached` configuration to every observed node that the intent no
+    ///    longer references.
+    ///
+    /// A failure to notify compute in step 1 is only logged; other errors abort the pass.
+    pub(crate) async fn reconcile(&mut self) -> Result<(), ReconcileError> {
+        // TODO: if any of self.observed is None, call to remote pageservers
+        // to learn correct state.
+
+        // Special case: live migration
+        self.maybe_live_migrate().await?;
+
+        // Step 1: the attached location.
+        if let Some(node_id) = self.intent.attached {
+            let mut wanted_conf =
+                attached_location_conf(self.generation, &self.shard, &self.config);
+            let already_correct = self
+                .observed
+                .locations
+                .get(&node_id)
+                .and_then(|location| location.conf.as_ref())
+                == Some(&wanted_conf);
+
+            if already_correct {
+                // Nothing to do
+                tracing::info!("Observed configuration already correct.")
+            } else {
+                // Anything but an exact match is reconciled: a different configuration,
+                // an unknown (None) observed state, or no observed entry at all.
+                self.generation = self
+                    .persistence
+                    .increment_generation(self.tenant_shard_id, Some(node_id))
+                    .await?;
+                wanted_conf.generation = self.generation.into();
+                tracing::info!("Observed configuration requires update.");
+                self.location_config(node_id, wanted_conf, None).await?;
+
+                let notified = self
+                    .compute_hook
+                    .notify(self.tenant_shard_id, node_id)
+                    .await;
+                if let Err(e) = notified {
+                    tracing::warn!(
+                        "Failed to notify compute of newly attached pageserver {node_id}: {e}"
+                    );
+                }
+            }
+        }
+
+        // Step 2: secondary locations.  If these were previously attached this
+        // implicitly downgrades them from attached to secondary.
+        let mut changes = Vec::new();
+        for node_id in &self.intent.secondary {
+            let wanted_conf = secondary_location_conf(&self.shard, &self.config);
+            let observed_conf = self
+                .observed
+                .locations
+                .get(node_id)
+                .and_then(|location| location.conf.as_ref());
+
+            if observed_conf == Some(&wanted_conf) {
+                // Nothing to do
+                tracing::info!(%node_id, "Observed configuration already correct.")
+            } else {
+                // Anything but an exact match is reconciled.
+                tracing::info!(%node_id, "Observed configuration requires update.");
+                changes.push((*node_id, wanted_conf))
+            }
+        }
+
+        // Step 3: detach any extraneous pageservers that are no longer referenced
+        // by our intent (nodes still used in any role are left alone).
+        let all_pageservers = self.intent.all_pageservers();
+        changes.extend(
+            self.observed
+                .locations
+                .keys()
+                .filter(|node_id| !all_pageservers.contains(node_id))
+                .map(|node_id| {
+                    (
+                        *node_id,
+                        LocationConfig {
+                            mode: LocationConfigMode::Detached,
+                            generation: None,
+                            secondary_conf: None,
+                            shard_number: self.shard.number.0,
+                            shard_count: self.shard.count.0,
+                            shard_stripe_size: self.shard.stripe_size.0,
+                            tenant_conf: self.config.clone(),
+                        },
+                    )
+                }),
+        );
+
+        // Apply the queued secondary/detach changes in the order they were collected.
+        for (node_id, conf) in changes {
+            self.location_config(node_id, conf, None).await?;
+        }
+
+        Ok(())
+    }
+}
+
+/// Build the [`LocationConfig`] for an `AttachedSingle` location of the given shard,
+/// carrying `generation` and a copy of the tenant `config`.
+pub(crate) fn attached_location_conf(
+    generation: Generation,
+    shard: &ShardIdentity,
+    config: &TenantConfig,
+) -> LocationConfig {
+    let shard_number = shard.number.0;
+    let shard_count = shard.count.0;
+    let shard_stripe_size = shard.stripe_size.0;
+
+    LocationConfig {
+        mode: LocationConfigMode::AttachedSingle,
+        generation: generation.into(),
+        secondary_conf: None,
+        shard_number,
+        shard_count,
+        shard_stripe_size,
+        tenant_conf: config.clone(),
+    }
+}
+
+/// Build the [`LocationConfig`] for a warm `Secondary` location of the given shard.
+/// Secondary locations carry no generation.
+pub(crate) fn secondary_location_conf(
+    shard: &ShardIdentity,
+    config: &TenantConfig,
+) -> LocationConfig {
+    let shard_number = shard.number.0;
+    let shard_count = shard.count.0;
+    let shard_stripe_size = shard.stripe_size.0;
+    let secondary_conf = Some(LocationConfigSecondary { warm: true });
+
+    LocationConfig {
+        mode: LocationConfigMode::Secondary,
+        generation: None,
+        secondary_conf,
+        shard_number,
+        shard_count,
+        shard_stripe_size,
+        tenant_conf: config.clone(),
+    }
+}
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -0,0 +1,89 @@
+use pageserver_api::shard::TenantShardId;
+use std::collections::{BTreeMap, HashMap};
+use utils::{http::error::ApiError, id::NodeId};
+
+use crate::{node::Node, tenant_state::TenantState};
+
+/// Scenarios in which we cannot find a suitable location for a tenant shard
+#[derive(thiserror::Error, Debug)]
+pub enum ScheduleError {
+    /// The scheduler has no candidate pageservers at all.
+    #[error("No pageservers found")]
+    NoPageservers,
+    /// Candidate pageservers exist, but every one of them was excluded by the
+    /// caller's constraints.
+    #[error("No pageserver found matching constraint")]
+    ImpossibleConstraint,
+}
+
+/// Scheduling failures are reported to API clients as `ApiError::Conflict`.
+impl From<ScheduleError> for ApiError {
+    fn from(value: ScheduleError) -> Self {
+        let message = format!("Scheduling error: {value}");
+        ApiError::Conflict(message)
+    }
+}
+
+/// Picks pageservers for tenant shards, preferring the node with the fewest
+/// attached shards.
+pub(crate) struct Scheduler {
+    /// Number of shards counted against each schedulable node.  Nodes that may not
+    /// be scheduled onto are absent from this map.
+    tenant_counts: HashMap<NodeId, usize>,
+}
+
+impl Scheduler {
+    /// Build a scheduler from the current tenants and nodes.
+    ///
+    /// Every known node starts at zero, each tenant's intended attached node is counted
+    /// once, and finally nodes for which `may_schedule()` is false are dropped so they
+    /// are never picked.
+    pub(crate) fn new(
+        tenants: &BTreeMap<TenantShardId, TenantState>,
+        nodes: &HashMap<NodeId, Node>,
+    ) -> Self {
+        let mut tenant_counts: HashMap<NodeId, usize> =
+            nodes.keys().map(|node_id| (*node_id, 0)).collect();
+
+        // Only attached intents contribute to the count.
+        for attached in tenants.values().filter_map(|tenant| tenant.intent.attached) {
+            *tenant_counts.entry(attached).or_insert(0) += 1;
+        }
+
+        // Drop known nodes that may not be scheduled onto; entries for nodes that are
+        // not in `nodes` are left untouched.
+        tenant_counts.retain(|node_id, _| match nodes.get(node_id) {
+            Some(node) => node.may_schedule(),
+            None => true,
+        });
+
+        Self { tenant_counts }
+    }
+
+    /// Choose a node for one more shard and record the choice in the counts.
+    ///
+    /// Nodes listed in `hard_exclude` are never chosen.  Among the rest, the node with
+    /// the lowest count wins, ties being broken by the lowest node ID.
+    ///
+    /// # Errors
+    /// * [`ScheduleError::NoPageservers`] if there are no schedulable nodes at all.
+    /// * [`ScheduleError::ImpossibleConstraint`] if every schedulable node is excluded.
+    pub(crate) fn schedule_shard(
+        &mut self,
+        hard_exclude: &[NodeId],
+    ) -> Result<NodeId, ScheduleError> {
+        if self.tenant_counts.is_empty() {
+            return Err(ScheduleError::NoPageservers);
+        }
+
+        let mut candidates: Vec<(NodeId, usize)> = self
+            .tenant_counts
+            .iter()
+            .filter(|(id, _)| !hard_exclude.contains(id))
+            .map(|(id, tenants)| (*id, *tenants))
+            .collect();
+
+        // Order by (count, node ID) so the least loaded node comes first.
+        candidates.sort_by_key(|&(id, tenants)| (tenants, id));
+
+        if candidates.is_empty() {
+            // After applying constraints, no pageservers were left
+            return Err(ScheduleError::ImpossibleConstraint);
+        }
+
+        for (node_id, count) in &candidates {
+            tracing::info!("tenant_counts[{node_id}]={count}");
+        }
+
+        let (node_id, _) = candidates[0];
+        tracing::info!("scheduler selected node {node_id}");
+        *self.tenant_counts.get_mut(&node_id).unwrap() += 1;
+        Ok(node_id)
+    }
+}
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -0,0 +1,455 @@
+use std::{collections::HashMap, sync::Arc, time::Duration};
+
+use control_plane::attachment_service::NodeAvailability;
+use pageserver_api::{
+    models::{LocationConfig, LocationConfigMode, TenantConfig},
+    shard::{ShardIdentity, TenantShardId},
+};
+use tokio::task::JoinHandle;
+use tokio_util::sync::CancellationToken;
+use utils::{
+    generation::Generation,
+    id::NodeId,
+    seqwait::{SeqWait, SeqWaitError},
+};
+
+use crate::{
+    compute_hook::ComputeHook,
+    node::Node,
+    persistence::Persistence,
+    reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
+    scheduler::{ScheduleError, Scheduler},
+    service, PlacementPolicy, Sequence,
+};
+
+/// In-memory state of one tenant shard: what we want (`policy`, `intent`), what we
+/// believe is configured on pageservers (`observed`), and the bookkeeping needed to
+/// run and wait for background reconciliation.
+pub(crate) struct TenantState {
+    pub(crate) tenant_shard_id: TenantShardId,
+
+    pub(crate) shard: ShardIdentity,
+
+    // Runtime only: sequence used to coordinate updates to this object while
+    // background reconcilers may be running.  A reconciler runs to a particular
+    // sequence.
+    pub(crate) sequence: Sequence,
+
+    // Latest generation number: next time we attach, increment this
+    // and use the incremented number when attaching
+    pub(crate) generation: Generation,
+
+    // High level description of how the tenant should be set up.  Provided
+    // externally.
+    pub(crate) policy: PlacementPolicy,
+
+    // Low level description of exactly which pageservers should fulfil
+    // which role.  Generated by `Self::schedule`.
+    pub(crate) intent: IntentState,
+
+    // Low level description of how the tenant is configured on pageservers:
+    // if this does not match `Self::intent` then the tenant needs reconciliation
+    // with `Self::reconcile`.
+    pub(crate) observed: ObservedState,
+
+    // Tenant configuration, passed through opaquely to the pageserver.  Identical
+    // for all shards in a tenant.
+    pub(crate) config: TenantConfig,
+
+    /// If a reconcile task is currently in flight, it may be joined here (it is
+    /// only safe to join if either the result has been received or the reconciler's
+    /// cancellation token has been fired)
+    pub(crate) reconciler: Option<ReconcilerHandle>,
+
+    /// Optionally wait for reconciliation to complete up to a particular
+    /// sequence number.
+    pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
+
+    /// Indicates sequence number for which we have encountered an error reconciling.  If
+    /// this advances ahead of [`Self::waiter`] then a reconciliation error has occurred,
+    /// and callers should stop waiting for `waiter` and propagate the error.
+    pub(crate) error_waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
+
+    /// The most recent error from a reconcile on this tenant
+    /// TODO: generalize to an array of recent events
+    /// TODO: use an ArcSwap instead of a mutex for faster reads?
+    pub(crate) last_error: std::sync::Arc<std::sync::Mutex<String>>,
+}
+
+/// Which pageservers should hold this shard, and in which role.
+#[derive(Default, Clone, Debug)]
+pub(crate) struct IntentState {
+    /// Node that should hold the attached location, if any.
+    pub(crate) attached: Option<NodeId>,
+    /// Nodes that should hold secondary locations.
+    pub(crate) secondary: Vec<NodeId>,
+}
+
+/// Our knowledge of this shard's locations on pageservers, keyed by node.
+/// See [`ObservedStateLocation`] for how to interpret a present or absent entry.
+#[derive(Default, Clone)]
+pub(crate) struct ObservedState {
+    pub(crate) locations: HashMap<NodeId, ObservedStateLocation>,
+}
+
+/// Our latest knowledge of how this tenant is configured in the outside world.
+///
+/// Meaning:
+///     * No instance of this type exists for a node: we are certain that we have nothing configured on that
+///       node for this shard.
+///     * Instance exists with conf==None: we *might* have some state on that node, but we don't know
+///       what it is (e.g. we failed partway through configuring it)
+///     * Instance exists with conf==Some: this tells us what we last successfully configured on this node,
+///       and that configuration will still be present unless something external interfered.
+#[derive(Clone)]
+pub(crate) struct ObservedStateLocation {
+    /// If None, it means we do not know the status of this shard's location on this node, but
+    /// we know that we might have some state on this node.  If Some, it is the configuration
+    /// we last successfully wrote to the node.
+    pub(crate) conf: Option<LocationConfig>,
+}
+/// Handle for waiting until reconciliation of one shard has either reached sequence
+/// `seq` or failed at it; see [`ReconcilerWaiter::wait_timeout`].
+pub(crate) struct ReconcilerWaiter {
+    // For observability purposes, remember the ID of the shard we're
+    // waiting for.
+    pub(crate) tenant_shard_id: TenantShardId,
+
+    // Waited on (with a timeout) for successful completion up to `seq`
+    seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
+    // Waited on for a reconcile error at `seq`
+    error_seq_wait: std::sync::Arc<SeqWait<Sequence, Sequence>>,
+    // Error text that is reported when `error_seq_wait` fires
+    error: std::sync::Arc<std::sync::Mutex<String>>,
+    // The sequence number we are waiting for
+    seq: Sequence,
+}
+
+/// Ways in which waiting on a [`ReconcilerWaiter`] can end without success.
+#[derive(thiserror::Error, Debug)]
+pub enum ReconcileWaitError {
+    /// The shard did not reach the awaited sequence within the timeout.
+    #[error("Timeout waiting for shard {0}")]
+    Timeout(TenantShardId),
+    /// The underlying wait reported shutdown.
+    #[error("shutting down")]
+    Shutdown,
+    /// Reconciliation failed; carries the shard and the recorded error text.
+    #[error("Reconcile error on shard {0}: {1}")]
+    Failed(TenantShardId, String),
+}
+
+impl ReconcilerWaiter {
+    /// Wait until reconciliation reaches `self.seq`, an error is recorded for it, or
+    /// `timeout` elapses, whichever happens first.
+    ///
+    /// # Errors
+    /// * [`ReconcileWaitError::Timeout`] if the success wait timed out.
+    /// * [`ReconcileWaitError::Shutdown`] if either wait reported shutdown.
+    /// * [`ReconcileWaitError::Failed`] (with the stored error text) if the error
+    ///   sequence reached `self.seq` first.
+    pub(crate) async fn wait_timeout(&self, timeout: Duration) -> Result<(), ReconcileWaitError> {
+        tokio::select! {
+            completed = self.seq_wait.wait_for_timeout(self.seq, timeout) => {
+                match completed {
+                    Ok(_) => Ok(()),
+                    Err(SeqWaitError::Timeout) => {
+                        Err(ReconcileWaitError::Timeout(self.tenant_shard_id))
+                    }
+                    Err(SeqWaitError::Shutdown) => Err(ReconcileWaitError::Shutdown),
+                }
+            },
+            errored = self.error_seq_wait.wait_for(self.seq) => {
+                match errored {
+                    // The error sequence caught up: report the recorded failure.
+                    Ok(_) => Err(ReconcileWaitError::Failed(
+                        self.tenant_shard_id,
+                        self.error.lock().unwrap().clone(),
+                    )),
+                    Err(SeqWaitError::Shutdown) => Err(ReconcileWaitError::Shutdown),
+                    // This wait has no timeout, so it cannot time out.
+                    Err(SeqWaitError::Timeout) => unreachable!(),
+                }
+            }
+        }
+    }
+}
+
+/// Having spawned a reconciler task, the tenant shard's state will carry enough
+/// information to optionally cancel & await it later.
+pub(crate) struct ReconcilerHandle {
+    // Sequence number associated with the spawned task
+    sequence: Sequence,
+    // Join handle of the spawned task
+    handle: JoinHandle<()>,
+    // Token used to cancel the task
+    cancel: CancellationToken,
+}
+
+/// When a reconcile task completes, it sends this result object
+/// to be applied to the primary TenantState.
+pub(crate) struct ReconcileResult {
+    /// Sequence number the reconcile task was run for
+    pub(crate) sequence: Sequence,
+    /// On errors, `observed` should be treated as an incomplete description
+    /// of state (i.e. any nodes present in the result should override nodes
+    /// present in the parent tenant state, but any unmentioned nodes should
+    /// not be removed from parent tenant state)
+    pub(crate) result: Result<(), ReconcileError>,
+
+    /// Shard this result belongs to
+    pub(crate) tenant_shard_id: TenantShardId,
+    /// Generation as of the end of the reconcile task
+    pub(crate) generation: Generation,
+    /// Observed state as of the end of the reconcile task
+    pub(crate) observed: ObservedState,
+}
+
+impl IntentState {
+    /// An empty intent: nothing attached, no secondaries.
+    pub(crate) fn new() -> Self {
+        Self::default()
+    }
+
+    /// Every node referenced by this intent: the attached node (if any) first,
+    /// followed by the secondaries in order.
+    pub(crate) fn all_pageservers(&self) -> Vec<NodeId> {
+        self.attached
+            .iter()
+            .chain(self.secondary.iter())
+            .copied()
+            .collect()
+    }
+
+    /// When a node goes offline, we update intents to avoid using it
+    /// as their attached pageserver: the node is demoted to the end of the
+    /// secondary list and the attached slot is cleared.
+    ///
+    /// Returns true if a change was made
+    pub(crate) fn notify_offline(&mut self, node_id: NodeId) -> bool {
+        if self.attached != Some(node_id) {
+            return false;
+        }
+
+        self.attached = None;
+        self.secondary.push(node_id);
+        true
+    }
+}
+
+impl ObservedState {
+    /// An observed state with no known locations.
+    pub(crate) fn new() -> Self {
+        Self::default()
+    }
+}
+
+impl TenantState {
+    /// Create the state for a shard that has not been scheduled or observed yet:
+    /// empty intent and observed state, default tenant config, generation 0, and no
+    /// reconciler in flight.
+    ///
+    /// The shard's own sequence starts at 1 while both waiters start at 0.
+    pub(crate) fn new(
+        tenant_shard_id: TenantShardId,
+        shard: ShardIdentity,
+        policy: PlacementPolicy,
+    ) -> Self {
+        let initial_sequence = Sequence(1);
+        let waiter = Arc::new(SeqWait::new(Sequence(0)));
+        let error_waiter = Arc::new(SeqWait::new(Sequence(0)));
+
+        Self {
+            tenant_shard_id,
+            shard,
+            sequence: initial_sequence,
+            generation: Generation::new(0),
+            policy,
+            intent: IntentState::new(),
+            observed: ObservedState::new(),
+            config: TenantConfig::default(),
+            reconciler: None,
+            waiter,
+            error_waiter,
+            last_error: Arc::default(),
+        }
+    }
+
+    /// Startup helper: derive my [`IntentState`] from my [`ObservedState`], regardless of
+    /// whether the result satisfies my [`PlacementPolicy`].  Follow up with
+    /// [`Self::schedule`] to obtain a policy-compliant intent.  The point is to let
+    /// scheduling reuse locations that already exist in the outside world.
+    ///
+    /// The attached intent becomes the observed attached location (`AttachedMulti`,
+    /// `AttachedSingle` or `AttachedStale`) with the highest generation, if there is one;
+    /// every other observed node is appended to the secondary intents.
+    pub(crate) fn intent_from_observed(&mut self) {
+        // Among observed attached locations, take the one with the highest generation.
+        let newest_attached = self
+            .observed
+            .locations
+            .iter()
+            .filter_map(|(node_id, location)| {
+                let conf = location.conf.as_ref()?;
+                let is_attached = matches!(
+                    conf.mode,
+                    LocationConfigMode::AttachedMulti
+                        | LocationConfigMode::AttachedSingle
+                        | LocationConfigMode::AttachedStale
+                );
+                is_attached.then_some((*node_id, conf.generation))
+            })
+            .max_by_key(|(_, generation)| *generation);
+
+        if let Some((node_id, _gen)) = newest_attached {
+            self.intent.attached = Some(node_id);
+        }
+
+        // Everything else that was observed becomes a secondary intent.  Locations with
+        // unknown (None) state are included too: they may hold usable content on disk,
+        // e.g. if we restarted in the middle of a migration or other change.
+        let attached = self.intent.attached;
+        self.intent.secondary.extend(
+            self.observed
+                .locations
+                .keys()
+                .copied()
+                .filter(|node_id| Some(*node_id) != attached),
+        );
+    }
+
+    /// Adjust `self.intent` so that it satisfies `self.policy`, asking `scheduler` for a node
+    /// whenever a location is missing.
+    ///
+    /// `self.sequence` is incremented if, and only if, the intent was changed and no scheduling
+    /// call failed.
+    ///
+    /// # Errors
+    /// Returns the [`ScheduleError`] from the scheduler as soon as one scheduling call fails;
+    /// intent changes made before the failure are kept.
+    pub(crate) fn schedule(&mut self, scheduler: &mut Scheduler) -> Result<(), ScheduleError> {
+        // TODO: before scheduling new nodes, check if any existing content in
+        // self.intent refers to pageservers that are offline, and pick other
+        // pageservers if so.
+
+        // Pageservers this tenant already uses: handed to the scheduler on every call so that
+        // it can avoid putting more work on them.
+        let mut used_pageservers = self.intent.all_pageservers();
+        let mut modified = false;
+
+        // Both placement policies want exactly one attached location.
+        if self.intent.attached.is_none() {
+            let node_id = scheduler.schedule_shard(&used_pageservers)?;
+            self.intent.attached = Some(node_id);
+            used_pageservers.push(node_id);
+            modified = true;
+        }
+
+        match self.policy {
+            PlacementPolicy::Single => {
+                // Zero secondaries: drop whatever we have
+                if !self.intent.secondary.is_empty() {
+                    self.intent.secondary.clear();
+                    modified = true;
+                }
+            }
+            PlacementPolicy::Double(secondary_count) => {
+                // Top up to N secondaries
+                while self.intent.secondary.len() < secondary_count {
+                    let node_id = scheduler.schedule_shard(&used_pageservers)?;
+                    self.intent.secondary.push(node_id);
+                    used_pageservers.push(node_id);
+                    modified = true;
+                }
+            }
+        }
+
+        if modified {
+            self.sequence.0 += 1;
+        }
+
+        Ok(())
+    }
+
+    /// Returns true if any intended location (attached or secondary) is not currently observed
+    /// with exactly the configuration we want for it.  A location that is missing from the
+    /// observed state, or whose observed config is `None`, counts as dirty.
+    fn dirty(&self) -> bool {
+        if let Some(node_id) = self.intent.attached {
+            let wanted_conf = attached_location_conf(self.generation, &self.shard, &self.config);
+            let observed_conf = self
+                .observed
+                .locations
+                .get(&node_id)
+                .and_then(|location| location.conf.as_ref());
+            if observed_conf != Some(&wanted_conf) {
+                return true;
+            }
+        }
+
+        self.intent.secondary.iter().any(|node_id| {
+            let wanted_conf = secondary_location_conf(&self.shard, &self.config);
+            let observed_conf = self
+                .observed
+                .locations
+                .get(node_id)
+                .and_then(|location| location.conf.as_ref());
+            observed_conf != Some(&wanted_conf)
+        })
+    }
+
+    /// Spawn a [`Reconciler`] task if this shard's intent and observed state disagree, and
+    /// return a [`ReconcilerWaiter`] for the current sequence.
+    ///
+    /// Returns `None` when nothing needs reconciling.  If a reconciler for the current sequence
+    /// is already in flight, no new task is spawned and a waiter for that sequence is returned.
+    /// Otherwise a new task is spawned: it first cancels and joins any reconciler left over from
+    /// an older sequence, then runs `Reconciler::reconcile` and sends the outcome on `result_tx`.
+    ///
+    /// # Panics
+    /// Panics if an observed location refers to a node id that is missing from `pageservers`.
+    pub(crate) fn maybe_reconcile(
+        &mut self,
+        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
+        pageservers: &Arc<HashMap<NodeId, Node>>,
+        compute_hook: &Arc<ComputeHook>,
+        service_config: &service::Config,
+        persistence: &Arc<Persistence>,
+    ) -> Option<ReconcilerWaiter> {
+        // If there are any ambiguous observed states, and the nodes they refer to are available,
+        // we should reconcile to clean them up.
+        let mut dirty_observed = false;
+        for (node_id, observed_loc) in &self.observed.locations {
+            let node = pageservers
+                .get(node_id)
+                .expect("Nodes may not be removed while referenced");
+            if observed_loc.conf.is_none()
+                && !matches!(node.availability, NodeAvailability::Offline)
+            {
+                dirty_observed = true;
+                break;
+            }
+        }
+
+        if !self.dirty() && !dirty_observed {
+            tracing::info!("Not dirty, no reconciliation needed.");
+            return None;
+        }
+
+        // Reconcile already in flight for the current sequence?
+        if let Some(handle) = &self.reconciler {
+            if handle.sequence == self.sequence {
+                return Some(ReconcilerWaiter {
+                    tenant_shard_id: self.tenant_shard_id,
+                    seq_wait: self.waiter.clone(),
+                    error_seq_wait: self.error_waiter.clone(),
+                    error: self.last_error.clone(),
+                    seq: self.sequence,
+                });
+            }
+        }
+
+        // Reconcile in flight for a stale sequence?  Our sequence's task will wait for it before
+        // doing our sequence's work.
+        let old_handle = self.reconciler.take();
+
+        // Fresh token for the new task: one clone goes into the Reconciler, the original is
+        // stored in the ReconcilerHandle below so that a later call can cancel this task.
+        let cancel = CancellationToken::new();
+        // The Reconciler works on clones of the current intent, config and observed state.
+        let mut reconciler = Reconciler {
+            tenant_shard_id: self.tenant_shard_id,
+            shard: self.shard,
+            generation: self.generation,
+            intent: self.intent.clone(),
+            config: self.config.clone(),
+            observed: self.observed.clone(),
+            pageservers: pageservers.clone(),
+            compute_hook: compute_hook.clone(),
+            service_config: service_config.clone(),
+            cancel: cancel.clone(),
+            persistence: persistence.clone(),
+        };
+
+        // Copied out so the spawned task can report which sequence its result belongs to.
+        let reconcile_seq = self.sequence;
+
+        tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
+        let join_handle = tokio::task::spawn(async move {
+            // Wait for any previous reconcile task to complete before we start
+            if let Some(old_handle) = old_handle {
+                old_handle.cancel.cancel();
+                if let Err(e) = old_handle.handle.await {
+                    // We can't do much with this other than log it: the task is done, so
+                    // we may proceed with our work.
+                    tracing::error!("Unexpected join error waiting for reconcile task: {e}");
+                }
+            }
+
+            // Early check for cancellation before doing any work
+            // TODO: wrap all remote API operations in cancellation check
+            // as well.
+            if reconciler.cancel.is_cancelled() {
+                // NOTE: returning here means no ReconcileResult is sent for this sequence.
+                return;
+            }
+
+            let result = reconciler.reconcile().await;
+            // The send result is discarded by `.ok()`: a failed send is deliberately ignored.
+            result_tx
+                .send(ReconcileResult {
+                    sequence: reconcile_seq,
+                    result,
+                    tenant_shard_id: reconciler.tenant_shard_id,
+                    generation: reconciler.generation,
+                    observed: reconciler.observed,
+                })
+                .ok();
+        });
+
+        // Remember the task so the next call can detect it (same sequence) or cancel it (stale).
+        self.reconciler = Some(ReconcilerHandle {
+            sequence: self.sequence,
+            handle: join_handle,
+            cancel,
+        });
+
+        Some(ReconcilerWaiter {
+            tenant_shard_id: self.tenant_shard_id,
+            seq_wait: self.waiter.clone(),
+            error_seq_wait: self.error_waiter.clone(),
+            error: self.last_error.clone(),
+            seq: self.sequence,
+        })
+    }
+}
--- a/control_plane/src/attachment_service.rs
+++ b/control_plane/src/attachment_service.rs
@@ -1,14 +1,27 @@
 use crate::{background_process, local_env::LocalEnv};
-use anyhow::anyhow;
 use camino::Utf8PathBuf;
-use serde::{Deserialize, Serialize};
-use std::{path::PathBuf, process::Child};
-use utils::id::{NodeId, TenantId};
+use hyper::Method;
+use pageserver_api::{
+    models::{ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo},
+    shard::TenantShardId,
+};
+use pageserver_client::mgmt_api::ResponseErrorMessageExt;
+use postgres_backend::AuthType;
+use postgres_connection::parse_host_port;
+use serde::{de::DeserializeOwned, Deserialize, Serialize};
+use std::{path::PathBuf, process::Child, str::FromStr};
+use tracing::instrument;
+use utils::{
+    auth::{Claims, Scope},
+    id::{NodeId, TenantId},
+};

 pub struct AttachmentService {
    env: LocalEnv,
    listen: String,
    path: PathBuf,
+    jwt_token: Option<String>,
+    public_key_path: Option<Utf8PathBuf>,
    client: reqwest::Client,
 }

@@ -16,7 +29,7 @@ const COMMAND: &str = "attachment_service";

 #[derive(Serialize, Deserialize)]
 pub struct AttachHookRequest {
-    pub tenant_id: TenantId,
+    pub tenant_shard_id: TenantShardId,
    pub node_id: Option<NodeId>,
 }

@@ -27,7 +40,7 @@ pub struct AttachHookResponse {

 #[derive(Serialize, Deserialize)]
 pub struct InspectRequest {
-    pub tenant_id: TenantId,
+    pub tenant_shard_id: TenantShardId,
 }

 #[derive(Serialize, Deserialize)]
@@ -35,6 +48,125 @@ pub struct InspectResponse {
    pub attachment: Option<(u32, NodeId)>,
 }

+/// One element of [`TenantCreateResponse::shards`]: a node id paired with a generation number.
+// NOTE(review): presumably the node the shard was attached to and the generation it was
+// attached with -- confirm against the service implementation.
+#[derive(Serialize, Deserialize)]
+pub struct TenantCreateResponseShard {
+    pub node_id: NodeId,
+    pub generation: u32,
+}
+
+/// Response type of [`AttachmentService::tenant_create`] (`POST tenant`).
+#[derive(Serialize, Deserialize)]
+pub struct TenantCreateResponse {
+    pub shards: Vec<TenantCreateResponseShard>,
+}
+
+/// Request body of [`AttachmentService::node_register`] (`POST node`).
+///
+/// [`AttachmentService::start`] sends one of these per configured pageserver, splitting the
+/// pageserver's `listen_pg_addr` / `listen_http_addr` settings into host and port.
+#[derive(Serialize, Deserialize)]
+pub struct NodeRegisterRequest {
+    pub node_id: NodeId,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+}
+
+/// Request body of [`AttachmentService::node_configure`] (`PUT node/{node_id}/config`).
+///
+/// Both settings are optional.
+// NOTE(review): presumably a `None` field leaves that setting unchanged -- confirm.
+#[derive(Serialize, Deserialize)]
+pub struct NodeConfigureRequest {
+    pub node_id: NodeId,
+
+    pub availability: Option<NodeAvailability>,
+    pub scheduling: Option<NodeSchedulingPolicy>,
+}
+
+/// One element of [`TenantLocateResponse::shards`]: a shard id together with a node id and
+/// that node's postgres and HTTP listen addresses.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantLocateResponseShard {
+    pub shard_id: TenantShardId,
+    pub node_id: NodeId,
+
+    pub listen_pg_addr: String,
+    pub listen_pg_port: u16,
+
+    pub listen_http_addr: String,
+    pub listen_http_port: u16,
+}
+
+/// Response type of [`AttachmentService::tenant_locate`] (`GET tenant/{tenant_id}/locate`).
+#[derive(Serialize, Deserialize)]
+pub struct TenantLocateResponse {
+    pub shards: Vec<TenantLocateResponseShard>,
+    pub shard_params: ShardParameters,
+}
+
+/// Request body of [`AttachmentService::tenant_migrate`]
+/// (`PUT tenant/{tenant_shard_id}/migrate`).
+///
+/// Explicitly migrating a particular shard is a low level operation.
+///
+/// TODO: higher level "Reschedule tenant" operation where the request
+/// specifies some constraints, e.g. asking it to get off particular node(s)
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantShardMigrateRequest {
+    pub tenant_shard_id: TenantShardId,
+    pub node_id: NodeId,
+}
+
+/// Availability state of a node, as carried in [`NodeConfigureRequest::availability`].
+///
+/// The `FromStr` impl accepts the lowercase names `"active"` and `"offline"`.
+#[derive(Serialize, Deserialize, Clone, Copy)]
+pub enum NodeAvailability {
+    /// Normal, happy state
+    Active,
+    /// Offline: Tenants shouldn't try to attach here, but they may assume that their
+    /// secondary locations on this node still exist.  Newly added nodes are in this
+    /// state until we successfully contact them.
+    Offline,
+}
+
+/// Parses the lowercase state names `"active"` and `"offline"`; the match is exact, so any
+/// other input (including different capitalisation) is an error.
+impl FromStr for NodeAvailability {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let availability = match s {
+            "active" => Self::Active,
+            "offline" => Self::Offline,
+            _ => return Err(anyhow::anyhow!("Unknown availability state '{s}'")),
+        };
+        Ok(availability)
+    }
+}
+
+/// Scheduling policy of a node, as carried in [`NodeConfigureRequest::scheduling`].
+///
+/// The `FromStr` and `From<NodeSchedulingPolicy> for String` impls use the lowercase names
+/// `"active"`, `"filling"`, `"pause"` and `"draining"`.
+///
+/// FIXME: this is a duplicate of the type in the attachment_service crate, because the
+/// type needs to be defined with diesel traits in there.
+#[derive(Serialize, Deserialize, Clone, Copy)]
+pub enum NodeSchedulingPolicy {
+    Active,
+    Filling,
+    Pause,
+    Draining,
+}
+
+/// Parses the lowercase policy names `"active"`, `"filling"`, `"pause"` and `"draining"`;
+/// the match is exact, so any other input is an error.
+impl FromStr for NodeSchedulingPolicy {
+    type Err = anyhow::Error;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        let policy = match s {
+            "active" => Self::Active,
+            "filling" => Self::Filling,
+            "pause" => Self::Pause,
+            "draining" => Self::Draining,
+            _ => return Err(anyhow::anyhow!("Unknown scheduling state '{s}'")),
+        };
+        Ok(policy)
+    }
+}
+
+/// Renders a policy as the same lowercase name that the `FromStr` impl accepts.
+impl From<NodeSchedulingPolicy> for String {
+    fn from(value: NodeSchedulingPolicy) -> String {
+        let name: &'static str = match value {
+            NodeSchedulingPolicy::Active => "active",
+            NodeSchedulingPolicy::Filling => "filling",
+            NodeSchedulingPolicy::Pause => "pause",
+            NodeSchedulingPolicy::Draining => "draining",
+        };
+        name.to_owned()
+    }
+}
+
+/// Response type of [`AttachmentService::tenant_migrate`]; it carries no fields.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantShardMigrateResponse {}
+
 impl AttachmentService {
    pub fn from_env(env: &LocalEnv) -> Self {
        let path = env.base_data_dir.join("attachments.json");
@@ -49,10 +181,34 @@ impl AttachmentService {
            listen_url.port().unwrap()
        );

+        // Assume all pageservers have symmetric auth configuration: this service
+        // expects to use one JWT token to talk to all of them.
+        let ps_conf = env
+            .pageservers
+            .first()
+            .expect("Config is validated to contain at least one pageserver");
+        let (jwt_token, public_key_path) = match ps_conf.http_auth_type {
+            AuthType::Trust => (None, None),
+            AuthType::NeonJWT => {
+                let jwt_token = env
+                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
+                    .unwrap();
+
+                // If pageserver auth is enabled, this implicitly enables auth for this service,
+                // using the same credentials.
+                let public_key_path =
+                    camino::Utf8PathBuf::try_from(env.base_data_dir.join("auth_public_key.pem"))
+                        .unwrap();
+                (Some(jwt_token), Some(public_key_path))
+            }
+        };
+
        Self {
            env: env.clone(),
            path,
            listen,
+            jwt_token,
+            public_key_path,
            client: reqwest::ClientBuilder::new()
                .build()
                .expect("Failed to construct http client"),
@@ -67,72 +223,199 @@ impl AttachmentService {
    pub async fn start(&self) -> anyhow::Result<Child> {
        let path_str = self.path.to_string_lossy();

-        background_process::start_process(
+        let mut args = vec!["-l", &self.listen, "-p", &path_str]
+            .into_iter()
+            .map(|s| s.to_string())
+            .collect::<Vec<_>>();
+        if let Some(jwt_token) = &self.jwt_token {
+            args.push(format!("--jwt-token={jwt_token}"));
+        }
+
+        if let Some(public_key_path) = &self.public_key_path {
+            args.push(format!("--public-key={public_key_path}"));
+        }
+
+        let result = background_process::start_process(
            COMMAND,
            &self.env.base_data_dir,
            &self.env.attachment_service_bin(),
-            ["-l", &self.listen, "-p", &path_str],
-            [],
+            args,
+            [(
+                "NEON_REPO_DIR".to_string(),
+                self.env.base_data_dir.to_string_lossy().to_string(),
+            )],
            background_process::InitialPidFile::Create(self.pid_file()),
-            // TODO: a real status check
-            || async move { anyhow::Ok(true) },
+            || async {
+                match self.status().await {
+                    Ok(_) => Ok(true),
+                    Err(_) => Ok(false),
+                }
+            },
        )
-        .await
+        .await;
+
+        for ps_conf in &self.env.pageservers {
+            let (pg_host, pg_port) =
+                parse_host_port(&ps_conf.listen_pg_addr).expect("Unable to parse listen_pg_addr");
+            let (http_host, http_port) = parse_host_port(&ps_conf.listen_http_addr)
+                .expect("Unable to parse listen_http_addr");
+            self.node_register(NodeRegisterRequest {
+                node_id: ps_conf.id,
+                listen_pg_addr: pg_host.to_string(),
+                listen_pg_port: pg_port.unwrap_or(5432),
+                listen_http_addr: http_host.to_string(),
+                listen_http_port: http_port.unwrap_or(80),
+            })
+            .await?;
+        }
+
+        result
    }

    pub fn stop(&self, immediate: bool) -> anyhow::Result<()> {
        background_process::stop_process(immediate, COMMAND, &self.pid_file())
    }
-
-    /// Call into the attach_hook API, for use before handing out attachments to pageservers
-    pub async fn attach_hook(
+    /// Simple HTTP request wrapper for calling into attachment service
+    ///
+    /// `path` is joined onto `env.control_plane_api` to form the request URL.  `body`, when
+    /// present, is sent as JSON.  When a JWT token is configured it is sent in a `Bearer`
+    /// authorization header.  The response body is decoded from JSON into `RS`.
+    ///
+    /// # Panics
+    /// Panics if `control_plane_api` is unset, or if `path` cannot be joined onto it.
+    async fn dispatch<RQ, RS>(
        &self,
-        tenant_id: TenantId,
-        pageserver_id: NodeId,
-    ) -> anyhow::Result<Option<u32>> {
-        use hyper::StatusCode;
-
+        method: hyper::Method,
+        path: String,
+        body: Option<RQ>,
+    ) -> anyhow::Result<RS>
+    where
+        RQ: Serialize + Sized,
+        RS: DeserializeOwned + Sized,
+    {
        let url = self
            .env
            .control_plane_api
            .clone()
            .unwrap()
-            .join("attach-hook")
+            .join(&path)
            .unwrap();

+        let mut builder = self.client.request(method, url);
+        if let Some(body) = body {
+            builder = builder.json(&body)
+        }
+        if let Some(jwt_token) = &self.jwt_token {
+            builder = builder.header(
+                reqwest::header::AUTHORIZATION,
+                format!("Bearer {jwt_token}"),
+            );
+        }
+
+        let response = builder.send().await?;
+        // NOTE(review): presumably turns a non-success status into an error built from the
+        // response body -- confirm in pageserver_client.
+        let response = response.error_from_body().await?;
+
+        Ok(response
+            .json()
+            .await
+            .map_err(pageserver_client::mgmt_api::Error::ReceiveBody)?)
+    }
+
+    /// Call into the attach_hook API, for use before handing out attachments to pageservers
+    ///
+    /// Sends `POST attach-hook` with `node_id` set to `Some(pageserver_id)` and returns the
+    /// `gen` field of the response.
+    #[instrument(skip(self))]
+    pub async fn attach_hook(
+        &self,
+        tenant_shard_id: TenantShardId,
+        pageserver_id: NodeId,
+    ) -> anyhow::Result<Option<u32>> {
        let request = AttachHookRequest {
-            tenant_id,
+            tenant_shard_id,
            node_id: Some(pageserver_id),
        };

-        let response = self.client.post(url).json(&request).send().await?;
-        if response.status() != StatusCode::OK {
-            return Err(anyhow!("Unexpected status {}", response.status()));
-        }
+        let response = self
+            .dispatch::<_, AttachHookResponse>(
+                Method::POST,
+                "attach-hook".to_string(),
+                Some(request),
+            )
+            .await?;

-        let response = response.json::<AttachHookResponse>().await?;
        Ok(response.gen)
    }

-    pub async fn inspect(&self, tenant_id: TenantId) -> anyhow::Result<Option<(u32, NodeId)>> {
-        use hyper::StatusCode;
+    /// Sends `POST inspect` for `tenant_shard_id` and returns the `attachment` field of the
+    /// response: an optional `(u32, NodeId)` pair.
+    // NOTE(review): the u32 is presumably the shard's generation -- confirm.
+    #[instrument(skip(self))]
+    pub async fn inspect(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> anyhow::Result<Option<(u32, NodeId)>> {
+        let request = InspectRequest { tenant_shard_id };

-        let url = self
-            .env
-            .control_plane_api
-            .clone()
-            .unwrap()
-            .join("inspect")
-            .unwrap();
+        let response = self
+            .dispatch::<_, InspectResponse>(Method::POST, "inspect".to_string(), Some(request))
+            .await?;

-        let request = InspectRequest { tenant_id };
-
-        let response = self.client.post(url).json(&request).send().await?;
-        if response.status() != StatusCode::OK {
-            return Err(anyhow!("Unexpected status {}", response.status()));
-        }
-
-        let response = response.json::<InspectResponse>().await?;
        Ok(response.attachment)
    }
+
+    /// Sends `req` as the JSON body of `POST tenant` and returns the decoded response.
+    #[instrument(skip(self))]
+    pub async fn tenant_create(
+        &self,
+        req: TenantCreateRequest,
+    ) -> anyhow::Result<TenantCreateResponse> {
+        let path = String::from("tenant");
+        self.dispatch(Method::POST, path, Some(req)).await
+    }
+
+    /// Issues `GET tenant/{tenant_id}/locate` (no request body) and returns the decoded response.
+    #[instrument(skip(self))]
+    pub async fn tenant_locate(&self, tenant_id: TenantId) -> anyhow::Result<TenantLocateResponse> {
+        let path = format!("tenant/{tenant_id}/locate");
+        let no_body: Option<()> = None;
+        self.dispatch(Method::GET, path, no_body).await
+    }
+
+    /// Issues `PUT tenant/{tenant_shard_id}/migrate` with a [`TenantShardMigrateRequest`] naming
+    /// the shard and `node_id`, and returns the decoded response.
+    #[instrument(skip(self))]
+    pub async fn tenant_migrate(
+        &self,
+        tenant_shard_id: TenantShardId,
+        node_id: NodeId,
+    ) -> anyhow::Result<TenantShardMigrateResponse> {
+        let path = format!("tenant/{tenant_shard_id}/migrate");
+        let request = TenantShardMigrateRequest {
+            tenant_shard_id,
+            node_id,
+        };
+        self.dispatch(Method::PUT, path, Some(request)).await
+    }
+
+    /// Sends `req` as the JSON body of `POST node`.
+    #[instrument(skip_all, fields(node_id=%req.node_id))]
+    pub async fn node_register(&self, req: NodeRegisterRequest) -> anyhow::Result<()> {
+        let path = String::from("node");
+        self.dispatch::<_, ()>(Method::POST, path, Some(req)).await
+    }
+
+    /// Sends `req` as the JSON body of `PUT node/{node_id}/config`, where the node id in the
+    /// path is taken from `req.node_id`.
+    #[instrument(skip_all, fields(node_id=%req.node_id))]
+    pub async fn node_configure(&self, req: NodeConfigureRequest) -> anyhow::Result<()> {
+        // Build the path before `req` is moved into the request body.
+        let path = format!("node/{}/config", req.node_id);
+        self.dispatch::<_, ()>(Method::PUT, path, Some(req)).await
+    }
+
+    /// Issues `GET status` with no request body; succeeds only if the request and the decoding
+    /// of its response succeed.  Used by [`Self::start`] as the readiness check.
+    #[instrument(skip(self))]
+    pub async fn status(&self) -> anyhow::Result<()> {
+        let path = String::from("status");
+        self.dispatch::<(), ()>(Method::GET, path, None).await
+    }
+
+    /// Sends `req` as the JSON body of `POST tenant/{tenant_id}/timeline` and returns the
+    /// decoded [`TimelineInfo`].
+    #[instrument(skip_all, fields(%tenant_id, timeline_id=%req.new_timeline_id))]
+    pub async fn tenant_timeline_create(
+        &self,
+        tenant_id: TenantId,
+        req: TimelineCreateRequest,
+    ) -> anyhow::Result<TimelineInfo> {
+        let path = format!("tenant/{tenant_id}/timeline");
+        self.dispatch(Method::POST, path, Some(req)).await
+    }
 }
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -1,337 +0,0 @@
-/// The attachment service mimics the aspects of the control plane API
-/// that are required for a pageserver to operate.
-///
-/// This enables running & testing pageservers without a full-blown
-/// deployment of the Neon cloud platform.
-///
-use anyhow::anyhow;
-use clap::Parser;
-use hex::FromHex;
-use hyper::StatusCode;
-use hyper::{Body, Request, Response};
-use pageserver_api::shard::TenantShardId;
-use serde::{Deserialize, Serialize};
-use std::path::{Path, PathBuf};
-use std::{collections::HashMap, sync::Arc};
-use utils::http::endpoint::request_span;
-use utils::logging::{self, LogFormat};
-use utils::signals::{ShutdownSignals, Signal};
-
-use utils::{
-    http::{
-        endpoint::{self},
-        error::ApiError,
-        json::{json_request, json_response},
-        RequestExt, RouterBuilder,
-    },
-    id::{NodeId, TenantId},
-    tcp_listener,
-};
-
-use pageserver_api::control_api::{
-    ReAttachRequest, ReAttachResponse, ReAttachResponseTenant, ValidateRequest, ValidateResponse,
-    ValidateResponseTenant,
-};
-
-use control_plane::attachment_service::{
-    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
-};
-
-#[derive(Parser)]
-#[command(author, version, about, long_about = None)]
-#[command(arg_required_else_help(true))]
-struct Cli {
-    /// Host and port to listen on, like `127.0.0.1:1234`
-    #[arg(short, long)]
-    listen: std::net::SocketAddr,
-
-    /// Path to the .json file to store state (will be created if it doesn't exist)
-    #[arg(short, long)]
-    path: PathBuf,
-}
-
-// The persistent state of each Tenant
-#[derive(Serialize, Deserialize, Clone)]
-struct TenantState {
-    // Currently attached pageserver
-    pageserver: Option<NodeId>,
-
-    // Latest generation number: next time we attach, increment this
-    // and use the incremented number when attaching
-    generation: u32,
-}
-
-fn to_hex_map<S, V>(input: &HashMap<TenantId, V>, serializer: S) -> Result<S::Ok, S::Error>
-where
-    S: serde::Serializer,
-    V: Clone + Serialize,
-{
-    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v.clone()));
-
-    transformed
-        .collect::<HashMap<String, V>>()
-        .serialize(serializer)
-}
-
-fn from_hex_map<'de, D, V>(deserializer: D) -> Result<HashMap<TenantId, V>, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-    V: Deserialize<'de>,
-{
-    let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
-    hex_map
-        .into_iter()
-        .map(|(k, v)| {
-            TenantId::from_hex(k)
-                .map(|k| (k, v))
-                .map_err(serde::de::Error::custom)
-        })
-        .collect()
-}
-
-// Top level state available to all HTTP handlers
-#[derive(Serialize, Deserialize)]
-struct PersistentState {
-    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
-    tenants: HashMap<TenantId, TenantState>,
-
-    #[serde(skip)]
-    path: PathBuf,
-}
-
-impl PersistentState {
-    async fn save(&self) -> anyhow::Result<()> {
-        let bytes = serde_json::to_vec(self)?;
-        tokio::fs::write(&self.path, &bytes).await?;
-
-        Ok(())
-    }
-
-    async fn load(path: &Path) -> anyhow::Result<Self> {
-        let bytes = tokio::fs::read(path).await?;
-        let mut decoded = serde_json::from_slice::<Self>(&bytes)?;
-        decoded.path = path.to_owned();
-        Ok(decoded)
-    }
-
-    async fn load_or_new(path: &Path) -> Self {
-        match Self::load(path).await {
-            Ok(s) => {
-                tracing::info!("Loaded state file at {}", path.display());
-                s
-            }
-            Err(e)
-                if e.downcast_ref::<std::io::Error>()
-                    .map(|e| e.kind() == std::io::ErrorKind::NotFound)
-                    .unwrap_or(false) =>
-            {
-                tracing::info!("Will create state file at {}", path.display());
-                Self {
-                    tenants: HashMap::new(),
-                    path: path.to_owned(),
-                }
-            }
-            Err(e) => {
-                panic!("Failed to load state from '{}': {e:#} (maybe your .neon/ dir was written by an older version?)", path.display())
-            }
-        }
-    }
-}
-
-/// State available to HTTP request handlers
-#[derive(Clone)]
-struct State {
-    inner: Arc<tokio::sync::RwLock<PersistentState>>,
-}
-
-impl State {
-    fn new(persistent_state: PersistentState) -> State {
-        Self {
-            inner: Arc::new(tokio::sync::RwLock::new(persistent_state)),
-        }
-    }
-}
-
-#[inline(always)]
-fn get_state(request: &Request<Body>) -> &State {
-    request
-        .data::<Arc<State>>()
-        .expect("unknown state type")
-        .as_ref()
-}
-
-/// Pageserver calls into this on startup, to learn which tenants it should attach
-async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let reattach_req = json_request::<ReAttachRequest>(&mut req).await?;
-
-    let state = get_state(&req).inner.clone();
-    let mut locked = state.write().await;
-
-    let mut response = ReAttachResponse {
-        tenants: Vec::new(),
-    };
-    for (t, state) in &mut locked.tenants {
-        if state.pageserver == Some(reattach_req.node_id) {
-            state.generation += 1;
-            response.tenants.push(ReAttachResponseTenant {
-                // TODO(sharding): make this shard-aware
-                id: TenantShardId::unsharded(*t),
-                gen: state.generation,
-            });
-        }
-    }
-
-    locked.save().await.map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, response)
-}
-
-/// Pageserver calls into this before doing deletions, to confirm that it still
-/// holds the latest generation for the tenants with deletions enqueued
-async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let validate_req = json_request::<ValidateRequest>(&mut req).await?;
-
-    let locked = get_state(&req).inner.read().await;
-
-    let mut response = ValidateResponse {
-        tenants: Vec::new(),
-    };
-
-    for req_tenant in validate_req.tenants {
-        // TODO(sharding): make this shard-aware
-        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
-            let valid = tenant_state.generation == req_tenant.gen;
-            tracing::info!(
-                "handle_validate: {}(gen {}): valid={valid} (latest {})",
-                req_tenant.id,
-                req_tenant.gen,
-                tenant_state.generation
-            );
-            response.tenants.push(ValidateResponseTenant {
-                id: req_tenant.id,
-                valid,
-            });
-        }
-    }
-
-    json_response(StatusCode::OK, response)
-}
-/// Call into this before attaching a tenant to a pageserver, to acquire a generation number
-/// (in the real control plane this is unnecessary, because the same program is managing
-///  generation numbers and doing attachments).
-async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let attach_req = json_request::<AttachHookRequest>(&mut req).await?;
-
-    let state = get_state(&req).inner.clone();
-    let mut locked = state.write().await;
-
-    let tenant_state = locked
-        .tenants
-        .entry(attach_req.tenant_id)
-        .or_insert_with(|| TenantState {
-            pageserver: attach_req.node_id,
-            generation: 0,
-        });
-
-    if let Some(attaching_pageserver) = attach_req.node_id.as_ref() {
-        tenant_state.generation += 1;
-        tracing::info!(
-            tenant_id = %attach_req.tenant_id,
-            ps_id = %attaching_pageserver,
-            generation = %tenant_state.generation,
-            "issuing",
-        );
-    } else if let Some(ps_id) = tenant_state.pageserver {
-        tracing::info!(
-            tenant_id = %attach_req.tenant_id,
-            %ps_id,
-            generation = %tenant_state.generation,
-            "dropping",
-        );
-    } else {
-        tracing::info!(
-            tenant_id = %attach_req.tenant_id,
-            "no-op: tenant already has no pageserver");
-    }
-    tenant_state.pageserver = attach_req.node_id;
-    let generation = tenant_state.generation;
-
-    tracing::info!(
-        "handle_attach_hook: tenant {} set generation {}, pageserver {}",
-        attach_req.tenant_id,
-        tenant_state.generation,
-        attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
-    );
-
-    locked.save().await.map_err(ApiError::InternalServerError)?;
-
-    json_response(
-        StatusCode::OK,
-        AttachHookResponse {
-            gen: attach_req.node_id.map(|_| generation),
-        },
-    )
-}
-
-async fn handle_inspect(mut req: Request<Body>) -> Result<Response<Body>, ApiError> {
-    let inspect_req = json_request::<InspectRequest>(&mut req).await?;
-
-    let state = get_state(&req).inner.clone();
-    let locked = state.write().await;
-    let tenant_state = locked.tenants.get(&inspect_req.tenant_id);
-
-    json_response(
-        StatusCode::OK,
-        InspectResponse {
-            attachment: tenant_state.and_then(|s| s.pageserver.map(|ps| (s.generation, ps))),
-        },
-    )
-}
-
-fn make_router(persistent_state: PersistentState) -> RouterBuilder<hyper::Body, ApiError> {
-    endpoint::make_router()
-        .data(Arc::new(State::new(persistent_state)))
-        .post("/re-attach", |r| request_span(r, handle_re_attach))
-        .post("/validate", |r| request_span(r, handle_validate))
-        .post("/attach-hook", |r| request_span(r, handle_attach_hook))
-        .post("/inspect", |r| request_span(r, handle_inspect))
-}
-
-#[tokio::main]
-async fn main() -> anyhow::Result<()> {
-    logging::init(
-        LogFormat::Plain,
-        logging::TracingErrorLayerEnablement::Disabled,
-        logging::Output::Stdout,
-    )?;
-
-    let args = Cli::parse();
-    tracing::info!(
-        "Starting, state at {}, listening on {}",
-        args.path.to_string_lossy(),
-        args.listen
-    );
-
-    let persistent_state = PersistentState::load_or_new(&args.path).await;
-
-    let http_listener = tcp_listener::bind(args.listen)?;
-    let router = make_router(persistent_state)
-        .build()
-        .map_err(|err| anyhow!(err))?;
-    let service = utils::http::RouterService::new(router).unwrap();
-    let server = hyper::Server::from_tcp(http_listener)?.serve(service);
-
-    tracing::info!("Serving on {0}", args.listen);
-
-    tokio::task::spawn(server);
-
-    ShutdownSignals::handle(|signal| match signal {
-        Signal::Interrupt | Signal::Terminate | Signal::Quit => {
-            tracing::info!("Got {}. Terminating", signal.name());
-            // We're just a test helper: no graceful shutdown.
-            std::process::exit(0);
-        }
-    })?;
-
-    Ok(())
-}
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -6,21 +6,26 @@
 //! rely on `neon_local` to set up the environment for each test.
 //!
 use anyhow::{anyhow, bail, Context, Result};
-use clap::{value_parser, Arg, ArgAction, ArgMatches, Command};
+use clap::{value_parser, Arg, ArgAction, ArgMatches, Command, ValueEnum};
 use compute_api::spec::ComputeMode;
-use control_plane::attachment_service::AttachmentService;
+use control_plane::attachment_service::{
+    AttachmentService, NodeAvailability, NodeConfigureRequest, NodeSchedulingPolicy,
+};
 use control_plane::endpoint::ComputeControlPlane;
-use control_plane::local_env::LocalEnv;
+use control_plane::local_env::{InitForceMode, LocalEnv};
 use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
-use control_plane::tenant_migration::migrate_tenant;
 use control_plane::{broker, local_env};
-use pageserver_api::models::TimelineInfo;
+use pageserver_api::models::{
+    ShardParameters, TenantCreateRequest, TimelineCreateRequest, TimelineInfo,
+};
+use pageserver_api::shard::{ShardCount, ShardStripeSize, TenantShardId};
 use pageserver_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
 };
 use postgres_backend::AuthType;
+use postgres_connection::parse_host_port;
 use safekeeper_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_SAFEKEEPER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
@@ -30,6 +35,7 @@ use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
 use storage_broker::DEFAULT_LISTEN_ADDR as DEFAULT_BROKER_ADDR;
+use url::Host;
 use utils::{
    auth::{Claims, Scope},
    id::{NodeId, TenantId, TenantTimelineId, TimelineId},
@@ -276,10 +282,10 @@ fn print_timeline(
 /// Connects to the pageserver to query this information.
 async fn get_timeline_infos(
    env: &local_env::LocalEnv,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<HashMap<TimelineId, TimelineInfo>> {
    Ok(get_default_pageserver(env)
-        .timeline_list(tenant_id)
+        .timeline_list(tenant_shard_id)
        .await?
        .into_iter()
        .map(|timeline_info| (timeline_info.timeline_id, timeline_info))
@@ -297,6 +303,20 @@ fn get_tenant_id(sub_match: &ArgMatches, env: &local_env::LocalEnv) -> anyhow::R
    }
 }

+// Helper function to parse the --tenant-id option, for commands that accept a shard suffix
+fn get_tenant_shard_id(
+    sub_match: &ArgMatches,
+    env: &local_env::LocalEnv,
+) -> anyhow::Result<TenantShardId> {
+    if let Some(tenant_id_from_arguments) = parse_tenant_shard_id(sub_match).transpose() {
+        tenant_id_from_arguments
+    } else if let Some(default_id) = env.default_tenant_id {
+        Ok(TenantShardId::unsharded(default_id))
+    } else {
+        anyhow::bail!("No tenant shard id. Use --tenant-id, or set a default tenant");
+    }
+}
+
 fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
    sub_match
        .get_one::<String>("tenant-id")
@@ -305,6 +325,14 @@ fn parse_tenant_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantId>> {
        .context("Failed to parse tenant id from the argument string")
 }

+fn parse_tenant_shard_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TenantShardId>> {
+    sub_match
+        .get_one::<String>("tenant-id")
+        .map(|id_str| TenantShardId::from_str(id_str))
+        .transpose()
+        .context("Failed to parse tenant shard id from the argument string")
+}
+
 fn parse_timeline_id(sub_match: &ArgMatches) -> anyhow::Result<Option<TimelineId>> {
    sub_match
        .get_one::<String>("timeline-id")
@@ -338,7 +366,7 @@ fn handle_init(init_match: &ArgMatches) -> anyhow::Result<LocalEnv> {

    let mut env =
        LocalEnv::parse_config(&toml_file).context("Failed to create neon configuration")?;
-    let force = init_match.get_flag("force");
+    let force = init_match.get_one("force").expect("we set a default value");
    env.init(pg_version, force)
        .context("Failed to initialize neon repository")?;

@@ -393,47 +421,68 @@ async fn handle_tenant(
        Some(("create", create_match)) => {
            let tenant_conf: HashMap<_, _> = create_match
                .get_many::<String>("config")
-                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
+                .map(|vals: clap::parser::ValuesRef<'_, String>| {
+                    vals.flat_map(|c| c.split_once(':')).collect()
+                })
                .unwrap_or_default();

+            let shard_count: u8 = create_match
+                .get_one::<u8>("shard-count")
+                .cloned()
+                .unwrap_or(0);
+
+            let shard_stripe_size: Option<u32> =
+                create_match.get_one::<u32>("shard-stripe-size").cloned();
+
+            let tenant_conf = PageServerNode::parse_config(tenant_conf)?;
+
            // If tenant ID was not specified, generate one
            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);

-            let generation = if env.control_plane_api.is_some() {
-                // We must register the tenant with the attachment service, so
-                // that when the pageserver restarts, it will be re-attached.
-                let attachment_service = AttachmentService::from_env(env);
-                attachment_service
-                    .attach_hook(tenant_id, pageserver.conf.id)
-                    .await?
-            } else {
-                None
-            };
-
-            pageserver
-                .tenant_create(tenant_id, generation, tenant_conf)
+            // We must register the tenant with the attachment service, so
+            // that when the pageserver restarts, it will be re-attached.
+            let attachment_service = AttachmentService::from_env(env);
+            attachment_service
+                .tenant_create(TenantCreateRequest {
+                    // Note that ::unsharded here isn't actually because the tenant is unsharded, it's because the
+                    // attachment service expects a shard-naive tenant_id in this attribute, and the TenantCreateRequest
+                    // type is used both in attachment service (for creating tenants) and in pageserver (for creating shards)
+                    new_tenant_id: TenantShardId::unsharded(tenant_id),
+                    generation: None,
+                    shard_parameters: ShardParameters {
+                        count: ShardCount(shard_count),
+                        stripe_size: shard_stripe_size
+                            .map(ShardStripeSize)
+                            .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
+                    },
+                    config: tenant_conf,
+                })
                .await?;
            println!("tenant {tenant_id} successfully created on the pageserver");

            // Create an initial timeline for the new tenant
-            let new_timeline_id = parse_timeline_id(create_match)?;
+            let new_timeline_id =
+                parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
            let pg_version = create_match
                .get_one::<u32>("pg-version")
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            let timeline_info = pageserver
-                .timeline_create(
+            // FIXME: passing None for ancestor_start_lsn is not kosher in a sharded world: we can't have
+            // different shards picking different start lsns.  Maybe we have to teach attachment service
+            // to let shard 0 branch first and then propagate the chosen LSN to other shards.
+            attachment_service
+                .tenant_timeline_create(
                    tenant_id,
-                    new_timeline_id,
-                    None,
-                    None,
-                    Some(pg_version),
-                    None,
+                    TimelineCreateRequest {
+                        new_timeline_id,
+                        ancestor_timeline_id: None,
+                        ancestor_start_lsn: None,
+                        existing_initdb_timeline_id: None,
+                        pg_version: Some(pg_version),
+                    },
                )
                .await?;
-            let new_timeline_id = timeline_info.timeline_id;
-            let last_record_lsn = timeline_info.last_record_lsn;

            env.register_branch_mapping(
                DEFAULT_BRANCH_NAME.to_string(),
@@ -441,9 +490,7 @@ async fn handle_tenant(
                new_timeline_id,
            )?;

-            println!(
-                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
-            );
+            println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);

            if create_match.get_flag("set-default") {
                println!("Setting tenant {tenant_id} as a default one");
@@ -470,14 +517,64 @@ async fn handle_tenant(
            println!("tenant {tenant_id} successfully configured on the pageserver");
        }
        Some(("migrate", matches)) => {
-            let tenant_id = get_tenant_id(matches, env)?;
+            let tenant_shard_id = get_tenant_shard_id(matches, env)?;
            let new_pageserver = get_pageserver(env, matches)?;
            let new_pageserver_id = new_pageserver.conf.id;

-            migrate_tenant(env, tenant_id, new_pageserver).await?;
-            println!("tenant {tenant_id} migrated to {}", new_pageserver_id);
-        }
+            let attachment_service = AttachmentService::from_env(env);
+            attachment_service
+                .tenant_migrate(tenant_shard_id, new_pageserver_id)
+                .await?;

+            println!("tenant {tenant_shard_id} migrated to {}", new_pageserver_id);
+        }
+        Some(("status", matches)) => {
+            let tenant_id = get_tenant_id(matches, env)?;
+
+            let mut shard_table = comfy_table::Table::new();
+            shard_table.set_header(["Shard", "Pageserver", "Physical Size"]);
+
+            let mut tenant_synthetic_size = None;
+
+            let attachment_service = AttachmentService::from_env(env);
+            for shard in attachment_service.tenant_locate(tenant_id).await?.shards {
+                let pageserver =
+                    PageServerNode::from_env(env, env.get_pageserver_conf(shard.node_id)?);
+
+                let size = pageserver
+                    .http_client
+                    .tenant_details(shard.shard_id)
+                    .await?
+                    .tenant_info
+                    .current_physical_size
+                    .unwrap();
+
+                shard_table.add_row([
+                    format!("{}", shard.shard_id.shard_slug()),
+                    format!("{}", shard.node_id.0),
+                    format!("{} MiB", size / (1024 * 1024)),
+                ]);
+
+                if shard.shard_id.is_zero() {
+                    tenant_synthetic_size =
+                        Some(pageserver.tenant_synthetic_size(shard.shard_id).await?);
+                }
+            }
+
+            let Some(synthetic_size) = tenant_synthetic_size else {
+                bail!("Shard 0 not found")
+            };
+
+            let mut tenant_table = comfy_table::Table::new();
+            tenant_table.add_row(["Tenant ID".to_string(), tenant_id.to_string()]);
+            tenant_table.add_row([
+                "Synthetic size".to_string(),
+                format!("{} MiB", synthetic_size.size.unwrap_or(0) / (1024 * 1024)),
+            ]);
+
+            println!("{tenant_table}");
+            println!("{shard_table}");
+        }
        Some((sub_name, _)) => bail!("Unexpected tenant subcommand '{}'", sub_name),
        None => bail!("no tenant subcommand provided"),
    }
@@ -489,8 +586,10 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local

    match timeline_match.subcommand() {
        Some(("list", list_match)) => {
-            let tenant_id = get_tenant_id(list_match, env)?;
-            let timelines = pageserver.timeline_list(&tenant_id).await?;
+            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
+            // where shard 0 is attached, and query there.
+            let tenant_shard_id = get_tenant_shard_id(list_match, env)?;
+            let timelines = pageserver.timeline_list(&tenant_shard_id).await?;
            print_timelines_tree(timelines, env.timeline_name_mappings())?;
        }
        Some(("create", create_match)) => {
@@ -505,18 +604,19 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                .context("Failed to parse postgres version from the argument string")?;

            let new_timeline_id_opt = parse_timeline_id(create_match)?;
+            let new_timeline_id = new_timeline_id_opt.unwrap_or(TimelineId::generate());

-            let timeline_info = pageserver
-                .timeline_create(
-                    tenant_id,
-                    new_timeline_id_opt,
-                    None,
-                    None,
-                    Some(pg_version),
-                    None,
-                )
+            let attachment_service = AttachmentService::from_env(env);
+            let create_req = TimelineCreateRequest {
+                new_timeline_id,
+                ancestor_timeline_id: None,
+                existing_initdb_timeline_id: None,
+                ancestor_start_lsn: None,
+                pg_version: Some(pg_version),
+            };
+            let timeline_info = attachment_service
+                .tenant_timeline_create(tenant_id, create_req)
                .await?;
-            let new_timeline_id = timeline_info.timeline_id;

            let last_record_lsn = timeline_info.last_record_lsn;
            env.register_branch_mapping(new_branch_name.to_string(), tenant_id, new_timeline_id)?;
@@ -574,7 +674,6 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                None,
                pg_version,
                ComputeMode::Primary,
-                DEFAULT_PAGESERVER_ID,
            )?;
            println!("Done");
        }
@@ -598,17 +697,18 @@ async fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::Local
                .map(|lsn_str| Lsn::from_str(lsn_str))
                .transpose()
                .context("Failed to parse ancestor start Lsn from the request")?;
-            let timeline_info = pageserver
-                .timeline_create(
-                    tenant_id,
-                    None,
-                    start_lsn,
-                    Some(ancestor_timeline_id),
-                    None,
-                    None,
-                )
+            let new_timeline_id = TimelineId::generate();
+            let attachment_service = AttachmentService::from_env(env);
+            let create_req = TimelineCreateRequest {
+                new_timeline_id,
+                ancestor_timeline_id: Some(ancestor_timeline_id),
+                existing_initdb_timeline_id: None,
+                ancestor_start_lsn: start_lsn,
+                pg_version: None,
+            };
+            let timeline_info = attachment_service
+                .tenant_timeline_create(tenant_id, create_req)
                .await?;
-            let new_timeline_id = timeline_info.timeline_id;

            let last_record_lsn = timeline_info.last_record_lsn;

@@ -635,8 +735,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re

    match sub_name {
        "list" => {
-            let tenant_id = get_tenant_id(sub_args, env)?;
-            let timeline_infos = get_timeline_infos(env, &tenant_id)
+            // TODO(sharding): this command shouldn't have to specify a shard ID: we should ask the attachment service
+            // where shard 0 is attached, and query there.
+            let tenant_shard_id = get_tenant_shard_id(sub_args, env)?;
+            let timeline_infos = get_timeline_infos(env, &tenant_shard_id)
                .await
                .unwrap_or_else(|e| {
                    eprintln!("Failed to load timeline info: {}", e);
@@ -661,7 +763,7 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
            for (endpoint_id, endpoint) in cplane
                .endpoints
                .iter()
-                .filter(|(_, endpoint)| endpoint.tenant_id == tenant_id)
+                .filter(|(_, endpoint)| endpoint.tenant_id == tenant_shard_id.tenant_id)
            {
                let lsn_str = match endpoint.mode {
                    ComputeMode::Static(lsn) => {
@@ -680,7 +782,10 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                };

                let branch_name = timeline_name_mappings
-                    .get(&TenantTimelineId::new(tenant_id, endpoint.timeline_id))
+                    .get(&TenantTimelineId::new(
+                        tenant_shard_id.tenant_id,
+                        endpoint.timeline_id,
+                    ))
                    .map(|name| name.as_str())
                    .unwrap_or("?");

@@ -728,13 +833,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .copied()
                .unwrap_or(false);

-            let pageserver_id =
-                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    NodeId(id_str.parse().context("while parsing pageserver id")?)
-                } else {
-                    DEFAULT_PAGESERVER_ID
-                };
-
            let mode = match (lsn, hot_standby) {
                (Some(lsn), false) => ComputeMode::Static(lsn),
                (None, true) => ComputeMode::Replica,
@@ -762,7 +860,6 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                http_port,
                pg_version,
                mode,
-                pageserver_id,
            )?;
        }
        "start" => {
@@ -772,9 +869,11 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re

            let pageserver_id =
                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    NodeId(id_str.parse().context("while parsing pageserver id")?)
+                    Some(NodeId(
+                        id_str.parse().context("while parsing pageserver id")?,
+                    ))
                } else {
-                    DEFAULT_PAGESERVER_ID
+                    None
                };

            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");
@@ -805,7 +904,38 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                endpoint.timeline_id,
            )?;

-            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
+            let (pageservers, stripe_size) = if let Some(pageserver_id) = pageserver_id {
+                let conf = env.get_pageserver_conf(pageserver_id).unwrap();
+                let parsed = parse_host_port(&conf.listen_pg_addr).expect("Bad config");
+                (
+                    vec![(parsed.0, parsed.1.unwrap_or(5432))],
+                    // If caller is telling us what pageserver to use, this is not a tenant which is
+                    // fully managed by attachment service, therefore not sharded.
+                    ShardParameters::DEFAULT_STRIPE_SIZE,
+                )
+            } else {
+                // Look up the currently attached location of the tenant, and its striping metadata,
+                // to pass these on to postgres.
+                let attachment_service = AttachmentService::from_env(env);
+                let locate_result = attachment_service.tenant_locate(endpoint.tenant_id).await?;
+                let pageservers = locate_result
+                    .shards
+                    .into_iter()
+                    .map(|shard| {
+                        (
+                            Host::parse(&shard.listen_pg_addr)
+                                .expect("Attachment service reported bad hostname"),
+                            shard.listen_pg_port,
+                        )
+                    })
+                    .collect::<Vec<_>>();
+                let stripe_size = locate_result.shard_params.stripe_size;
+
+                (pageservers, stripe_size)
+            };
+            assert!(!pageservers.is_empty());
+
+            let ps_conf = env.get_pageserver_conf(DEFAULT_PAGESERVER_ID)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);

@@ -816,7 +946,13 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re

            println!("Starting existing endpoint {endpoint_id}...");
            endpoint
-                .start(&auth_token, safekeepers, remote_ext_config)
+                .start(
+                    &auth_token,
+                    safekeepers,
+                    pageservers,
+                    remote_ext_config,
+                    stripe_size.0 as usize,
+                )
                .await?;
        }
        "reconfigure" => {
@@ -827,15 +963,31 @@ async fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Re
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageserver_id =
+            let pageservers =
                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    Some(NodeId(
-                        id_str.parse().context("while parsing pageserver id")?,
-                    ))
+                    let ps_id = NodeId(id_str.parse().context("while parsing pageserver id")?);
+                    let pageserver = PageServerNode::from_env(env, env.get_pageserver_conf(ps_id)?);
+                    vec![(
+                        pageserver.pg_connection_config.host().clone(),
+                        pageserver.pg_connection_config.port(),
+                    )]
                } else {
-                    None
+                    let attachment_service = AttachmentService::from_env(env);
+                    attachment_service
+                        .tenant_locate(endpoint.tenant_id)
+                        .await?
+                        .shards
+                        .into_iter()
+                        .map(|shard| {
+                            (
+                                Host::parse(&shard.listen_pg_addr)
+                                    .expect("Attachment service reported malformed host"),
+                                shard.listen_pg_port,
+                            )
+                        })
+                        .collect::<Vec<_>>()
                };
-            endpoint.reconfigure(pageserver_id).await?;
+            endpoint.reconfigure(pageservers).await?;
        }
        "stop" => {
            let endpoint_id = sub_args
@@ -959,6 +1111,21 @@ async fn handle_pageserver(sub_match: &ArgMatches, env: &local_env::LocalEnv) ->
            }
        }

+        Some(("set-state", subcommand_args)) => {
+            let pageserver = get_pageserver(env, subcommand_args)?;
+            let scheduling = subcommand_args.get_one("scheduling");
+            let availability = subcommand_args.get_one("availability");
+
+            let attachment_service = AttachmentService::from_env(env);
+            attachment_service
+                .node_configure(NodeConfigureRequest {
+                    node_id: pageserver.conf.id,
+                    scheduling: scheduling.cloned(),
+                    availability: availability.cloned(),
+                })
+                .await?;
+        }
+
        Some(("status", subcommand_args)) => {
            match get_pageserver(env, subcommand_args)?.check_status().await {
                Ok(_) => println!("Page server is up and running"),
@@ -1266,9 +1433,15 @@ fn cli() -> Command {
        .required(false);

    let force_arg = Arg::new("force")
-        .value_parser(value_parser!(bool))
+        .value_parser(value_parser!(InitForceMode))
        .long("force")
-        .action(ArgAction::SetTrue)
+        .default_value(
+            InitForceMode::MustNotExist
+                .to_possible_value()
+                .unwrap()
+                .get_name()
+                .to_owned(),
+        )
        .help("Force initialization even if the repository is not empty")
        .required(false);

@@ -1352,6 +1525,8 @@ fn cli() -> Command {
                .arg(pg_version_arg.clone())
                .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
                    .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
+                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
+                .arg(Arg::new("shard-stripe-size").value_parser(value_parser!(u32)).long("shard-stripe-size").action(ArgAction::Set).help("Sharding stripe size in pages"))
                )
            .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
                .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
@@ -1362,6 +1537,9 @@ fn cli() -> Command {
                .about("Migrate a tenant from one pageserver to another")
                .arg(tenant_id_arg.clone())
                .arg(pageserver_id_arg.clone()))
+            .subcommand(Command::new("status")
+                .about("Human readable summary of the tenant's shards and attachment locations")
+                .arg(tenant_id_arg.clone()))
        )
        .subcommand(
            Command::new("pageserver")
@@ -1381,6 +1559,12 @@ fn cli() -> Command {
                    .about("Restart local pageserver")
                    .arg(pageserver_config_args.clone())
                )
+                .subcommand(Command::new("set-state")
+                    .arg(Arg::new("availability").value_parser(value_parser!(NodeAvailability)).long("availability").action(ArgAction::Set).help("Availability state: offline,active"))
+                    .arg(Arg::new("scheduling").value_parser(value_parser!(NodeSchedulingPolicy)).long("scheduling").action(ArgAction::Set).help("Scheduling state: draining,pause,filling,active"))
+                    .about("Set scheduling or availability state of pageserver node")
+                    .arg(pageserver_config_args.clone())
+                )
        )
        .subcommand(
            Command::new("attachment_service")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -49,10 +49,11 @@ use compute_api::spec::RemoteExtSpec;
 use nix::sys::signal::kill;
 use nix::sys::signal::Signal;
 use serde::{Deserialize, Serialize};
+use url::Host;
 use utils::id::{NodeId, TenantId, TimelineId};

+use crate::attachment_service::AttachmentService;
 use crate::local_env::LocalEnv;
-use crate::pageserver::PageServerNode;
 use crate::postgresql_conf::PostgresConf;

 use compute_api::responses::{ComputeState, ComputeStatus};
@@ -69,7 +70,6 @@ pub struct EndpointConf {
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
-    pageserver_id: NodeId,
 }

 //
@@ -121,19 +121,14 @@ impl ComputeControlPlane {
        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
-        pageserver_id: NodeId,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
-        let pageserver =
-            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
-
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
-            pageserver,
            timeline_id,
            mode,
            tenant_id,
@@ -159,7 +154,6 @@ impl ComputeControlPlane {
                pg_port,
                pg_version,
                skip_pg_catalog_updates: true,
-                pageserver_id,
            })?,
        )?;
        std::fs::write(
@@ -218,7 +212,6 @@ pub struct Endpoint {
    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
    pub env: LocalEnv,
-    pageserver: PageServerNode,

    // Optimizations
    skip_pg_catalog_updates: bool,
@@ -241,15 +234,11 @@ impl Endpoint {
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

-        let pageserver =
-            PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
-
        Ok(Endpoint {
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
            endpoint_id,
            env: env.clone(),
-            pageserver,
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
@@ -469,11 +458,21 @@ impl Endpoint {
        }
    }

+    fn build_pageserver_connstr(pageservers: &[(Host, u16)]) -> String {
+        pageservers
+            .iter()
+            .map(|(host, port)| format!("postgresql://no_user@{host}:{port}"))
+            .collect::<Vec<_>>()
+            .join(",")
+    }
+
    pub async fn start(
        &self,
        auth_token: &Option<String>,
        safekeepers: Vec<NodeId>,
+        pageservers: Vec<(Host, u16)>,
        remote_ext_config: Option<&String>,
+        shard_stripe_size: usize,
    ) -> Result<()> {
        if self.status() == "running" {
            anyhow::bail!("The endpoint is already running");
@@ -487,13 +486,9 @@ impl Endpoint {
            std::fs::remove_dir_all(self.pgdata())?;
        }

-        let pageserver_connstring = {
-            let config = &self.pageserver.pg_connection_config;
-            let (host, port) = (config.host(), config.port());
+        let pageserver_connstring = Self::build_pageserver_connstr(&pageservers);
+        assert!(!pageserver_connstring.is_empty());

-            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
-            format!("postgresql://no_user@{host}:{port}")
-        };
        let mut safekeeper_connstrings = Vec::new();
        if self.mode == ComputeMode::Primary {
            for sk_id in safekeepers {
@@ -543,6 +538,7 @@ impl Endpoint {
            storage_auth_token: auth_token.clone(),
            remote_extensions,
            pgbouncer_settings: None,
+            shard_stripe_size: Some(shard_stripe_size),
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -665,7 +661,7 @@ impl Endpoint {
        }
    }

-    pub async fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
+    pub async fn reconfigure(&self, mut pageservers: Vec<(Host, u16)>) -> Result<()> {
        let mut spec: ComputeSpec = {
            let spec_path = self.endpoint_path().join("spec.json");
            let file = std::fs::File::open(spec_path)?;
@@ -675,25 +671,27 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        if let Some(pageserver_id) = pageserver_id {
-            let endpoint_config_path = self.endpoint_path().join("endpoint.json");
-            let mut endpoint_conf: EndpointConf = {
-                let file = std::fs::File::open(&endpoint_config_path)?;
-                serde_json::from_reader(file)?
-            };
-            endpoint_conf.pageserver_id = pageserver_id;
-            std::fs::write(
-                endpoint_config_path,
-                serde_json::to_string_pretty(&endpoint_conf)?,
-            )?;
-
-            let pageserver =
-                PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
-            let ps_http_conf = &pageserver.pg_connection_config;
-            let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
-            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
+        // If we weren't given explicit pageservers, query the attachment service
+        if pageservers.is_empty() {
+            let attachment_service = AttachmentService::from_env(&self.env);
+            let locate_result = attachment_service.tenant_locate(self.tenant_id).await?;
+            pageservers = locate_result
+                .shards
+                .into_iter()
+                .map(|shard| {
+                    (
+                        Host::parse(&shard.listen_pg_addr)
+                            .expect("Attachment service reported bad hostname"),
+                        shard.listen_pg_port,
+                    )
+                })
+                .collect::<Vec<_>>();
        }

+        let pageserver_connstr = Self::build_pageserver_connstr(&pageservers);
+        assert!(!pageserver_connstr.is_empty());
+        spec.pageserver_connstring = Some(pageserver_connstr);
+
        let client = reqwest::Client::new();
        let response = client
            .post(format!(
--- a/control_plane/src/lib.rs
+++ b/control_plane/src/lib.rs
@@ -14,4 +14,3 @@ pub mod local_env;
 pub mod pageserver;
 pub mod postgresql_conf;
 pub mod safekeeper;
-pub mod tenant_migration;
--- a/control_plane/src/local_env.rs
+++ b/control_plane/src/local_env.rs
@@ -5,6 +5,7 @@

 use anyhow::{bail, ensure, Context};

+use clap::ValueEnum;
 use postgres_backend::AuthType;
 use reqwest::Url;
 use serde::{Deserialize, Serialize};
@@ -162,6 +163,31 @@ impl Default for SafekeeperConf {
    }
 }

+#[derive(Clone, Copy)]
+pub enum InitForceMode {
+    MustNotExist,
+    EmptyDirOk,
+    RemoveAllContents,
+}
+
+impl ValueEnum for InitForceMode {
+    fn value_variants<'a>() -> &'a [Self] {
+        &[
+            Self::MustNotExist,
+            Self::EmptyDirOk,
+            Self::RemoveAllContents,
+        ]
+    }
+
+    fn to_possible_value(&self) -> Option<clap::builder::PossibleValue> {
+        Some(clap::builder::PossibleValue::new(match self {
+            InitForceMode::MustNotExist => "must-not-exist",
+            InitForceMode::EmptyDirOk => "empty-dir-ok",
+            InitForceMode::RemoveAllContents => "remove-all-contents",
+        }))
+    }
+}
+
 impl SafekeeperConf {
    /// Compute is served by port on which only tenant scoped tokens allowed, if
    /// it is configured.
@@ -225,7 +251,13 @@ impl LocalEnv {
        if let Some(conf) = self.pageservers.iter().find(|node| node.id == id) {
            Ok(conf)
        } else {
-            bail!("could not find pageserver {id}")
+            let have_ids = self
+                .pageservers
+                .iter()
+                .map(|node| format!("{}:{}", node.id, node.listen_http_addr))
+                .collect::<Vec<_>>();
+            let joined = have_ids.join(",");
+            bail!("could not find pageserver {id}, have ids {joined}")
        }
    }

@@ -384,7 +416,7 @@ impl LocalEnv {
    //
    // Initialize a new Neon repository
    //
-    pub fn init(&mut self, pg_version: u32, force: bool) -> anyhow::Result<()> {
+    pub fn init(&mut self, pg_version: u32, force: &InitForceMode) -> anyhow::Result<()> {
        // check if config already exists
        let base_path = &self.base_data_dir;
        ensure!(
@@ -393,25 +425,34 @@ impl LocalEnv {
        );

        if base_path.exists() {
-            if force {
-                println!("removing all contents of '{}'", base_path.display());
-                // instead of directly calling `remove_dir_all`, we keep the original dir but removing
-                // all contents inside. This helps if the developer symbol links another directory (i.e.,
-                // S3 local SSD) to the `.neon` base directory.
-                for entry in std::fs::read_dir(base_path)? {
-                    let entry = entry?;
-                    let path = entry.path();
-                    if path.is_dir() {
-                        fs::remove_dir_all(&path)?;
-                    } else {
-                        fs::remove_file(&path)?;
+            match force {
+                InitForceMode::MustNotExist => {
+                    bail!(
+                        "directory '{}' already exists. Perhaps already initialized?",
+                        base_path.display()
+                    );
+                }
+                InitForceMode::EmptyDirOk => {
+                    if let Some(res) = std::fs::read_dir(base_path)?.next() {
+                        res.context("check if directory is empty")?;
+                        anyhow::bail!("directory not empty: {base_path:?}");
+                    }
+                }
+                InitForceMode::RemoveAllContents => {
+                    println!("removing all contents of '{}'", base_path.display());
+                    // instead of directly calling `remove_dir_all`, we keep the original dir but removing
+                    // all contents inside. This helps if the developer symbol links another directory (i.e.,
+                    // S3 local SSD) to the `.neon` base directory.
+                    for entry in std::fs::read_dir(base_path)? {
+                        let entry = entry?;
+                        let path = entry.path();
+                        if path.is_dir() {
+                            fs::remove_dir_all(&path)?;
+                        } else {
+                            fs::remove_file(&path)?;
+                        }
                    }
                }
-            } else {
-                bail!(
-                    "directory '{}' already exists. Perhaps already initialized? (Hint: use --force to remove all contents)",
-                    base_path.display()
-                );
            }
        }

--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -17,7 +17,9 @@ use std::time::Duration;
 use anyhow::{bail, Context};
 use camino::Utf8PathBuf;
 use futures::SinkExt;
-use pageserver_api::models::{self, LocationConfig, TenantInfo, TimelineInfo};
+use pageserver_api::models::{
+    self, LocationConfig, ShardParameters, TenantHistorySize, TenantInfo, TimelineInfo,
+};
 use pageserver_api::shard::TenantShardId;
 use pageserver_client::mgmt_api;
 use postgres_backend::AuthType;
@@ -106,6 +108,16 @@ impl PageServerNode {
                "control_plane_api='{}'",
                control_plane_api.as_str()
            ));
+
+            // Attachment service uses the same auth as pageserver: if JWT is enabled
+            // for us, we will also need it to talk to them.
+            if matches!(self.conf.http_auth_type, AuthType::NeonJWT) {
+                let jwt_token = self
+                    .env
+                    .generate_auth_token(&Claims::new(None, Scope::PageServerApi))
+                    .unwrap();
+                overrides.push(format!("control_plane_api_token='{}'", jwt_token));
+            }
        }

        if !cli_overrides
@@ -301,16 +313,8 @@ impl PageServerNode {
    pub async fn tenant_list(&self) -> mgmt_api::Result<Vec<TenantInfo>> {
        self.http_client.list_tenants().await
    }
-
-    pub async fn tenant_create(
-        &self,
-        new_tenant_id: TenantId,
-        generation: Option<u32>,
-        settings: HashMap<&str, &str>,
-    ) -> anyhow::Result<TenantId> {
-        let mut settings = settings.clone();
-
-        let config = models::TenantConfig {
+    pub fn parse_config(mut settings: HashMap<&str, &str>) -> anyhow::Result<models::TenantConfig> {
+        let result = models::TenantConfig {
            checkpoint_distance: settings
                .remove("checkpoint_distance")
                .map(|x| x.parse::<u64>())
@@ -371,11 +375,26 @@ impl PageServerNode {
                .context("Failed to parse 'gc_feedback' as bool")?,
            heatmap_period: settings.remove("heatmap_period").map(|x| x.to_string()),
        };
+        if !settings.is_empty() {
+            bail!("Unrecognized tenant settings: {settings:?}")
+        } else {
+            Ok(result)
+        }
+    }
+
+    pub async fn tenant_create(
+        &self,
+        new_tenant_id: TenantId,
+        generation: Option<u32>,
+        settings: HashMap<&str, &str>,
+    ) -> anyhow::Result<TenantId> {
+        let config = Self::parse_config(settings.clone())?;

        let request = models::TenantCreateRequest {
            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
            generation,
            config,
+            shard_parameters: ShardParameters::default(),
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -471,31 +490,39 @@ impl PageServerNode {

    pub async fn location_config(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        config: LocationConfig,
        flush_ms: Option<Duration>,
    ) -> anyhow::Result<()> {
        Ok(self
            .http_client
-            .location_config(tenant_id, config, flush_ms)
+            .location_config(tenant_shard_id, config, flush_ms)
            .await?)
    }

-    pub async fn timeline_list(&self, tenant_id: &TenantId) -> anyhow::Result<Vec<TimelineInfo>> {
-        Ok(self.http_client.list_timelines(*tenant_id).await?)
+    pub async fn timeline_list(
+        &self,
+        tenant_shard_id: &TenantShardId,
+    ) -> anyhow::Result<Vec<TimelineInfo>> {
+        Ok(self.http_client.list_timelines(*tenant_shard_id).await?)
+    }
+
+    pub async fn tenant_secondary_download(&self, tenant_id: &TenantShardId) -> anyhow::Result<()> {
+        Ok(self
+            .http_client
+            .tenant_secondary_download(*tenant_id)
+            .await?)
    }

    pub async fn timeline_create(
        &self,
-        tenant_id: TenantId,
-        new_timeline_id: Option<TimelineId>,
+        tenant_shard_id: TenantShardId,
+        new_timeline_id: TimelineId,
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
        existing_initdb_timeline_id: Option<TimelineId>,
    ) -> anyhow::Result<TimelineInfo> {
-        // If timeline ID was not specified, generate one
-        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
        let req = models::TimelineCreateRequest {
            new_timeline_id,
            ancestor_start_lsn,
@@ -503,7 +530,10 @@ impl PageServerNode {
            pg_version,
            existing_initdb_timeline_id,
        };
-        Ok(self.http_client.timeline_create(tenant_id, &req).await?)
+        Ok(self
+            .http_client
+            .timeline_create(tenant_shard_id, &req)
+            .await?)
    }

    /// Import a basebackup prepared using either:
@@ -581,4 +611,14 @@ impl PageServerNode {

        Ok(())
    }
+
+    pub async fn tenant_synthetic_size(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> anyhow::Result<TenantHistorySize> {
+        Ok(self
+            .http_client
+            .tenant_synthetic_size(tenant_shard_id)
+            .await?)
+    }
 }
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -1,205 +0,0 @@
-//!
-//! Functionality for migrating tenants across pageservers: unlike most of neon_local, this code
-//! isn't scoped to a particular physical service, as it needs to update compute endpoints to
-//! point to the new pageserver.
-//!
-use crate::local_env::LocalEnv;
-use crate::{
-    attachment_service::AttachmentService, endpoint::ComputeControlPlane,
-    pageserver::PageServerNode,
-};
-use pageserver_api::models::{
-    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
-};
-use std::collections::HashMap;
-use std::time::Duration;
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
-
-/// Given an attached pageserver, retrieve the LSN for all timelines
-async fn get_lsns(
-    tenant_id: TenantId,
-    pageserver: &PageServerNode,
-) -> anyhow::Result<HashMap<TimelineId, Lsn>> {
-    let timelines = pageserver.timeline_list(&tenant_id).await?;
-    Ok(timelines
-        .into_iter()
-        .map(|t| (t.timeline_id, t.last_record_lsn))
-        .collect())
-}
-
-/// Wait for the timeline LSNs on `pageserver` to catch up with or overtake
-/// `baseline`.
-async fn await_lsn(
-    tenant_id: TenantId,
-    pageserver: &PageServerNode,
-    baseline: HashMap<TimelineId, Lsn>,
-) -> anyhow::Result<()> {
-    loop {
-        let latest = match get_lsns(tenant_id, pageserver).await {
-            Ok(l) => l,
-            Err(e) => {
-                println!(
-                    "🕑 Can't get LSNs on pageserver {} yet, waiting ({e})",
-                    pageserver.conf.id
-                );
-                std::thread::sleep(Duration::from_millis(500));
-                continue;
-            }
-        };
-
-        let mut any_behind: bool = false;
-        for (timeline_id, baseline_lsn) in &baseline {
-            match latest.get(timeline_id) {
-                Some(latest_lsn) => {
-                    println!("🕑 LSN origin {baseline_lsn} vs destination {latest_lsn}");
-                    if latest_lsn < baseline_lsn {
-                        any_behind = true;
-                    }
-                }
-                None => {
-                    // Expected timeline isn't yet visible on migration destination.
-                    // (IRL we would have to account for timeline deletion, but this
-                    //  is just test helper)
-                    any_behind = true;
-                }
-            }
-        }
-
-        if !any_behind {
-            println!("✅ LSN caught up.  Proceeding...");
-            break;
-        } else {
-            std::thread::sleep(Duration::from_millis(500));
-        }
-    }
-
-    Ok(())
-}
-
-/// This function spans multiple services, to demonstrate live migration of a tenant
-/// between pageservers:
-///  - Coordinate attach/secondary/detach on pageservers
-///  - call into attachment_service for generations
-///  - reconfigure compute endpoints to point to new attached pageserver
-pub async fn migrate_tenant(
-    env: &LocalEnv,
-    tenant_id: TenantId,
-    dest_ps: PageServerNode,
-) -> anyhow::Result<()> {
-    // Get a new generation
-    let attachment_service = AttachmentService::from_env(env);
-
-    fn build_location_config(
-        mode: LocationConfigMode,
-        generation: Option<u32>,
-        secondary_conf: Option<LocationConfigSecondary>,
-    ) -> LocationConfig {
-        LocationConfig {
-            mode,
-            generation,
-            secondary_conf,
-            tenant_conf: TenantConfig::default(),
-            shard_number: 0,
-            shard_count: 0,
-            shard_stripe_size: 0,
-        }
-    }
-
-    let previous = attachment_service.inspect(tenant_id).await?;
-    let mut baseline_lsns = None;
-    if let Some((generation, origin_ps_id)) = &previous {
-        let origin_ps = PageServerNode::from_env(env, env.get_pageserver_conf(*origin_ps_id)?);
-
-        if origin_ps_id == &dest_ps.conf.id {
-            println!("🔁 Already attached to {origin_ps_id}, freshening...");
-            let gen = attachment_service
-                .attach_hook(tenant_id, dest_ps.conf.id)
-                .await?;
-            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-            dest_ps.location_config(tenant_id, dest_conf, None).await?;
-            println!("✅ Migration complete");
-            return Ok(());
-        }
-
-        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
-
-        let stale_conf =
-            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
-        origin_ps
-            .location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))
-            .await?;
-
-        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps).await?);
-    }
-
-    let gen = attachment_service
-        .attach_hook(tenant_id, dest_ps.conf.id)
-        .await?;
-    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
-
-    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
-    dest_ps.location_config(tenant_id, dest_conf, None).await?;
-
-    if let Some(baseline) = baseline_lsns {
-        println!("🕑 Waiting for LSN to catch up...");
-        await_lsn(tenant_id, &dest_ps, baseline).await?;
-    }
-
-    let cplane = ComputeControlPlane::load(env.clone())?;
-    for (endpoint_name, endpoint) in &cplane.endpoints {
-        if endpoint.tenant_id == tenant_id {
-            println!(
-                "🔁 Reconfiguring endpoint {} to use pageserver {}",
-                endpoint_name, dest_ps.conf.id
-            );
-            endpoint.reconfigure(Some(dest_ps.conf.id)).await?;
-        }
-    }
-
-    for other_ps_conf in &env.pageservers {
-        if other_ps_conf.id == dest_ps.conf.id {
-            continue;
-        }
-
-        let other_ps = PageServerNode::from_env(env, other_ps_conf);
-        let other_ps_tenants = other_ps.tenant_list().await?;
-
-        // Check if this tenant is attached
-        let found = other_ps_tenants
-            .into_iter()
-            .map(|t| t.id)
-            .any(|i| i.tenant_id == tenant_id);
-        if !found {
-            continue;
-        }
-
-        // Downgrade to a secondary location
-        let secondary_conf = build_location_config(
-            LocationConfigMode::Secondary,
-            None,
-            Some(LocationConfigSecondary { warm: true }),
-        );
-
-        println!(
-            "💤 Switching to secondary mode on pageserver {}",
-            other_ps.conf.id
-        );
-        other_ps
-            .location_config(tenant_id, secondary_conf, None)
-            .await?;
-    }
-
-    println!(
-        "🔁 Switching to AttachedSingle mode on pageserver {}",
-        dest_ps.conf.id
-    );
-    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-    dest_ps.location_config(tenant_id, dest_conf, None).await?;
-
-    println!("✅ Migration complete");
-
-    Ok(())
-}
--- a/docs/rfcs/030-vectored-timeline-get.md
+++ b/docs/rfcs/030-vectored-timeline-get.md
@@ -0,0 +1,142 @@
+# Vectored Timeline Get
+
+Created on: 2024-01-02
+Author: Christian Schwarz
+
+# Summary
+
+A brief RFC / GitHub Epic describing a vectored version of the `Timeline::get` method that is at the heart of Pageserver.
+
+# Motivation
+
+During basebackup, we issue many `Timeline::get` calls for SLRU pages that are *adjacent* in key space.
+For an example, see
+https://github.com/neondatabase/neon/blob/5c88213eaf1b1e29c610a078d0b380f69ed49a7e/pageserver/src/basebackup.rs#L281-L302.
+
+Each of these `Timeline::get` calls must traverse the layer map to gather reconstruct data (`Timeline::get_reconstruct_data`) for the requested page number (`blknum` in the example).
+For each layer visited by layer map traversal, we do a `DiskBtree` point lookup.
+If it's negative (no entry), we resume layer map traversal.
+If it's positive, we collect the result in our reconstruct data bag.
+If the reconstruct data bag contents suffice to reconstruct the page, we're done with `get_reconstruct_data` and move on to walredo.
+Otherwise, we resume layer map traversal.
+
+Doing this many `Timeline::get` calls is quite inefficient because:
+
+1. We do the layer map traversal repeatedly, even if, e.g., all the data sits in the same image layer at the bottom of the stack.
+2. We may visit many DiskBtree inner pages multiple times for point lookup of different keys.
+   This is likely particularly bad for L0s which span the whole key space and hence must be visited by layer map traversal, but
+   may not contain the data we're looking for.
+3. Anecdotally, keys adjacent in keyspace and written simultaneously also end up physically adjacent in the layer files [^1].
+   So, to provide the reconstruct data for N adjacent keys, we would actually only _need_ to issue a single large read to the filesystem, instead of the N reads we currently do.
+   The filesystem, in turn, ideally stores the layer file physically contiguously, so our large read will turn into one IOP toward the disk.
+
+[^1]: https://www.notion.so/neondatabase/Christian-Investigation-Slow-Basebackups-Early-2023-12-34ea5c7dcdc1485d9ac3731da4d2a6fc?pvs=4#15ee4e143392461fa64590679c8f54c9
+
+# Solution
+
+We should have a vectored aka batched aka scatter-gather style alternative API for `Timeline::get`. Having such an API unlocks:
+
+* more efficient basebackup
+* batched IO during compaction (useful for strides of unchanged pages)
+* page_service: expose vectored get_page_at_lsn for compute (=> good for seqscan / prefetch)
+  * if [on-demand SLRU downloads](https://github.com/neondatabase/neon/pull/6151) land before vectored Timeline::get, on-demand SLRU downloads will still benefit from this API
+
+# DoD
+
+There is a new variant of `Timeline::get`, called `Timeline::get_vectored`.
+It takes as arguments an `lsn: Lsn` and a `src: &[KeyVec]` where `struct KeyVec { base: Key, count: usize }`.
+
+It is up to the implementor to figure out a suitable and efficient way to return the reconstructed page images.
+It is sufficient to simply return a `Vec<Bytes>`, but more efficient solutions can likely be found after studying all the callers of `Timeline::get`.
+
+Functionally, the behavior of `Timeline::get_vectored` is equivalent to
+
+```rust
+let mut keys_iter: impl Iterator<Item=Key>
+  = src.map(|KeyVec{ base, count }| (base..base+count)).flatten();
+let mut out = Vec::new();
+for key in keys_iter {
+    let data = Timeline::get(key, lsn)?;
+    out.push(data);
+}
+return out;
+```
+
+However, unlike above, an ideal solution will
+
+* Visit each `struct Layer` at most once.
+* For each visited layer, call `Layer::get_value_reconstruct_data` at most once.
+  * This means, read each `DiskBtree` page at most once.
+* Facilitate merging of the reads we issue to the OS and eventually NVMe.
+
+Each of these items above represents a significant amount of work.
+
+## Performance
+
+Ideally, the **base performance** of a vectored get of a single page should be identical to the current `Timeline::get`.
+A reasonable constant overhead over current `Timeline::get` is acceptable.
+
+The performance improvement for the vectored use case is demonstrated in some way, e.g., using the `pagebench` basebackup benchmark against a tenant with a lot of SLRU segments.
+
+# Implementation
+
+High-level set of tasks / changes to be made:
+
+- **Get clarity on API**:
+  - Define naive `Timeline::get_vectored` implementation & adopt it across pageserver.
+  - The tricky thing here will be the return type (e.g. `Vec<Bytes>` vs `impl Stream`).
+  - Start with something simple to explore the different usages of the API.
+    Then iterate with peers until we have something that is good enough.
+- **Vectored Layer Map traversal**
+  - Vectored `LayerMap::search` (take 1 LSN and N `Key`s instead of just 1 LSN and 1 `Key`)
+  - Refactor `Timeline::get_reconstruct_data` to hold & return state for N `Key`s instead of 1
+    - The slightly tricky part here is what to do about `cont_lsn` [after we've found some reconstruct data for some keys](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2385)
+      but need more.
+      Likely we'll need to keep track of `cont_lsn` per key and continue next iteration at `max(cont_lsn)` of all keys that still need data.
+- **Vectored `Layer::get_value_reconstruct_data` / `DiskBtree`**
+  - Current code calls it [here](https://github.com/neondatabase/neon/blob/d066dad84b076daf3781cdf9a692098889d3974e/pageserver/src/tenant/timeline.rs#L2378-L2384).
+  - Delta layers use `DiskBtreeReader::visit()` to collect the `(offset,len)` pairs for delta record blobs to load.
+  - Image layers use `DiskBtreeReader::get` to get the offset of the image blob to load. Underneath, that's just a `::visit()` call.
+  - What needs to happen to `DiskBtree::visit()`?
+    * Minimally
+      * take a single `KeyVec` instead of a single `Key` as argument, i.e., take a single contiguous key range to visit.
+      * Change the visit code to invoke the callback for all values in the `KeyVec`'s key range
+      * This should be good enough for what we've seen when investigating basebackup slowness, because there, the key ranges are contiguous.
+    * Ideally:
+      * Take a `&[KeyVec]`, sort it;
+      * during Btree traversal, peek at the next `KeyVec` range to determine whether we need to descend or back out.
+      * NB: this should be a straight-forward extension of the minimal solution above, as we'll already be checking for "is there more key range in the requested `KeyVec`".
+- **Facilitate merging of the reads we issue to the OS and eventually NVMe.**
+  - The `DiskBtree::visit` produces a set of offsets which we then read from a `VirtualFile` [here](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
+    - [Delta layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/delta_layer.rs#L772-L804)
+      - We hit (and rely on) `PageCache` and `VirtualFile` here (not great under pressure)
+    - [Image layer reads](https://github.com/neondatabase/neon/blob/292281c9dfb24152b728b1a846cc45105dac7fe0/pageserver/src/tenant/storage_layer/image_layer.rs#L429-L435)
+  - What needs to happen is the **vectorization of the `blob_io` interface and then the `VirtualFile` API**.
+  - That is tricky because
+    - the `VirtualFile` API, which sits underneath `blob_io`, is being touched by ongoing [io_uring work](https://github.com/neondatabase/neon/pull/5824)
+    - there's the question how IO buffers will be managed; currently this area relies heavily on `PageCache`, but there's controversy around the future of `PageCache`.
+      - The guiding principle here should be to avoid coupling this work to the `PageCache`.
+      - I.e., treat `PageCache` as an extra hop in the I/O chain, rather than as an integral part of buffer management.
+
+
+Let's see how we can improve by doing the first three items in the above list first, then revisit.
+
+## Rollout / Feature Flags
+
+No feature flags are required for this epic.
+
+At the end of this epic, `Timeline::get` forwards to `Timeline::get_vectored`, i.e., it's an all-or-nothing type of change.
+
+It is encouraged to deliver this feature incrementally, i.e., do many small PRs over multiple weeks.
+That will help isolate performance regressions across weekly releases.
+
+# Interaction With Sharding
+
+[Sharding](https://github.com/neondatabase/neon/pull/5432) splits up the key space, see functions `is_key_local` / `key_to_shard_number`.
+
+Just as with `Timeline::get`, callers of `Timeline::get_vectored` are responsible for ensuring that they only ask for blocks of the given `struct Timeline`'s shard.
+
+Given that this is already the case, there shouldn't be significant interaction/interference with sharding.
+
+However, let's have a safety check for this constraint (error or assertion) because there are currently few affordances at the higher layers of Pageserver for sharding<=>keyspace interaction.
+For example, `KeySpace` is not broken up by shard stripe, so if someone naively converted the compaction code to issue a vectored get for a keyspace range it would violate this constraint.
--- a/docs/sourcetree.md
+++ b/docs/sourcetree.md
@@ -129,13 +129,13 @@ Run `poetry shell` to activate the virtual environment.
 Alternatively, use `poetry run` to run a single command in the venv, e.g. `poetry run pytest`.

 ### Obligatory checks
-We force code formatting via `black`, `ruff`, and type hints via `mypy`.
+We force code formatting via `ruff`, and type hints via `mypy`.
 Run the following commands in the repository's root (next to `pyproject.toml`):

 ```bash
-poetry run black .  # All code is reformatted
-poetry run ruff .  # Python linter
-poetry run mypy .  # Ensure there are no typing errors
+poetry run ruff format . # All code is reformatted
+poetry run ruff check .  # Python linter
+poetry run mypy .        # Ensure there are no typing errors
 ```

 **WARNING**: do not run `mypy` from a directory other than the root of the repository.
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -75,6 +75,10 @@ pub struct ComputeSpec {
    pub remote_extensions: Option<RemoteExtSpec>,

    pub pgbouncer_settings: Option<HashMap<String, String>>,
+
+    // Stripe size for pageserver sharding, in pages
+    #[serde(default)]
+    pub shard_stripe_size: Option<usize>,
 }

 /// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
@@ -82,10 +86,13 @@ pub struct ComputeSpec {
 #[serde(rename_all = "snake_case")]
 pub enum ComputeFeature {
    // XXX: Add more feature flags here.
+    /// Enable the experimental activity monitor logic, which uses `pg_stat_database` to
+    /// track short-lived connections as user activity.
+    ActivityMonitorExperimental,

-    // This is a special feature flag that is used to represent unknown feature flags.
-    // Basically all unknown to enum flags are represented as this one. See unit test
-    // `parse_unknown_features()` for more details.
+    /// This is a special feature flag that is used to represent unknown feature flags.
+    /// Basically all unknown to enum flags are represented as this one. See unit test
+    /// `parse_unknown_features()` for more details.
    #[serde(other)]
    UnknownFeature,
 }
@@ -282,4 +289,23 @@ mod tests {
        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
    }
+
+    #[test]
+    fn parse_known_features() {
+        // Test that we can properly parse known feature flags.
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
+        let ob = json.as_object_mut().unwrap();
+
+        // Add known feature flags.
+        let features = vec!["activity_monitor_experimental"];
+        ob.insert("features".into(), features.into());
+
+        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
+
+        assert_eq!(
+            spec.features,
+            vec![ComputeFeature::ActivityMonitorExperimental]
+        );
+    }
 }
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -19,6 +19,7 @@ strum.workspace = true
 strum_macros.workspace = true
 hex.workspace = true
 thiserror.workspace = true
+humantime-serde.workspace = true

 workspace_hack.workspace = true

--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -3,6 +3,8 @@ use byteorder::{ByteOrder, BE};
 use serde::{Deserialize, Serialize};
 use std::fmt;

+use crate::reltag::{BlockNumber, RelTag};
+
 /// Key used in the Repository kv-store.
 ///
 /// The Repository treats this as an opaque struct, but see the code in pgdatadir_mapping.rs
@@ -141,8 +143,25 @@ impl Key {
    }
 }

+#[inline(always)]
 pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
+    key.field1 == 0x00 && key.field4 != 0 && key.field6 != 0xffffffff
+}
+
+/// Decompose a relation block key into its [`RelTag`] and block number.
+///
+/// Guaranteed to return `Ok()` if [`is_rel_block_key`] returns `true` for `key`.
+pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
+    if key.field1 != 0x00 {
+        anyhow::bail!("unexpected value kind 0x{:02x}", key.field1);
+    }
+    let rel = RelTag {
+        spcnode: key.field2,
+        dbnode: key.field3,
+        relnode: key.field4,
+        forknum: key.field5,
+    };
+    Ok((rel, key.field6))
 }

 impl std::str::FromStr for Key {
--- a/libs/pageserver_api/src/keyspace.rs
+++ b/libs/pageserver_api/src/keyspace.rs
@@ -114,16 +114,21 @@ impl KeySpaceAccum {
        }
    }

+    #[inline(always)]
    pub fn add_key(&mut self, key: Key) {
        self.add_range(singleton_range(key))
    }

+    #[inline(always)]
    pub fn add_range(&mut self, range: Range<Key>) {
        match self.accum.as_mut() {
            Some(accum) => {
                if range.start == accum.end {
                    accum.end = range.end;
                } else {
+                    // TODO: to efficiently support small sharding stripe sizes, we should avoid starting
+                    // a new range here if the skipped region was all keys that don't belong on this shard.
+                    // (https://github.com/neondatabase/neon/issues/6247)
                    assert!(range.start > accum.end);
                    self.ranges.push(accum.clone());
                    *accum = range;
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -2,9 +2,9 @@ pub mod partitioning;

 use std::{
    collections::HashMap,
-    io::Read,
+    io::{BufRead, Read},
    num::{NonZeroU64, NonZeroUsize},
-    time::SystemTime,
+    time::{Duration, SystemTime},
 };

 use byteorder::{BigEndian, ReadBytesExt};
@@ -18,7 +18,10 @@ use utils::{
    lsn::Lsn,
 };

-use crate::{reltag::RelTag, shard::TenantShardId};
+use crate::{
+    reltag::RelTag,
+    shard::{ShardCount, ShardStripeSize, TenantShardId},
+};
 use anyhow::bail;
 use bytes::{Buf, BufMut, Bytes, BytesMut};

@@ -188,6 +191,31 @@ pub struct TimelineCreateRequest {
    pub pg_version: Option<u32>,
 }

+/// Tenant-wide sharding parameters (shard count and stripe size).  Used during tenant creation.
+#[derive(Serialize, Deserialize, Debug)]
+#[serde(deny_unknown_fields)]
+pub struct ShardParameters {
+    pub count: ShardCount,
+    pub stripe_size: ShardStripeSize,
+}
+
+impl ShardParameters {
+    pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
+    /// True when the shard count is zero, i.e. the tenant is not split into shards.
+    pub fn is_unsharded(&self) -> bool {
+        self.count == ShardCount(0)
+    }
+}
+
+impl Default for ShardParameters {
+    fn default() -> Self {
+        Self {
+            count: ShardCount(0),
+            stripe_size: Self::DEFAULT_STRIPE_SIZE,
+        }
+    }
+}
+
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantCreateRequest {
@@ -195,6 +223,12 @@ pub struct TenantCreateRequest {
    #[serde(default)]
    #[serde(skip_serializing_if = "Option::is_none")]
    pub generation: Option<u32>,
+
+    // If omitted, create a single shard with TenantShardId::unsharded()
+    #[serde(default)]
+    #[serde(skip_serializing_if = "ShardParameters::is_unsharded")]
+    pub shard_parameters: ShardParameters,
+
    #[serde(flatten)]
    pub config: TenantConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -217,7 +251,7 @@ impl std::ops::Deref for TenantCreateRequest {

 /// An alternative representation of `pageserver::tenant::TenantConf` with
 /// simpler types.
-#[derive(Serialize, Deserialize, Debug, Default)]
+#[derive(Serialize, Deserialize, Debug, Default, Clone, Eq, PartialEq)]
 pub struct TenantConfig {
    pub checkpoint_distance: Option<u64>,
    pub checkpoint_timeout: Option<String>,
@@ -232,21 +266,41 @@ pub struct TenantConfig {
    pub lagging_wal_timeout: Option<String>,
    pub max_lsn_wal_lag: Option<NonZeroU64>,
    pub trace_read_requests: Option<bool>,
-    // We defer the parsing of the eviction_policy field to the request handler.
-    // Otherwise we'd have to move the types for eviction policy into this package.
-    // We might do that once the eviction feature has stabilizied.
-    // For now, this field is not even documented in the openapi_spec.yml.
-    pub eviction_policy: Option<serde_json::Value>,
+    pub eviction_policy: Option<EvictionPolicy>,
    pub min_resident_size_override: Option<u64>,
    pub evictions_low_residence_duration_metric_threshold: Option<String>,
    pub gc_feedback: Option<bool>,
    pub heatmap_period: Option<String>,
 }

+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "kind")]
+pub enum EvictionPolicy {
+    NoEviction,
+    LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
+}
+
+impl EvictionPolicy {
+    pub fn discriminant_str(&self) -> &'static str {
+        match self {
+            EvictionPolicy::NoEviction => "NoEviction",
+            EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
+        }
+    }
+}
+/// Settings of the `LayerAccessThreshold` policy; durations are (de)serialized with humantime_serde.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+pub struct EvictionPolicyLayerAccessThreshold {
+    #[serde(with = "humantime_serde")]
+    pub period: Duration,
+    #[serde(with = "humantime_serde")]
+    pub threshold: Duration,
+}
+
 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
 /// lists out all possible states (and the virtual "Detached" state)
 /// in a flat form rather than using rust-style enums.
-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
 pub enum LocationConfigMode {
    AttachedSingle,
    AttachedMulti,
@@ -255,19 +309,21 @@ pub enum LocationConfigMode {
    Detached,
 }

-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
 pub struct LocationConfigSecondary {
    pub warm: bool,
 }

 /// An alternative representation of `pageserver::tenant::LocationConf`,
 /// for use in external-facing APIs.
-#[derive(Serialize, Deserialize, Debug)]
+#[derive(Serialize, Deserialize, Debug, Clone, Eq, PartialEq)]
 pub struct LocationConfig {
    pub mode: LocationConfigMode,
    /// If attaching, in what generation?
    #[serde(default)]
    pub generation: Option<u32>,
+
+    // If requesting mode `Secondary`, configuration for that.
    #[serde(default)]
    pub secondary_conf: Option<LocationConfigSecondary>,

@@ -280,11 +336,17 @@ pub struct LocationConfig {
    #[serde(default)]
    pub shard_stripe_size: u32,

-    // If requesting mode `Secondary`, configuration for that.
-    // Custom storage configuration for the tenant, if any
+    // This configuration only affects attached mode, but should be provided irrespective
+    // of the mode, as a secondary location might transition on startup if the response
+    // to the `/re-attach` control plane API requests it.
    pub tenant_conf: TenantConfig,
 }

+#[derive(Serialize, Deserialize)]
+pub struct LocationConfigListResponse {
+    pub tenant_shards: Vec<(TenantShardId, Option<LocationConfig>)>,
+}
+
 #[derive(Serialize, Deserialize)]
 #[serde(transparent)]
 pub struct TenantCreateResponse(pub TenantId);
@@ -297,7 +359,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    pub tenant_id: TenantId,
+    pub tenant_id: TenantShardId,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
@@ -368,6 +430,8 @@ pub struct TenantInfo {
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
    pub attachment_status: TenantAttachmentStatus,
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub generation: Option<u32>,
 }

 #[derive(Serialize, Deserialize, Clone)]
@@ -658,6 +722,17 @@ pub struct PagestreamDbSizeResponse {
    pub db_size: i64,
 }

+/// This is a cut-down version of TenantHistorySize from the pageserver crate, omitting fields
+/// that require pageserver-internal types.  It is sufficient to get the total size.
+#[derive(Serialize, Deserialize, Debug)]
+pub struct TenantHistorySize {
+    pub id: TenantId,
+    /// Size is a mixture of WAL and logical size, so the unit is bytes.
+    ///
+    /// Will be `None` if `?inputs_only=true` was given.
+    pub size: Option<u64>,
+}
+
 impl PagestreamFeMessage {
    pub fn serialize(&self) -> Bytes {
        let mut bytes = BytesMut::new();
@@ -813,9 +888,10 @@ impl PagestreamBeMessage {
                    PagestreamBeMessage::GetPage(PagestreamGetPageResponse { page: page.into() })
                }
                Tag::Error => {
-                    let buf = buf.get_ref();
-                    let cstr = std::ffi::CStr::from_bytes_until_nul(buf)?;
-                    let rust_str = cstr.to_str()?;
+                    let mut msg = Vec::new();
+                    buf.read_until(0, &mut msg)?;
+                    let cstring = std::ffi::CString::from_vec_with_nul(msg)?;
+                    let rust_str = cstring.to_str()?;
                    PagestreamBeMessage::Error(PagestreamErrorResponse {
                        message: rust_str.to_owned(),
                    })
@@ -909,6 +985,7 @@ mod tests {
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
+            generation: None,
        };
        let expected_active = json!({
            "id": original_active.id.to_string(),
@@ -929,6 +1006,7 @@ mod tests {
            },
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
+            generation: None,
        };
        let expected_broken = json!({
            "id": original_broken.id.to_string(),
--- a/libs/pageserver_api/src/reltag.rs
+++ b/libs/pageserver_api/src/reltag.rs
@@ -32,6 +32,9 @@ pub struct RelTag {
    pub relnode: Oid,
 }

+/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
+pub type BlockNumber = u32;
+
 impl PartialOrd for RelTag {
    fn partial_cmp(&self, other: &Self) -> Option<Ordering> {
        Some(self.cmp(other))
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,6 +1,9 @@
 use std::{ops::RangeInclusive, str::FromStr};

-use crate::key::{is_rel_block_key, Key};
+use crate::{
+    key::{is_rel_block_key, Key},
+    models::ShardParameters,
+};
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
 use thiserror;
@@ -85,6 +88,12 @@ impl TenantShardId {
    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }
+    pub fn to_index(&self) -> ShardIndex {
+        ShardIndex {
+            shard_number: self.shard_number,
+            shard_count: self.shard_count,
+        }
+    }
 }

 /// Formatting helper
@@ -333,7 +342,7 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 pub struct ShardIdentity {
    pub number: ShardNumber,
    pub count: ShardCount,
-    stripe_size: ShardStripeSize,
+    pub stripe_size: ShardStripeSize,
    layout: ShardLayout,
 }

@@ -403,6 +412,17 @@ impl ShardIdentity {
        }
    }

+    /// Build the identity of shard `number` for a newly created tenant, taking the shard count
+    /// and stripe size from the tenant-wide `params`.  The layout is always `LAYOUT_V1`.
+    pub fn from_params(number: ShardNumber, params: &ShardParameters) -> Self {
+        Self {
+            number,
+            count: params.count,
+            stripe_size: params.stripe_size,
+            layout: LAYOUT_V1,
+        }
+    }
+
    fn is_broken(&self) -> bool {
        self.layout == LAYOUT_BROKEN
    }
@@ -422,6 +442,21 @@ impl ShardIdentity {
        }
    }

+    /// Whether `key` may be dropped when it is encountered in this shard's data store
+    /// (for example while compacting after a shard split).
+    ///
+    /// A key is disposable only if it is neither shard-0 content nor local to this shard.
+    pub fn is_key_disposable(&self, key: &Key) -> bool {
+        // Shard-0 keys are never disposable, even on other shards: WAL ingestion
+        // currently writes some shard-0 content on every shard although it is only
+        // read on shard 0, and dropping it would make later WAL ingest to those
+        // keys fail.
+        let shard0_content = key_is_shard0(key);
+        // `&&` short-circuits, so locality is only consulted for non-shard-0 keys,
+        // exactly as a two-branch if/else would.
+        !shard0_content && !self.is_key_local(key)
+    }
+
    pub fn shard_slug(&self) -> String {
        if self.count > ShardCount(0) {
            format!("-{:02x}{:02x}", self.number.0, self.count.0)
@@ -515,12 +550,7 @@ fn key_is_shard0(key: &Key) -> bool {
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
-    //
-    // In this condition:
-    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
-    // all metadata.
-    // - field6 is set to -1 for relation size pages.
-    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
+    !is_rel_block_key(key)
 }

 /// Provide the same result as the function in postgres `hashfn.h` with the same name
--- a/libs/postgres_backend/src/lib.rs
+++ b/libs/postgres_backend/src/lib.rs
@@ -35,6 +35,12 @@ pub enum QueryError {
    /// We were instructed to shutdown while processing the query
    #[error("Shutting down")]
    Shutdown,
+    /// Query handler indicated that client should reconnect
+    #[error("Server requested reconnect")]
+    Reconnect,
+    /// Query named an entity that was not found
+    #[error("Not found: {0}")]
+    NotFound(std::borrow::Cow<'static, str>),
    /// Authentication failure
    #[error("Unauthorized: {0}")]
    Unauthorized(std::borrow::Cow<'static, str>),
@@ -54,9 +60,9 @@ impl From<io::Error> for QueryError {
 impl QueryError {
    pub fn pg_error_code(&self) -> &'static [u8; 5] {
        match self {
-            Self::Disconnected(_) | Self::SimulatedConnectionError => b"08006", // connection failure
+            Self::Disconnected(_) | Self::SimulatedConnectionError | Self::Reconnect => b"08006", // connection failure
            Self::Shutdown => SQLSTATE_ADMIN_SHUTDOWN,
-            Self::Unauthorized(_) => SQLSTATE_INTERNAL_ERROR,
+            Self::Unauthorized(_) | Self::NotFound(_) => SQLSTATE_INTERNAL_ERROR,
            Self::Other(_) => SQLSTATE_INTERNAL_ERROR, // internal error
        }
    }
@@ -425,6 +431,11 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> PostgresBackend<IO> {
                info!("Stopped due to shutdown");
                Ok(())
            }
+            Err(QueryError::Reconnect) => {
+                // Dropping out of this loop implicitly disconnects
+                info!("Stopped due to handler reconnect request");
+                Ok(())
+            }
            Err(QueryError::Disconnected(e)) => {
                info!("Disconnected ({e:#})");
                // Disconnection is not an error: we just use it that way internally to drop
@@ -974,7 +985,9 @@ impl<'a, IO: AsyncRead + AsyncWrite + Unpin> AsyncWrite for CopyDataWriter<'a, I
 pub fn short_error(e: &QueryError) -> String {
    match e {
        QueryError::Disconnected(connection_error) => connection_error.to_string(),
+        QueryError::Reconnect => "reconnect".to_string(),
        QueryError::Shutdown => "shutdown".to_string(),
+        QueryError::NotFound(_) => "not found".to_string(),
        QueryError::Unauthorized(_e) => "JWT authentication error".to_string(),
        QueryError::SimulatedConnectionError => "simulated connection error".to_string(),
        QueryError::Other(e) => format!("{e:#}"),
@@ -996,9 +1009,15 @@ fn log_query_error(query: &str, e: &QueryError) {
        QueryError::SimulatedConnectionError => {
            error!("query handler for query '{query}' failed due to a simulated connection error")
        }
+        QueryError::Reconnect => {
+            info!("query handler for '{query}' requested client to reconnect")
+        }
        QueryError::Shutdown => {
            info!("query handler for '{query}' cancelled during tenant shutdown")
        }
+        QueryError::NotFound(reason) => {
+            info!("query handler for '{query}' entity not found: {reason}")
+        }
        QueryError::Unauthorized(e) => {
            warn!("query handler for '{query}' failed with authentication error: {e}");
        }
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -5,7 +5,9 @@ use std::collections::HashMap;
 use std::env;
 use std::num::NonZeroU32;
 use std::pin::Pin;
+use std::str::FromStr;
 use std::sync::Arc;
+use std::time::Duration;

 use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
 use anyhow::Result;
@@ -13,12 +15,14 @@ use azure_core::request_options::{MaxResults, Metadata, Range};
 use azure_core::RetryOptions;
 use azure_identity::DefaultAzureCredential;
 use azure_storage::StorageCredentials;
+use azure_storage_blobs::blob::CopyStatus;
 use azure_storage_blobs::prelude::ClientBuilder;
 use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
 use bytes::Bytes;
 use futures::stream::Stream;
 use futures_util::StreamExt;
-use http_types::StatusCode;
+use http_types::{StatusCode, Url};
+use tokio::time::Instant;
 use tracing::debug;

 use crate::s3_bucket::RequestKind;
@@ -322,6 +326,51 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(())
    }
+
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Copy).await;
+        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
+
+        let source_url = format!(
+            "{}/{}",
+            self.client.url()?,
+            self.relative_path_to_name(from)
+        );
+        let builder = blob_client.copy(Url::from_str(&source_url)?);
+
+        let result = builder.into_future().await?;
+
+        let mut copy_status = result.copy_status;
+        let start_time = Instant::now();
+        const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
+        loop {
+            match copy_status {
+                CopyStatus::Aborted => {
+                    anyhow::bail!("Received abort for copy from {from} to {to}.");
+                }
+                CopyStatus::Failed => {
+                    anyhow::bail!("Received failure response for copy from {from} to {to}.");
+                }
+                CopyStatus::Success => return Ok(()),
+                CopyStatus::Pending => (),
+            }
+            // Still pending: sleep 1s, then re-poll. TODO: derive the delay from copy_progress.
+            tokio::time::sleep(Duration::from_millis(1000)).await;
+            let properties = blob_client.get_properties().into_future().await?;
+            let Some(status) = properties.blob.properties.copy_status else {
+                tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
+                return Ok(());
+            };
+            // Time out only if still pending; a terminal status is handled at the top of the loop.
+            if matches!(status, CopyStatus::Pending) && start_time.elapsed() > MAX_WAIT_TIME {
+                anyhow::bail!("Copy from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_progress={:?}.",
+                    MAX_WAIT_TIME.as_secs_f32(),
+                    properties.blob.properties.copy_progress,
+                );
+            }
+            copy_status = status;
+        }
+    }
 }

 pin_project_lite::pin_project! {
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -207,6 +207,9 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;

    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
+
+    /// Copy a remote object inside a bucket from one path to another.
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
 }

 pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
@@ -374,6 +377,15 @@ impl GenericRemoteStorage {
            Self::Unreliable(s) => s.delete_objects(paths).await,
        }
    }
+
+    pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        match self {
+            Self::LocalFs(s) => s.copy(from, to).await,
+            Self::AwsS3(s) => s.copy(from, to).await,
+            Self::AzureBlob(s) => s.copy(from, to).await,
+            Self::Unreliable(s) => s.copy(from, to).await,
+        }
+    }
 }

 impl GenericRemoteStorage {
@@ -660,6 +672,7 @@ impl ConcurrencyLimiter {
            RequestKind::Put => &self.write,
            RequestKind::List => &self.read,
            RequestKind::Delete => &self.write,
+            RequestKind::Copy => &self.write,
        }
    }

--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -409,6 +409,20 @@ impl RemoteStorage for LocalFs {
        }
        Ok(())
    }
+
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        // Both remote paths are resolved against the local storage root.
+        let from_path = from.with_base(&self.storage_root);
+        let to_path = to.with_base(&self.storage_root);
+        // Prepare the destination with `create_target_directory` before copying.
+        create_target_directory(&to_path).await?;
+
+        // The value returned by `fs::copy` is discarded; only failures are reported.
+        fs::copy(&from_path, &to_path)
+            .await
+            .map(drop)
+            .with_context(|| format!("Failed to copy file from '{from_path}' to '{to_path}'"))
+    }
 }

 fn storage_metadata_path(original_path: &Utf8Path) -> Utf8PathBuf {
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -493,6 +493,38 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }

+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        let kind = RequestKind::Copy;
+        let _guard = self.permit(kind).await;
+
+        let started_at = start_measuring_requests(kind);
+        // NOTE(review): S3 documents the copy source as URL-encoded; confirm keys never need it.
+        // The copy source must be given as `<bucket_name>/<object key>`, hence the bucket prefix.
+        let copy_source = format!(
+            "{}/{}",
+            self.bucket_name,
+            self.relative_path_to_s3_object(from)
+        );
+
+        let res = self
+            .client
+            .copy_object()
+            .bucket(self.bucket_name.clone())
+            .key(self.relative_path_to_s3_object(to))
+            .copy_source(copy_source)
+            .send()
+            .await;
+        // Take the value back out of the measurement guard and record the metric with the result.
+        let started_at = ScopeGuard::into_inner(started_at);
+        metrics::BUCKET_METRICS
+            .req_seconds
+            .observe_elapsed(kind, &res, started_at);
+
+        res?;
+
+        Ok(())
+    }
+
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        // if prefix is not none then download file `prefix/from`
        // if prefix is none then download file `from`
--- a/libs/remote_storage/src/s3_bucket/metrics.rs
+++ b/libs/remote_storage/src/s3_bucket/metrics.rs
@@ -11,6 +11,7 @@ pub(crate) enum RequestKind {
    Put = 1,
    Delete = 2,
    List = 3,
+    Copy = 4,
 }

 use RequestKind::*;
@@ -22,6 +23,7 @@ impl RequestKind {
            Put => "put_object",
            Delete => "delete_object",
            List => "list_objects",
+            Copy => "copy_object",
        }
    }
    const fn as_index(&self) -> usize {
@@ -29,7 +31,7 @@ impl RequestKind {
    }
 }

-pub(super) struct RequestTyped<C>([C; 4]);
+pub(super) struct RequestTyped<C>([C; 5]);

 impl<C> RequestTyped<C> {
    pub(super) fn get(&self, kind: RequestKind) -> &C {
@@ -38,8 +40,8 @@ impl<C> RequestTyped<C> {

    fn build_with(mut f: impl FnMut(RequestKind) -> C) -> Self {
        use RequestKind::*;
-        let mut it = [Get, Put, Delete, List].into_iter();
-        let arr = std::array::from_fn::<C, 4, _>(|index| {
+        let mut it = [Get, Put, Delete, List, Copy].into_iter();
+        let arr = std::array::from_fn::<C, 5, _>(|index| {
            let next = it.next().unwrap();
            assert_eq!(index, next.as_index());
            f(next)
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -162,4 +162,11 @@ impl RemoteStorage for UnreliableWrapper {
        }
        Ok(())
    }
+
+    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+        // copy is equivalent to download + upload
+        self.attempt(RemoteOp::Download(from.clone()))?;
+        self.attempt(RemoteOp::Upload(to.clone()))?;
+        self.inner.copy_object(from, to).await
+    }
 }
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -0,0 +1,288 @@
+use anyhow::Context;
+use camino::Utf8Path;
+use remote_storage::RemotePath;
+use std::collections::HashSet;
+use std::sync::Arc;
+use test_context::test_context;
+use tracing::debug;
+
+use crate::common::{download_to_vec, upload_stream, wrap_stream};
+
+use super::{
+    MaybeEnabledStorage, MaybeEnabledStorageWithSimpleTestBlobs, MaybeEnabledStorageWithTestBlobs,
+};
+
+/// Tests that S3 client can list all prefixes, even if the response comes paginated and requires multiple S3 queries.
+/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
+/// See the client creation in [`create_s3_client`] for details on the required env vars.
+/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
+/// default test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
+///
+/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
+/// where
+/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
+/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
+///
+/// Then, verifies that the client does return correct prefixes when queried:
+/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
+/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
+///
+/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
+/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
+/// since current default AWS S3 pagination limit is 1000.
+/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
+///
+/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
+/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
+#[test_context(MaybeEnabledStorageWithTestBlobs)]
+#[tokio::test]
+async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorageWithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledStorageWithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledStorageWithTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("S3 init failed: {e:?}")
+        }
+    };
+
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let expected_remote_prefixes = ctx.remote_prefixes.clone();
+
+    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
+    let root_remote_prefixes = test_client
+        .list_prefixes(None)
+        .await
+        .context("client list root prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
+        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
+    );
+
+    let nested_remote_prefixes = test_client
+        .list_prefixes(Some(&base_prefix))
+        .await
+        .context("client list nested prefixes failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let remote_only_prefixes = nested_remote_prefixes
+        .difference(&expected_remote_prefixes)
+        .collect::<HashSet<_>>();
+    let missing_uploaded_prefixes = expected_remote_prefixes
+        .difference(&nested_remote_prefixes)
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
+        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
+    );
+
+    Ok(())
+}
+
+/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requires multiple S3 queries.
+/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
+/// See `pagination_should_work` for more information.
+///
+/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
+/// Then performs the following queries:
+///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
+///    2. `list_files("folder1")`. This should return all files `random_prefix/folder1/blob_{i}.txt`
+#[test_context(MaybeEnabledStorageWithSimpleTestBlobs)]
+#[tokio::test]
+async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorageWithSimpleTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledStorageWithSimpleTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledStorageWithSimpleTestBlobs::UploadsFailed(e, _) => {
+            anyhow::bail!("S3 init failed: {e:?}")
+        }
+    };
+    let test_client = Arc::clone(&ctx.enabled.client);
+    let base_prefix =
+        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
+    let root_files = test_client
+        .list_files(None)
+        .await
+        .context("client list root files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    assert_eq!(
+        root_files,
+        ctx.remote_blobs.clone(),
+        "remote storage list_files on root mismatches with the uploads."
+    );
+    let nested_remote_files = test_client
+        .list_files(Some(&base_prefix))
+        .await
+        .context("client list nested files failure")?
+        .into_iter()
+        .collect::<HashSet<_>>();
+    let trim_remote_blobs: HashSet<_> = ctx
+        .remote_blobs
+        .iter()
+        .map(|x| x.get_path())
+        .filter(|x| x.starts_with("folder1"))
+        .map(|x| RemotePath::new(x).expect("must be valid path"))
+        .collect();
+    assert_eq!(
+        nested_remote_files, trim_remote_blobs,
+        "remote storage list_files on subdirrectory mismatches with the uploads."
+    );
+    Ok(())
+}
+
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorage::Enabled(ctx) => ctx,
+        MaybeEnabledStorage::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledStorage::Enabled(ctx) => ctx,
+        MaybeEnabledStorage::Disabled => return Ok(()),
+    };
+
+    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
+
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
+
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
+
+    ctx.client.delete_objects(&[path1, path2]).await?;
+
+    let prefixes = ctx.client.list_prefixes(None).await?;
+
+    assert_eq!(prefixes.len(), 1);
+
+    ctx.client.delete_objects(&[path3]).await?;
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
+    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
+        return Ok(());
+    };
+
+    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
+        .with_context(|| "RemotePath conversion")?;
+
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
+
+    let (data, len) = wrap_stream(orig.clone());
+
+    ctx.client.upload(data, len, &path, None).await?;
+
+    // Normal download request
+    let dl = ctx.client.download(&path).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // Full range (end specified)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, Some(len as u64))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    // partial range (end specified)
+    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..10]);
+
+    // partial range (end beyond real end)
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
+        .await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[8..]);
+
+    // Partial range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig[4..]);
+
+    // Full range (end unspecified)
+    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete(&path)
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
+    Ok(())
+}
+
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
+    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
+        return Ok(());
+    };
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/file_to_copy", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+    let path_dest = RemotePath::new(Utf8Path::new(
+        format!("{}/file_dest", ctx.base_prefix).as_str(),
+    ))
+    .with_context(|| "RemotePath conversion")?;
+
+    let orig = bytes::Bytes::from_static("remote blob data content".as_bytes());
+
+    let (data, len) = wrap_stream(orig.clone());
+
+    ctx.client.upload(data, len, &path, None).await?;
+
+    // Copy the uploaded object to the destination path
+    ctx.client.copy_object(&path, &path_dest).await?;
+
+    let dl = ctx.client.download(&path_dest).await?;
+    let buf = download_to_vec(dl).await?;
+    assert_eq!(&buf, &orig);
+
+    debug!("Cleanup: deleting file at path {path:?}");
+    ctx.client
+        .delete_objects(&[path.clone(), path_dest.clone()])
+        .await
+        .with_context(|| format!("{path:?} removal"))?;
+
+    Ok(())
+}
--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -6,263 +6,23 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
-use camino::Utf8Path;
 use remote_storage::{
    AzureConfig, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
 };
-use test_context::{test_context, AsyncTestContext};
-use tracing::{debug, info};
+use test_context::AsyncTestContext;
+use tracing::info;

 mod common;

-use common::{
-    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
-    upload_stream, wrap_stream,
-};
+#[path = "common/tests.rs"]
+mod tests_azure;
+
+use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};

 const ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_AZURE_REMOTE_STORAGE";

 const BASE_PREFIX: &str = "test";

-/// Tests that the Azure client can list all prefixes, even if the response comes paginated and requires multiple HTTP queries.
-/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified.
-/// See the client creation in [`create_azure_client`] for details on the required env vars.
-/// If real Azure tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
-/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
-///
-/// First, the test creates a set of Azure blobs with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
-/// where
-/// * `random_prefix_part` is set for the entire Azure client during the Azure client creation in [`create_azure_client`], to avoid multiple test runs interference
-/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
-///
-/// Then, verifies that the client does return correct prefixes when queried:
-/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
-/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
-///
-/// With the real Azure enabled and `#[cfg(test)]` Rust configuration used, the Azure client test adds a `max-keys` param to limit the response keys.
-/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to Azure.
-///
-/// Lastly, the test attempts to clean up and remove all uploaded Azure files.
-/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledAzureWithTestBlobs)]
-#[tokio::test]
-async fn azure_pagination_should_work(
-    ctx: &mut MaybeEnabledAzureWithTestBlobs,
-) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzureWithTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledAzureWithTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledAzureWithTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("Azure init failed: {e:?}")
-        }
-    };
-
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let expected_remote_prefixes = ctx.remote_prefixes.clone();
-
-    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
-        .context("common_prefix construction")?;
-    let root_remote_prefixes = test_client
-        .list_prefixes(None)
-        .await
-        .context("client list root prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
-        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
-    );
-
-    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix))
-        .await
-        .context("client list nested prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let remote_only_prefixes = nested_remote_prefixes
-        .difference(&expected_remote_prefixes)
-        .collect::<HashSet<_>>();
-    let missing_uploaded_prefixes = expected_remote_prefixes
-        .difference(&nested_remote_prefixes)
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
-        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
-    );
-
-    Ok(())
-}
-
-/// Tests that Azure client can list all files in a folder, even if the response comes paginated and requirees multiple Azure queries.
-/// Uses real Azure and requires [`ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME`] and related Azure cred env vars specified. Test will skip real code and pass if env vars not set.
-/// See `Azure_pagination_should_work` for more information.
-///
-/// First, create a set of Azure objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
-/// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
-#[test_context(MaybeEnabledAzureWithSimpleTestBlobs)]
-#[tokio::test]
-async fn azure_list_files_works(
-    ctx: &mut MaybeEnabledAzureWithSimpleTestBlobs,
-) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzureWithSimpleTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledAzureWithSimpleTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledAzureWithSimpleTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("Azure init failed: {e:?}")
-        }
-    };
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let base_prefix =
-        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
-    let root_files = test_client
-        .list_files(None)
-        .await
-        .context("client list root files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_files,
-        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
-    );
-    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix))
-        .await
-        .context("client list nested files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let trim_remote_blobs: HashSet<_> = ctx
-        .remote_blobs
-        .iter()
-        .map(|x| x.get_path())
-        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(x).expect("must be valid path"))
-        .collect();
-    assert_eq!(
-        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
-    );
-    Ok(())
-}
-
-#[test_context(MaybeEnabledAzure)]
-#[tokio::test]
-async fn azure_delete_non_exising_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzure::Enabled(ctx) => ctx,
-        MaybeEnabledAzure::Disabled => return Ok(()),
-    };
-
-    let path = RemotePath::new(Utf8Path::new(
-        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-
-    ctx.client.delete(&path).await.expect("should succeed");
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledAzure)]
-#[tokio::test]
-async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledAzure::Enabled(ctx) => ctx,
-        MaybeEnabledAzure::Disabled => return Ok(()),
-    };
-
-    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
-
-    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
-
-    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
-
-    ctx.client.delete_objects(&[path1, path2]).await?;
-
-    let prefixes = ctx.client.list_prefixes(None).await?;
-
-    assert_eq!(prefixes.len(), 1);
-
-    ctx.client.delete_objects(&[path3]).await?;
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledAzure)]
-#[tokio::test]
-async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Result<()> {
-    let MaybeEnabledAzure::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-
-    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
-
-    let (data, len) = wrap_stream(orig.clone());
-
-    ctx.client.upload(data, len, &path, None).await?;
-
-    // Normal download request
-    let dl = ctx.client.download(&path).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // Full range (end specified)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 0, Some(len as u64))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // partial range (end specified)
-    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..10]);
-
-    // partial range (end beyond real end)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 8, Some(len as u64 * 100))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[8..]);
-
-    // Partial range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..]);
-
-    // Full range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    debug!("Cleanup: deleting file at path {path:?}");
-    ctx.client
-        .delete(&path)
-        .await
-        .with_context(|| format!("{path:?} removal"))?;
-
-    Ok(())
-}
-
 struct EnabledAzure {
    client: Arc<GenericRemoteStorage>,
    base_prefix: &'static str,
@@ -281,13 +41,13 @@ impl EnabledAzure {
    }
 }

-enum MaybeEnabledAzure {
+enum MaybeEnabledStorage {
    Enabled(EnabledAzure),
    Disabled,
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledAzure {
+impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();

@@ -303,7 +63,7 @@ impl AsyncTestContext for MaybeEnabledAzure {
    }
 }

-enum MaybeEnabledAzureWithTestBlobs {
+enum MaybeEnabledStorageWithTestBlobs {
    Enabled(AzureWithTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, AzureWithTestBlobs),
@@ -316,7 +76,7 @@ struct AzureWithTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
+impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -367,7 +127,7 @@ impl AsyncTestContext for MaybeEnabledAzureWithTestBlobs {
 // However, they are not idential. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
-enum MaybeEnabledAzureWithSimpleTestBlobs {
+enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(AzureWithSimpleTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, AzureWithSimpleTestBlobs),
@@ -378,7 +138,7 @@ struct AzureWithSimpleTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledAzureWithSimpleTestBlobs {
+impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_AZURE_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -6,259 +6,23 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;

 use anyhow::Context;
-use camino::Utf8Path;
 use remote_storage::{
    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
-use test_context::{test_context, AsyncTestContext};
-use tracing::{debug, info};
+use test_context::AsyncTestContext;
+use tracing::info;

 mod common;

-use common::{
-    cleanup, download_to_vec, ensure_logging_ready, upload_remote_data, upload_simple_remote_data,
-    upload_stream, wrap_stream,
-};
+#[path = "common/tests.rs"]
+mod tests_s3;
+
+use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_remote_data};

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";

 const BASE_PREFIX: &str = "test";

-/// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
-/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
-/// See the client creation in [`create_s3_client`] for details on the required env vars.
-/// If real S3 tests are disabled, the test passes, skipping any real test run: currently, there's no way to mark the test ignored in runtime with the
-/// deafult test framework, see https://github.com/rust-lang/rust/issues/68007 for details.
-///
-/// First, the test creates a set of S3 objects with keys `/${random_prefix_part}/${base_prefix_str}/sub_prefix_${i}/blob_${i}` in [`upload_remote_data`]
-/// where
-/// * `random_prefix_part` is set for the entire S3 client during the S3 client creation in [`create_s3_client`], to avoid multiple test runs interference
-/// * `base_prefix_str` is a common prefix to use in the client requests: we would want to ensure that the client is able to list nested prefixes inside the bucket
-///
-/// Then, verifies that the client does return correct prefixes when queried:
-/// * with no prefix, it lists everything after its `${random_prefix_part}/` — that should be `${base_prefix_str}` value only
-/// * with `${base_prefix_str}/` prefix, it lists every `sub_prefix_${i}`
-///
-/// With the real S3 enabled and `#[cfg(test)]` Rust configuration used, the S3 client test adds a `max-keys` param to limit the response keys.
-/// This way, we are able to test the pagination implicitly, by ensuring all results are returned from the remote storage and avoid uploading too many blobs to S3,
-/// since current default AWS S3 pagination limit is 1000.
-/// (see https://docs.aws.amazon.com/AmazonS3/latest/API/API_ListObjectsV2.html#API_ListObjectsV2_RequestSyntax)
-///
-/// Lastly, the test attempts to clean up and remove all uploaded S3 files.
-/// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledS3WithTestBlobs)]
-#[tokio::test]
-async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
-    };
-
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let expected_remote_prefixes = ctx.remote_prefixes.clone();
-
-    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
-        .context("common_prefix construction")?;
-    let root_remote_prefixes = test_client
-        .list_prefixes(None)
-        .await
-        .context("client list root prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_remote_prefixes, HashSet::from([base_prefix.clone()]),
-        "remote storage root prefixes list mismatches with the uploads. Returned prefixes: {root_remote_prefixes:?}"
-    );
-
-    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix))
-        .await
-        .context("client list nested prefixes failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let remote_only_prefixes = nested_remote_prefixes
-        .difference(&expected_remote_prefixes)
-        .collect::<HashSet<_>>();
-    let missing_uploaded_prefixes = expected_remote_prefixes
-        .difference(&nested_remote_prefixes)
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        remote_only_prefixes.len() + missing_uploaded_prefixes.len(), 0,
-        "remote storage nested prefixes list mismatches with the uploads. Remote only prefixes: {remote_only_prefixes:?}, missing uploaded prefixes: {missing_uploaded_prefixes:?}",
-    );
-
-    Ok(())
-}
-
-/// Tests that S3 client can list all files in a folder, even if the response comes paginated and requirees multiple S3 queries.
-/// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. Test will skip real code and pass if env vars not set.
-/// See `s3_pagination_should_work` for more information.
-///
-/// First, create a set of S3 objects with keys `random_prefix/folder{j}/blob_{i}.txt` in [`upload_remote_data`]
-/// Then performs the following queries:
-///    1. `list_files(None)`. This should return all files `random_prefix/folder{j}/blob_{i}.txt`
-///    2. `list_files("folder1")`.  This  should return all files `random_prefix/folder1/blob_{i}.txt`
-#[test_context(MaybeEnabledS3WithSimpleTestBlobs)]
-#[tokio::test]
-async fn s3_list_files_works(ctx: &mut MaybeEnabledS3WithSimpleTestBlobs) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledS3WithSimpleTestBlobs::Enabled(ctx) => ctx,
-        MaybeEnabledS3WithSimpleTestBlobs::Disabled => return Ok(()),
-        MaybeEnabledS3WithSimpleTestBlobs::UploadsFailed(e, _) => {
-            anyhow::bail!("S3 init failed: {e:?}")
-        }
-    };
-    let test_client = Arc::clone(&ctx.enabled.client);
-    let base_prefix =
-        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
-    let root_files = test_client
-        .list_files(None)
-        .await
-        .context("client list root files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    assert_eq!(
-        root_files,
-        ctx.remote_blobs.clone(),
-        "remote storage list_files on root mismatches with the uploads."
-    );
-    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix))
-        .await
-        .context("client list nested files failure")?
-        .into_iter()
-        .collect::<HashSet<_>>();
-    let trim_remote_blobs: HashSet<_> = ctx
-        .remote_blobs
-        .iter()
-        .map(|x| x.get_path())
-        .filter(|x| x.starts_with("folder1"))
-        .map(|x| RemotePath::new(x).expect("must be valid path"))
-        .collect();
-    assert_eq!(
-        nested_remote_files, trim_remote_blobs,
-        "remote storage list_files on subdirrectory mismatches with the uploads."
-    );
-    Ok(())
-}
-
-#[test_context(MaybeEnabledS3)]
-#[tokio::test]
-async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledS3::Enabled(ctx) => ctx,
-        MaybeEnabledS3::Disabled => return Ok(()),
-    };
-
-    let path = RemotePath::new(Utf8Path::new(
-        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
-    ))
-    .with_context(|| "RemotePath conversion")?;
-
-    ctx.client.delete(&path).await.expect("should succeed");
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledS3)]
-#[tokio::test]
-async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
-    let ctx = match ctx {
-        MaybeEnabledS3::Enabled(ctx) => ctx,
-        MaybeEnabledS3::Disabled => return Ok(()),
-    };
-
-    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path2 = RemotePath::new(Utf8Path::new(format!("{}/path2", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
-
-    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
-
-    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
-
-    ctx.client.delete_objects(&[path1, path2]).await?;
-
-    let prefixes = ctx.client.list_prefixes(None).await?;
-
-    assert_eq!(prefixes.len(), 1);
-
-    ctx.client.delete_objects(&[path3]).await?;
-
-    Ok(())
-}
-
-#[test_context(MaybeEnabledS3)]
-#[tokio::test]
-async fn s3_upload_download_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
-    let MaybeEnabledS3::Enabled(ctx) = ctx else {
-        return Ok(());
-    };
-
-    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
-        .with_context(|| "RemotePath conversion")?;
-
-    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
-
-    let (data, len) = wrap_stream(orig.clone());
-
-    ctx.client.upload(data, len, &path, None).await?;
-
-    // Normal download request
-    let dl = ctx.client.download(&path).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // Full range (end specified)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 0, Some(len as u64))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    // partial range (end specified)
-    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..10]);
-
-    // partial range (end beyond real end)
-    let dl = ctx
-        .client
-        .download_byte_range(&path, 8, Some(len as u64 * 100))
-        .await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[8..]);
-
-    // Partial range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig[4..]);
-
-    // Full range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
-    let buf = download_to_vec(dl).await?;
-    assert_eq!(&buf, &orig);
-
-    debug!("Cleanup: deleting file at path {path:?}");
-    ctx.client
-        .delete(&path)
-        .await
-        .with_context(|| format!("{path:?} removal"))?;
-
-    Ok(())
-}
-
 struct EnabledS3 {
    client: Arc<GenericRemoteStorage>,
    base_prefix: &'static str,
@@ -277,13 +41,13 @@ impl EnabledS3 {
    }
 }

-enum MaybeEnabledS3 {
+enum MaybeEnabledStorage {
    Enabled(EnabledS3),
    Disabled,
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledS3 {
+impl AsyncTestContext for MaybeEnabledStorage {
    async fn setup() -> Self {
        ensure_logging_ready();

@@ -299,7 +63,7 @@ impl AsyncTestContext for MaybeEnabledS3 {
    }
 }

-enum MaybeEnabledS3WithTestBlobs {
+enum MaybeEnabledStorageWithTestBlobs {
    Enabled(S3WithTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithTestBlobs),
@@ -312,7 +76,7 @@ struct S3WithTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
+impl AsyncTestContext for MaybeEnabledStorageWithTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
@@ -363,7 +127,7 @@ impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
 // However, they are not idential. The list_prefixes function is concerned with listing prefixes,
 // whereas the list_files function is concerned with listing files.
 // See `RemoteStorage::list_files` documentation for more details
-enum MaybeEnabledS3WithSimpleTestBlobs {
+enum MaybeEnabledStorageWithSimpleTestBlobs {
    Enabled(S3WithSimpleTestBlobs),
    Disabled,
    UploadsFailed(anyhow::Error, S3WithSimpleTestBlobs),
@@ -374,7 +138,7 @@ struct S3WithSimpleTestBlobs {
 }

 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledS3WithSimpleTestBlobs {
+impl AsyncTestContext for MaybeEnabledStorageWithSimpleTestBlobs {
    async fn setup() -> Self {
        ensure_logging_ready();
        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
--- a/libs/safekeeper_api/src/models.rs
+++ b/libs/safekeeper_api/src/models.rs
@@ -51,3 +51,9 @@ pub struct SkTimelineInfo {
    #[serde(default)]
    pub http_connstr: Option<String>,
 }
+
+#[derive(Debug, Clone, Deserialize, Serialize)]
+pub struct TimelineCopyRequest {
+    pub target_timeline_id: TimelineId,
+    pub until_lsn: Lsn,
+}
--- a/libs/utils/src/failpoint_support.rs
+++ b/libs/utils/src/failpoint_support.rs
@@ -15,6 +15,10 @@ use tracing::*;
 /// specified time (in milliseconds). The main difference is that we use async
 /// tokio sleep function. Another difference is that we print lines to the log,
 /// which can be useful in tests to check that the failpoint was hit.
+///
+/// Optionally pass a cancellation token, and this failpoint will drop out of
+/// its sleep when the cancellation token fires.  This is useful for testing
+/// cases where we would like to block something, but test its clean shutdown behavior.
 #[macro_export]
 macro_rules! __failpoint_sleep_millis_async {
    ($name:literal) => {{
@@ -30,6 +34,24 @@ macro_rules! __failpoint_sleep_millis_async {
            $crate::failpoint_support::failpoint_sleep_helper($name, duration_str).await
        }
    }};
+    ($name:literal, $cancel:expr) => {{
+        // If the failpoint is used with a "return" action, set should_sleep to the
+        // returned value (as string). Otherwise it's set to None.
+        let should_sleep = (|| {
+            ::fail::fail_point!($name, |x| x);
+            ::std::option::Option::None
+        })();
+
+        // Sleep if the action was a returned value
+        if let ::std::option::Option::Some(duration_str) = should_sleep {
+            $crate::failpoint_support::failpoint_sleep_cancellable_helper(
+                $name,
+                duration_str,
+                $cancel,
+            )
+            .await
+        }
+    }};
 }
 pub use __failpoint_sleep_millis_async as sleep_millis_async;

@@ -45,6 +67,22 @@ pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
    tracing::info!("failpoint {:?}: sleep done", name);
 }

+// Helper function used by the macro. (A function has nicer scoping so we
+// don't need to decorate everything with "::")
+#[doc(hidden)]
+pub async fn failpoint_sleep_cancellable_helper(
+    name: &'static str,
+    duration_str: String,
+    cancel: &CancellationToken,
+) {
+    let millis = duration_str.parse::<u64>().unwrap();
+    let d = std::time::Duration::from_millis(millis);
+
+    tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+    tokio::time::timeout(d, cancel.cancelled()).await.ok();
+    tracing::info!("failpoint {:?}: sleep done", name);
+}
+
 pub fn init() -> fail::FailScenario<'static> {
    // The failpoints lib provides support for parsing the `FAILPOINTS` env var.
    // We want non-default behavior for `exit`, though, so, we handle it separately.
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -31,6 +31,9 @@ pub enum ApiError {
    #[error("Shutting down")]
    ShuttingDown,

+    #[error("Timeout")]
+    Timeout(Cow<'static, str>),
+
    #[error(transparent)]
    InternalServerError(anyhow::Error),
 }
@@ -67,6 +70,10 @@ impl ApiError {
                err.to_string(),
                StatusCode::SERVICE_UNAVAILABLE,
            ),
+            ApiError::Timeout(err) => HttpErrorBody::response_from_msg_and_status(
+                err.to_string(),
+                StatusCode::REQUEST_TIMEOUT,
+            ),
            ApiError::InternalServerError(err) => HttpErrorBody::response_from_msg_and_status(
                err.to_string(),
                StatusCode::INTERNAL_SERVER_ERROR,
--- a/libs/utils/src/id.rs
+++ b/libs/utils/src/id.rs
@@ -1,3 +1,4 @@
+use std::num::ParseIntError;
 use std::{fmt, str::FromStr};

 use anyhow::Context;
@@ -374,6 +375,13 @@ impl fmt::Display for NodeId {
    }
 }

+impl FromStr for NodeId {
+    type Err = ParseIntError;
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        Ok(NodeId(u64::from_str(s)?))
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use serde_assert::{Deserializer, Serializer, Token, Tokens};
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -85,6 +85,8 @@ pub mod sync;

 pub mod failpoint_support;

+pub mod yielding_loop;
+
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///
 /// we have several cases:
--- a/libs/utils/src/sync/gate.rs
+++ b/libs/utils/src/sync/gate.rs
@@ -15,6 +15,12 @@ pub struct Gate {
    name: String,
 }

+impl std::fmt::Debug for Gate {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "Gate<{}>", self.name)
+    }
+}
+
 /// RAII guard for a [`Gate`]: as long as this exists, calls to [`Gate::close`] will
 /// not complete.
 #[derive(Debug)]
--- a/libs/utils/src/yielding_loop.rs
+++ b/libs/utils/src/yielding_loop.rs
@@ -0,0 +1,35 @@
+use tokio_util::sync::CancellationToken;
+
+#[derive(thiserror::Error, Debug)]
+pub enum YieldingLoopError {
+    #[error("Cancelled")]
+    Cancelled,
+}
+
+/// Helper for long synchronous loops, e.g. over all tenants in the system.  Yields to the
+/// executor after every `interval` items (`interval` must be non-zero) and, after resuming,
+/// returns `Err(Cancelled)` if the provided cancellation token has fired (e.g. on shutdown).
+#[inline(always)]
+pub async fn yielding_loop<I, T, F>(
+    interval: usize,
+    cancel: &CancellationToken,
+    iter: I,
+    mut visitor: F,
+) -> Result<(), YieldingLoopError>
+where
+    I: Iterator<Item = T>,
+    F: FnMut(T),
+{
+    for (i, item) in iter.enumerate() {
+        visitor(item);
+        // Parenthesized because `%` binds tighter than `+`: yield after every `interval` items.
+        if (i + 1) % interval == 0 {
+            tokio::task::yield_now().await;
+            if cancel.is_cancelled() {
+                return Err(YieldingLoopError::Cancelled);
+            }
+        }
+    }
+
+    Ok(())
+}
--- a/libs/vm_monitor/src/runner.rs
+++ b/libs/vm_monitor/src/runner.rs
@@ -446,12 +446,11 @@ impl Runner {
                    if let Some(t) = self.last_upscale_request_at {
                        let elapsed = t.elapsed();
                        if elapsed < Duration::from_secs(1) {
-                            info!(
-                                elapsed_millis = elapsed.as_millis(),
-                                avg_non_reclaimable = bytes_to_mebibytes(cgroup_mem_stat.avg_non_reclaimable),
-                                threshold = bytes_to_mebibytes(cgroup.threshold),
-                                "cgroup memory stats are high enough to upscale but too soon to forward the request, ignoring",
-                            );
+                            // *Ideally* we'd like to log here that we're ignoring the fact the
+                            // memory stats are too high, but in practice this can result in
+                            // spamming the logs with repetitive messages about ignoring the signal
+                            //
+                            // See https://github.com/neondatabase/neon/issues/5865 for more.
                            continue;
                        }
                    }
--- a/libs/walproposer/src/walproposer.rs
+++ b/libs/walproposer/src/walproposer.rs
@@ -425,7 +425,7 @@ mod tests {
        }

        fn log_internal(&self, _wp: &mut crate::bindings::WalProposer, level: Level, msg: &str) {
-            println!("walprop_log[{}] {}", level, msg);
+            println!("wp_log[{}] {}", level, msg);
        }

        fn after_election(&self, _wp: &mut crate::bindings::WalProposer) {
--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -13,6 +13,7 @@ use bytes::{Buf, Bytes};
 use pageserver::{
    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
+use pageserver_api::shard::TenantShardId;
 use utils::{id::TenantId, lsn::Lsn};

 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -26,9 +27,9 @@ fn redo_scenarios(c: &mut Criterion) {

    let conf = PageServerConf::dummy_conf(repo_dir.path().to_path_buf());
    let conf = Box::leak(Box::new(conf));
-    let tenant_id = TenantId::generate();
+    let tenant_shard_id = TenantShardId::unsharded(TenantId::generate());

-    let manager = PostgresRedoManager::new(conf, tenant_id);
+    let manager = PostgresRedoManager::new(conf, tenant_shard_id);

    let manager = Arc::new(manager);

--- a/pageserver/client/src/mgmt_api.rs
+++ b/pageserver/client/src/mgmt_api.rs
@@ -1,5 +1,5 @@
-use pageserver_api::models::*;
-use reqwest::{IntoUrl, Method};
+use pageserver_api::{models::*, shard::TenantShardId};
+use reqwest::{IntoUrl, Method, StatusCode};
 use utils::{
    http::error::HttpErrorBody,
    id::{TenantId, TimelineId},
@@ -22,20 +22,18 @@ pub enum Error {
    #[error("receive error body: {0}")]
    ReceiveErrorBody(String),

-    #[error("pageserver API: {0}")]
-    ApiError(String),
+    #[error("pageserver API: {1}")]
+    ApiError(StatusCode, String),
 }

 pub type Result<T> = std::result::Result<T, Error>;

-#[async_trait::async_trait]
 pub trait ResponseErrorMessageExt: Sized {
-    async fn error_from_body(self) -> Result<Self>;
+    fn error_from_body(self) -> impl std::future::Future<Output = Result<Self>> + Send;
 }

-#[async_trait::async_trait]
 impl ResponseErrorMessageExt for reqwest::Response {
-    async fn error_from_body(mut self) -> Result<Self> {
+    async fn error_from_body(self) -> Result<Self> {
        let status = self.status();
        if !(status.is_client_error() || status.is_server_error()) {
            return Ok(self);
@@ -43,7 +41,7 @@ impl ResponseErrorMessageExt for reqwest::Response {

        let url = self.url().to_owned();
        Err(match self.json::<HttpErrorBody>().await {
-            Ok(HttpErrorBody { msg }) => Error::ApiError(msg),
+            Ok(HttpErrorBody { msg }) => Error::ApiError(status, msg),
            Err(_) => {
                Error::ReceiveErrorBody(format!("Http error ({}) at {}.", status.as_u16(), url))
            }
@@ -51,6 +49,11 @@ impl ResponseErrorMessageExt for reqwest::Response {
    }
 }

+pub enum ForceAwaitLogicalSize {
+    Yes,
+    No,
+}
+
 impl Client {
    pub fn new(mgmt_api_endpoint: String, jwt: Option<&str>) -> Self {
        Self {
@@ -68,9 +71,9 @@ impl Client {

    pub async fn tenant_details(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
    ) -> Result<pageserver_api::models::TenantDetails> {
-        let uri = format!("{}/v1/tenant/{tenant_id}", self.mgmt_api_endpoint);
+        let uri = format!("{}/v1/tenant/{tenant_shard_id}", self.mgmt_api_endpoint);
        self.get(uri)
            .await?
            .json()
@@ -80,9 +83,12 @@ impl Client {

    pub async fn list_timelines(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
    ) -> Result<Vec<pageserver_api::models::TimelineInfo>> {
-        let uri = format!("{}/v1/tenant/{tenant_id}/timeline", self.mgmt_api_endpoint);
+        let uri = format!(
+            "{}/v1/tenant/{tenant_shard_id}/timeline",
+            self.mgmt_api_endpoint
+        );
        self.get(&uri)
            .await?
            .json()
@@ -94,11 +100,18 @@ impl Client {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
+        force_await_logical_size: ForceAwaitLogicalSize,
    ) -> Result<pageserver_api::models::TimelineInfo> {
        let uri = format!(
            "{}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
            self.mgmt_api_endpoint
        );
+
+        let uri = match force_await_logical_size {
+            ForceAwaitLogicalSize::Yes => format!("{}?force-await-logical-size={}", uri, true),
+            ForceAwaitLogicalSize::No => uri,
+        };
+
        self.get(&uri)
            .await?
            .json()
@@ -164,16 +177,28 @@ impl Client {
        Ok(())
    }

+    pub async fn tenant_secondary_download(&self, tenant_id: TenantShardId) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{}/secondary/download",
+            self.mgmt_api_endpoint, tenant_id
+        );
+        self.request(Method::POST, &uri, ()).await?;
+        Ok(())
+    }
+
    pub async fn location_config(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        config: LocationConfig,
        flush_ms: Option<std::time::Duration>,
    ) -> Result<()> {
-        let req_body = TenantLocationConfigRequest { tenant_id, config };
+        let req_body = TenantLocationConfigRequest {
+            tenant_id: tenant_shard_id,
+            config,
+        };
        let path = format!(
            "{}/v1/tenant/{}/location_config",
-            self.mgmt_api_endpoint, tenant_id
+            self.mgmt_api_endpoint, tenant_shard_id
        );
        let path = if let Some(flush_ms) = flush_ms {
            format!("{}?flush_ms={}", path, flush_ms.as_millis())
@@ -184,14 +209,23 @@ impl Client {
        Ok(())
    }

+    pub async fn list_location_config(&self) -> Result<LocationConfigListResponse> {
+        let path = format!("{}/v1/location_config", self.mgmt_api_endpoint);
+        self.request(Method::GET, &path, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
    pub async fn timeline_create(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        req: &TimelineCreateRequest,
    ) -> Result<TimelineInfo> {
        let uri = format!(
            "{}/v1/tenant/{}/timeline",
-            self.mgmt_api_endpoint, tenant_id
+            self.mgmt_api_endpoint, tenant_shard_id
        );
        self.request(Method::POST, &uri, req)
            .await?
@@ -199,4 +233,46 @@ impl Client {
            .await
            .map_err(Error::ReceiveBody)
    }
+
+    pub async fn tenant_reset(&self, tenant_shard_id: TenantShardId) -> Result<()> {
+        let uri = format!(
+            "{}/v1/tenant/{}/reset",
+            self.mgmt_api_endpoint, tenant_shard_id
+        );
+        self.request(Method::POST, &uri, ())
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn timeline_list(
+        &self,
+        tenant_shard_id: &TenantShardId,
+    ) -> Result<Vec<TimelineInfo>> {
+        let uri = format!(
+            "{}/v1/tenant/{}/timeline",
+            self.mgmt_api_endpoint, tenant_shard_id
+        );
+        self.get(&uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
+
+    pub async fn tenant_synthetic_size(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Result<TenantHistorySize> {
+        let uri = format!(
+            "{}/v1/tenant/{}/synthetic_size",
+            self.mgmt_api_endpoint, tenant_shard_id
+        );
+        self.get(&uri)
+            .await?
+            .json()
+            .await
+            .map_err(Error::ReceiveBody)
+    }
 }
--- a/pageserver/client/src/mgmt_api/util.rs
+++ b/pageserver/client/src/mgmt_api/util.rs
@@ -2,6 +2,7 @@

 use std::sync::Arc;

+use pageserver_api::shard::TenantShardId;
 use tokio::task::JoinSet;
 use utils::id::{TenantId, TenantTimelineId};

@@ -31,7 +32,10 @@ pub async fn get_pageserver_tenant_timelines_unsharded(
            async move {
                (
                    tenant_id,
-                    mgmt_api_client.tenant_details(tenant_id).await.unwrap(),
+                    mgmt_api_client
+                        .tenant_details(TenantShardId::unsharded(tenant_id))
+                        .await
+                        .unwrap(),
                )
            }
        });
--- a/pageserver/client/src/page_service.rs
+++ b/pageserver/client/src/page_service.rs
@@ -108,22 +108,38 @@ pub struct RelTagBlockNo {
 }

 impl PagestreamClient {
-    pub async fn shutdown(mut self) {
-        let _ = self.cancel_on_client_drop.take();
-        self.conn_task.await.unwrap();
+    pub async fn shutdown(self) {
+        let Self {
+            copy_both,
+            cancel_on_client_drop: cancel_conn_task,
+            conn_task,
+        } = self;
+        // The `copy_both` contains internal channel sender, the receiver of which is polled by `conn_task`.
+        // When `conn_task` observes the sender has been dropped, it sends a `FeMessage::CopyFail` into the connection.
+        // (see https://github.com/neondatabase/rust-postgres/blob/2005bf79573b8add5cf205b52a2b208e356cc8b0/tokio-postgres/src/copy_both.rs#L56).
+        //
+        // If we drop(copy_both) first, but then immediately drop the `cancel_on_client_drop`,
+        // the CopyFail message only makes it to the socket sometimes (i.e., it's a race).
+        //
+        // Further, the pageserver makes a lot of noise when it receives CopyFail.
+        // Computes don't send it in practice, they just hard-close the connection.
+        //
+        // So, let's behave like the computes and suppress the CopyFail as follows:
+        // kill the socket first, then drop copy_both.
+        //
+        // See also: https://www.postgresql.org/docs/current/protocol-flow.html#PROTOCOL-COPY
+        //
+        // NB: page_service doesn't have a use case to exit the `pagestream` mode currently.
+        // => https://github.com/neondatabase/neon/issues/6390
+        let _ = cancel_conn_task.unwrap();
+        conn_task.await.unwrap();
+        drop(copy_both);
    }

    pub async fn getpage(
        &mut self,
-        key: RelTagBlockNo,
-        lsn: Lsn,
+        req: PagestreamGetPageRequest,
    ) -> anyhow::Result<PagestreamGetPageResponse> {
-        let req = PagestreamGetPageRequest {
-            latest: false,
-            rel: key.rel_tag,
-            blkno: key.block_no,
-            lsn,
-        };
        let req = PagestreamFeMessage::GetPage(req);
        let req: bytes::Bytes = req.serialize();
        // let mut req = tokio_util::io::ReaderStream::new(&req);
--- a/pageserver/pagebench/Cargo.toml
+++ b/pageserver/pagebench/Cargo.toml
@@ -8,6 +8,7 @@ license.workspace = true

 [dependencies]
 anyhow.workspace = true
+camino.workspace = true
 clap.workspace = true
 futures.workspace = true
 hdrhistogram.workspace = true
@@ -18,8 +19,8 @@ serde.workspace = true
 serde_json.workspace = true
 tracing.workspace = true
 tokio.workspace = true
+tokio-util.workspace = true

-pageserver = { path = ".." }
 pageserver_client.workspace = true
 pageserver_api.workspace = true
 utils = { path = "../../libs/utils/" }
--- a/pageserver/pagebench/src/cmd/basebackup.rs
+++ b/pageserver/pagebench/src/cmd/basebackup.rs
@@ -1,4 +1,5 @@
 use anyhow::Context;
+use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
 use pageserver_client::page_service::BasebackupRequest;

 use utils::id::TenantTimelineId;
@@ -92,10 +93,12 @@ async fn main_impl(
    for timeline in &timelines {
        js.spawn({
            let timeline = *timeline;
-            // FIXME: this triggers initial logical size calculation
-            // https://github.com/neondatabase/neon/issues/6168
            let info = mgmt_api_client
-                .timeline_info(timeline.tenant_id, timeline.timeline_id)
+                .timeline_info(
+                    timeline.tenant_id,
+                    timeline.timeline_id,
+                    ForceAwaitLogicalSize::No,
+                )
                .await
                .unwrap();
            async move {
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,10 +1,11 @@
 use anyhow::Context;
+use camino::Utf8PathBuf;
 use futures::future::join_all;
-use pageserver::pgdatadir_mapping::key_to_rel_block;
-use pageserver::repository;
-use pageserver_api::key::is_rel_block_key;
-use pageserver_client::page_service::RelTagBlockNo;
+use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
+use pageserver_api::keyspace::KeySpaceAccum;
+use pageserver_api::models::PagestreamGetPageRequest;

+use tokio_util::sync::CancellationToken;
 use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;

@@ -13,7 +14,7 @@ use tokio::sync::Barrier;
 use tokio::task::JoinSet;
 use tracing::{info, instrument};

-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::future::Future;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
@@ -39,8 +40,17 @@ pub(crate) struct Args {
    runtime: Option<humantime::Duration>,
    #[clap(long)]
    per_target_rate_limit: Option<usize>,
+    /// Probability for sending `latest=true` in the request (uniform distribution).
+    #[clap(long, default_value = "1")]
+    req_latest_probability: f64,
    #[clap(long)]
    limit_to_first_n_targets: Option<usize>,
+    /// For large pageserver installations, enumerating the keyspace takes a lot of time.
+    /// If specified, the specified path is used to maintain a cache of the keyspace enumeration result.
+    /// The cache is tagged and auto-invalided by the tenant/timeline ids only.
+    /// It doesn't get invalidated if the keyspace changes under the hood, e.g., due to new ingested data or compaction.
+    #[clap(long)]
+    keyspace_cache: Option<Utf8PathBuf>,
    targets: Option<Vec<TenantTimelineId>>,
 }

@@ -55,7 +65,7 @@ impl LiveStats {
    }
 }

-#[derive(Clone)]
+#[derive(Clone, serde::Serialize, serde::Deserialize)]
 struct KeyRange {
    timeline: TenantTimelineId,
    timeline_lsn: Lsn,
@@ -103,59 +113,107 @@ async fn main_impl(
    )
    .await?;

-    let mut js = JoinSet::new();
-    for timeline in &timelines {
-        js.spawn({
-            let mgmt_api_client = Arc::clone(&mgmt_api_client);
-            let timeline = *timeline;
-            async move {
-                let partitioning = mgmt_api_client
-                    .keyspace(timeline.tenant_id, timeline.timeline_id)
-                    .await?;
-                let lsn = partitioning.at_lsn;
-
-                let ranges = partitioning
-                    .keys
-                    .ranges
-                    .iter()
-                    .filter_map(|r| {
-                        let start = r.start;
-                        let end = r.end;
-                        // filter out non-relblock keys
-                        match (is_rel_block_key(&start), is_rel_block_key(&end)) {
-                            (true, true) => Some(KeyRange {
-                                timeline,
-                                timeline_lsn: lsn,
-                                start: start.to_i128(),
-                                end: end.to_i128(),
-                            }),
-                            (true, false) | (false, true) => {
-                                unimplemented!("split up range")
+    #[derive(serde::Deserialize)]
+    struct KeyspaceCacheDe {
+        tag: Vec<TenantTimelineId>,
+        data: Vec<KeyRange>,
+    }
+    #[derive(serde::Serialize)]
+    struct KeyspaceCacheSer<'a> {
+        tag: &'a [TenantTimelineId],
+        data: &'a [KeyRange],
+    }
+    let cache = args
+        .keyspace_cache
+        .as_ref()
+        .map(|keyspace_cache_file| {
+            let contents = match std::fs::read(keyspace_cache_file) {
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                    return anyhow::Ok(None);
+                }
+                x => x.context("read keyspace cache file")?,
+            };
+            let cache: KeyspaceCacheDe =
+                serde_json::from_slice(&contents).context("deserialize cache file")?;
+            let tag_ok = HashSet::<TenantTimelineId>::from_iter(cache.tag.into_iter())
+                == HashSet::from_iter(timelines.iter().cloned());
+            info!("keyspace cache file matches tag: {tag_ok}");
+            anyhow::Ok(if tag_ok { Some(cache.data) } else { None })
+        })
+        .transpose()?
+        .flatten();
+    let all_ranges: Vec<KeyRange> = if let Some(cached) = cache {
+        info!("using keyspace cache file");
+        cached
+    } else {
+        let mut js = JoinSet::new();
+        for timeline in &timelines {
+            js.spawn({
+                let mgmt_api_client = Arc::clone(&mgmt_api_client);
+                let timeline = *timeline;
+                async move {
+                    let partitioning = mgmt_api_client
+                        .keyspace(timeline.tenant_id, timeline.timeline_id)
+                        .await?;
+                    let lsn = partitioning.at_lsn;
+                    let start = Instant::now();
+                    let mut filtered = KeySpaceAccum::new();
+                    // let's hope this is inlined and vectorized...
+                    // TODO: turn this loop into a is_rel_block_range() function.
+                    for r in partitioning.keys.ranges.iter() {
+                        let mut i = r.start;
+                        while i != r.end {
+                            if is_rel_block_key(&i) {
+                                filtered.add_key(i);
                            }
-                            (false, false) => None,
+                            i = i.next();
                        }
-                    })
-                    .collect::<Vec<_>>();
+                    }
+                    let filtered = filtered.to_keyspace();
+                    let filter_duration = start.elapsed();

-                anyhow::Ok(ranges)
-            }
-        });
-    }
-    let mut all_ranges: Vec<KeyRange> = Vec::new();
-    while let Some(res) = js.join_next().await {
-        all_ranges.extend(res.unwrap().unwrap());
-    }
+                    anyhow::Ok((
+                        filter_duration,
+                        filtered.ranges.into_iter().map(move |r| KeyRange {
+                            timeline,
+                            timeline_lsn: lsn,
+                            start: r.start.to_i128(),
+                            end: r.end.to_i128(),
+                        }),
+                    ))
+                }
+            });
+        }
+        let mut total_filter_duration = Duration::from_secs(0);
+        let mut all_ranges: Vec<KeyRange> = Vec::new();
+        while let Some(res) = js.join_next().await {
+            let (filter_duration, range) = res.unwrap().unwrap();
+            all_ranges.extend(range);
+            total_filter_duration += filter_duration;
+        }
+        info!("filter duration: {}", total_filter_duration.as_secs_f64());
+        if let Some(cachefile) = args.keyspace_cache.as_ref() {
+            let cache = KeyspaceCacheSer {
+                tag: &timelines,
+                data: &all_ranges,
+            };
+            let bytes = serde_json::to_vec(&cache).context("serialize keyspace for cache file")?;
+            std::fs::write(cachefile, bytes).context("write keyspace cache file to disk")?;
+            info!("successfully wrote keyspace cache file");
+        }
+        all_ranges
+    };

    let live_stats = Arc::new(LiveStats::default());

    let num_client_tasks = timelines.len();
    let num_live_stats_dump = 1;
    let num_work_sender_tasks = 1;
+    let num_main_impl = 1;

    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_client_tasks + num_live_stats_dump + num_work_sender_tasks,
+        num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
    ));
-    let all_work_done_barrier = Arc::new(tokio::sync::Barrier::new(num_client_tasks));

    tokio::spawn({
        let stats = Arc::clone(&live_stats);
@@ -175,112 +233,143 @@ async fn main_impl(
        }
    });

-    let mut work_senders = HashMap::new();
+    let cancel = CancellationToken::new();
+
+    let mut work_senders: HashMap<TenantTimelineId, _> = HashMap::new();
    let mut tasks = Vec::new();
    for tl in &timelines {
        let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
-        work_senders.insert(tl, sender);
+        work_senders.insert(*tl, sender);
        tasks.push(tokio::spawn(client(
            args,
            *tl,
            Arc::clone(&start_work_barrier),
            receiver,
-            Arc::clone(&all_work_done_barrier),
            Arc::clone(&live_stats),
+            cancel.clone(),
        )));
    }

-    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = match args.per_target_rate_limit {
-        None => Box::pin(async move {
-            let weights = rand::distributions::weighted::WeightedIndex::new(
-                all_ranges.iter().map(|v| v.len()),
-            )
-            .unwrap();
-
-            start_work_barrier.wait().await;
-
-            loop {
-                let (range, key) = {
-                    let mut rng = rand::thread_rng();
-                    let r = &all_ranges[weights.sample(&mut rng)];
-                    let key: i128 = rng.gen_range(r.start..r.end);
-                    let key = repository::Key::from_i128(key);
-                    let (rel_tag, block_no) =
-                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                    (r, RelTagBlockNo { rel_tag, block_no })
-                };
-                let sender = work_senders.get(&range.timeline).unwrap();
-                // TODO: what if this blocks?
-                sender.send((key, range.timeline_lsn)).await.ok().unwrap();
-            }
-        }),
-        Some(rps_limit) => Box::pin(async move {
-            let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-
-            let make_timeline_task: &dyn Fn(
-                TenantTimelineId,
-            )
-                -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
-                let sender = work_senders.get(&timeline).unwrap();
-                let ranges: Vec<KeyRange> = all_ranges
-                    .iter()
-                    .filter(|r| r.timeline == timeline)
-                    .cloned()
-                    .collect();
+    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
+        let start_work_barrier = start_work_barrier.clone();
+        let cancel = cancel.clone();
+        match args.per_target_rate_limit {
+            None => Box::pin(async move {
                let weights = rand::distributions::weighted::WeightedIndex::new(
-                    ranges.iter().map(|v| v.len()),
+                    all_ranges.iter().map(|v| v.len()),
                )
                .unwrap();

-                Box::pin(async move {
-                    let mut ticker = tokio::time::interval(period);
-                    ticker.set_missed_tick_behavior(
-                        /* TODO review this choice */
-                        tokio::time::MissedTickBehavior::Burst,
-                    );
-                    loop {
-                        ticker.tick().await;
-                        let (range, key) = {
-                            let mut rng = rand::thread_rng();
-                            let r = &ranges[weights.sample(&mut rng)];
-                            let key: i128 = rng.gen_range(r.start..r.end);
-                            let key = repository::Key::from_i128(key);
-                            let (rel_tag, block_no) = key_to_rel_block(key)
-                                .expect("we filter non-rel-block keys out above");
-                            (r, RelTagBlockNo { rel_tag, block_no })
-                        };
-                        sender.send((key, range.timeline_lsn)).await.ok().unwrap();
+                start_work_barrier.wait().await;
+
+                while !cancel.is_cancelled() {
+                    let (timeline, req) = {
+                        let mut rng = rand::thread_rng();
+                        let r = &all_ranges[weights.sample(&mut rng)];
+                        let key: i128 = rng.gen_range(r.start..r.end);
+                        let key = Key::from_i128(key);
+                        let (rel_tag, block_no) =
+                            key_to_rel_block(key).expect("we filter non-rel-block keys out above");
+                        (
+                            r.timeline,
+                            PagestreamGetPageRequest {
+                                latest: rng.gen_bool(args.req_latest_probability),
+                                lsn: r.timeline_lsn,
+                                rel: rel_tag,
+                                blkno: block_no,
+                            },
+                        )
+                    };
+                    let sender = work_senders.get(&timeline).unwrap();
+                    // TODO: what if this blocks?
+                    if sender.send(req).await.is_err() {
+                        assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
                    }
-                })
-            };
+                }
+            }),
+            Some(rps_limit) => Box::pin(async move {
+                let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
+                let make_timeline_task: &dyn Fn(
+                    TenantTimelineId,
+                )
+                    -> Pin<Box<dyn Send + Future<Output = ()>>> = &|timeline| {
+                    let sender = work_senders.get(&timeline).unwrap();
+                    let ranges: Vec<KeyRange> = all_ranges
+                        .iter()
+                        .filter(|r| r.timeline == timeline)
+                        .cloned()
+                        .collect();
+                    let weights = rand::distributions::weighted::WeightedIndex::new(
+                        ranges.iter().map(|v| v.len()),
+                    )
+                    .unwrap();

-            let tasks: Vec<_> = work_senders
-                .keys()
-                .map(|tl| make_timeline_task(**tl))
-                .collect();
+                    let cancel = cancel.clone();
+                    Box::pin(async move {
+                        let mut ticker = tokio::time::interval(period);
+                        ticker.set_missed_tick_behavior(
+                            /* TODO review this choice */
+                            tokio::time::MissedTickBehavior::Burst,
+                        );
+                        while !cancel.is_cancelled() {
+                            ticker.tick().await;
+                            let req = {
+                                let mut rng = rand::thread_rng();
+                                let r = &ranges[weights.sample(&mut rng)];
+                                let key: i128 = rng.gen_range(r.start..r.end);
+                                let key = Key::from_i128(key);
+                                assert!(is_rel_block_key(&key));
+                                let (rel_tag, block_no) = key_to_rel_block(key)
+                                    .expect("we filter non-rel-block keys out above");
+                                PagestreamGetPageRequest {
+                                    latest: rng.gen_bool(args.req_latest_probability),
+                                    lsn: r.timeline_lsn,
+                                    rel: rel_tag,
+                                    blkno: block_no,
+                                }
+                            };
+                            if sender.send(req).await.is_err() {
+                                assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
+                            }
+                        }
+                    })
+                };

-            start_work_barrier.wait().await;
+                let tasks: Vec<_> = work_senders
+                    .keys()
+                    .map(|tl| make_timeline_task(*tl))
+                    .collect();

-            join_all(tasks).await;
-        }),
+                start_work_barrier.wait().await;
+
+                join_all(tasks).await;
+            }),
+        }
    };

+    let work_sender_task = tokio::spawn(work_sender);
+
+    info!("waiting for everything to become ready");
+    start_work_barrier.wait().await;
+    info!("work started");
    if let Some(runtime) = args.runtime {
-        match tokio::time::timeout(runtime.into(), work_sender).await {
-            Ok(()) => unreachable!("work sender never terminates"),
-            Err(_timeout) => {
-                // this implicitly drops the work_senders, making all the clients exit
-            }
-        }
+        tokio::time::sleep(runtime.into()).await;
+        info!("runtime over, signalling cancellation");
+        cancel.cancel();
+        work_sender_task.await.unwrap();
+        info!("work sender exited");
    } else {
-        work_sender.await;
+        work_sender_task.await.unwrap();
        unreachable!("work sender never terminates");
    }

+    info!("joining clients");
    for t in tasks {
        t.await.unwrap();
    }

+    info!("all clients stopped");
+
    let output = Output {
        total: {
            let mut agg_stats = request_stats::Stats::new();
@@ -303,12 +392,10 @@ async fn client(
    args: &'static Args,
    timeline: TenantTimelineId,
    start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<(RelTagBlockNo, Lsn)>,
-    all_work_done_barrier: Arc<Barrier>,
+    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
    live_stats: Arc<LiveStats>,
+    cancel: CancellationToken,
 ) {
-    start_work_barrier.wait().await;
-
    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
        .await
        .unwrap();
@@ -317,19 +404,27 @@ async fn client(
        .await
        .unwrap();

-    while let Some((key, lsn)) = work.recv().await {
-        let start = Instant::now();
-        client
-            .getpage(key, lsn)
-            .await
-            .with_context(|| format!("getpage for {timeline}"))
-            .unwrap();
-        let elapsed = start.elapsed();
-        live_stats.inc();
-        STATS.with(|stats| {
-            stats.borrow().lock().unwrap().observe(elapsed).unwrap();
-        });
+    let do_requests = async {
+        start_work_barrier.wait().await;
+        while let Some(req) = work.recv().await {
+            let start = Instant::now();
+            client
+                .getpage(req)
+                .await
+                .with_context(|| format!("getpage for {timeline}"))
+                .unwrap();
+            let elapsed = start.elapsed();
+            live_stats.inc();
+            STATS.with(|stats| {
+                stats.borrow().lock().unwrap().observe(elapsed).unwrap();
+            });
+        }
+    };
+    tokio::select! {
+        res = do_requests => { res },
+        _ = cancel.cancelled() => {
+            client.shutdown().await;
+            return;
+        }
    }
-
-    all_work_done_barrier.wait().await;
 }
--- a/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
+++ b/pageserver/pagebench/src/cmd/trigger_initial_size_calculation.rs
@@ -4,6 +4,8 @@ use humantime::Duration;
 use tokio::task::JoinSet;
 use utils::id::TenantTimelineId;

+use pageserver_client::mgmt_api::ForceAwaitLogicalSize;
+
 #[derive(clap::Parser)]
 pub(crate) struct Args {
    #[clap(long, default_value = "http://localhost:9898")]
@@ -56,14 +58,15 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
    for tl in timelines {
        let mgmt_api_client = Arc::clone(&mgmt_api_client);
        js.spawn(async move {
-            // TODO: API to explicitly trigger initial logical size computation.
-            // Should probably also avoid making it a side effect of timeline details to trigger initial logical size calculation.
-            // => https://github.com/neondatabase/neon/issues/6168
            let info = mgmt_api_client
-                .timeline_info(tl.tenant_id, tl.timeline_id)
+                .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
                .await
                .unwrap();

+            // Polling should not be strictly required here, since the request above
+            // already waits for the initial logical size. However, it's possible for
+            // the request to land before the timeline is initialised, in which case
+            // the reported logical size is only approximate.
            if let Some(period) = args.poll_for_completion {
                let mut ticker = tokio::time::interval(period.into());
                ticker.set_missed_tick_behavior(tokio::time::MissedTickBehavior::Delay);
@@ -71,7 +74,7 @@ async fn main_impl(args: Args) -> anyhow::Result<()> {
                while !info.current_logical_size_is_accurate {
                    ticker.tick().await;
                    info = mgmt_api_client
-                        .timeline_info(tl.tenant_id, tl.timeline_id)
+                        .timeline_info(tl.tenant_id, tl.timeline_id, ForceAwaitLogicalSize::Yes)
                        .await
                        .unwrap();
                }
--- a/pageserver/pagebench/src/main.rs
+++ b/pageserver/pagebench/src/main.rs
@@ -35,6 +35,7 @@ fn main() {
        logging::Output::Stderr,
    )
    .unwrap();
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();

    let args = Args::parse();
    match args {
--- a/pageserver/src/basebackup.rs
+++ b/pageserver/src/basebackup.rs
@@ -23,6 +23,7 @@ use tracing::*;
 use tokio_tar::{Builder, EntryType, Header};

 use crate::context::RequestContext;
+use crate::pgdatadir_mapping::Version;
 use crate::tenant::Timeline;
 use pageserver_api::reltag::{RelTag, SlruKind};

@@ -174,7 +175,7 @@ where
        ] {
            for segno in self
                .timeline
-                .list_slru_segments(kind, self.lsn, self.ctx)
+                .list_slru_segments(kind, Version::Lsn(self.lsn), self.ctx)
                .await?
            {
                self.add_slru_segment(kind, segno).await?;
@@ -192,7 +193,7 @@ where
            // Otherwise only include init forks of unlogged relations.
            let rels = self
                .timeline
-                .list_rels(spcnode, dbnode, self.lsn, self.ctx)
+                .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                .await?;
            for &rel in rels.iter() {
                // Send init fork as main fork to provide well formed empty
@@ -267,7 +268,7 @@ where
    async fn add_rel(&mut self, src: RelTag, dst: RelTag) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_rel_size(src, self.lsn, false, self.ctx)
+            .get_rel_size(src, Version::Lsn(self.lsn), false, self.ctx)
            .await?;

        // If the relation is empty, create an empty file
@@ -288,7 +289,7 @@ where
            for blknum in startblk..endblk {
                let img = self
                    .timeline
-                    .get_rel_page_at_lsn(src, blknum, self.lsn, false, self.ctx)
+                    .get_rel_page_at_lsn(src, blknum, Version::Lsn(self.lsn), false, self.ctx)
                    .await?;
                segment_data.extend_from_slice(&img[..]);
            }
@@ -310,7 +311,7 @@ where
    async fn add_slru_segment(&mut self, slru: SlruKind, segno: u32) -> anyhow::Result<()> {
        let nblocks = self
            .timeline
-            .get_slru_segment_size(slru, segno, self.lsn, self.ctx)
+            .get_slru_segment_size(slru, segno, Version::Lsn(self.lsn), self.ctx)
            .await?;

        let mut slru_buf: Vec<u8> = Vec::with_capacity(nblocks as usize * BLCKSZ as usize);
@@ -352,7 +353,7 @@ where
        let relmap_img = if has_relmap_file {
            let img = self
                .timeline
-                .get_relmap_file(spcnode, dbnode, self.lsn, self.ctx)
+                .get_relmap_file(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                .await?;

            ensure!(
@@ -399,7 +400,7 @@ where
            if !has_relmap_file
                && self
                    .timeline
-                    .list_rels(spcnode, dbnode, self.lsn, self.ctx)
+                    .list_rels(spcnode, dbnode, Version::Lsn(self.lsn), self.ctx)
                    .await?
                    .is_empty()
            {
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -527,6 +527,7 @@ fn start_pageserver(
            conf,
            remote_storage.clone(),
            disk_usage_eviction_state.clone(),
+            tenant_manager.clone(),
            background_jobs_barrier.clone(),
        )?;
    }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -37,8 +37,8 @@ use crate::tenant::{
    TENANTS_SEGMENT_NAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
 };
 use crate::{
-    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_LOCATION_CONFIG_NAME,
-    TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
+    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TENANT_HEATMAP_BASENAME,
+    TENANT_LOCATION_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, TIMELINE_UNINIT_MARK_SUFFIX,
 };

 use self::defaults::DEFAULT_CONCURRENT_TENANT_WARMUP;
@@ -75,6 +75,9 @@ pub mod defaults {
    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";

    pub const DEFAULT_HEATMAP_UPLOAD_CONCURRENCY: usize = 8;
+    pub const DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY: usize = 1;
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;

    ///
    /// Default built-in configuration file.
@@ -88,6 +91,7 @@ pub mod defaults {
 #wait_lsn_timeout = '{DEFAULT_WAIT_LSN_TIMEOUT}'
 #wal_redo_timeout = '{DEFAULT_WAL_REDO_TIMEOUT}'

+#page_cache_size = {DEFAULT_PAGE_CACHE_SIZE}
 #max_file_descriptors = {DEFAULT_MAX_FILE_DESCRIPTORS}

 # initial superuser role name to use when creating a new tenant
@@ -108,6 +112,8 @@ pub mod defaults {

 #background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'

+#ingest_batch_size = {DEFAULT_INGEST_BATCH_SIZE}
+
 [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -125,6 +131,7 @@ pub mod defaults {
 #gc_feedback = false

 #heatmap_upload_concurrency = {DEFAULT_HEATMAP_UPLOAD_CONCURRENCY}
+#secondary_download_concurrency = {DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY}

 [remote_storage]

@@ -233,6 +240,13 @@ pub struct PageServerConf {
    /// How many heatmap uploads may be done concurrency: lower values implicitly deprioritize
    /// heatmap uploads vs. other remote storage operations.
    pub heatmap_upload_concurrency: usize,
+
+    /// How many remote storage downloads may be done for secondary tenants concurrently.  Implicitly
+    /// deprioritises secondary downloads vs. remote storage operations for attached tenants.
+    pub secondary_download_concurrency: usize,
+
+    /// Maximum number of WAL records to be ingested and committed at the same time
+    pub ingest_batch_size: u64,
 }

 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -314,6 +328,9 @@ struct PageServerConfigBuilder {
    control_plane_emergency_mode: BuilderValue<bool>,

    heatmap_upload_concurrency: BuilderValue<usize>,
+    secondary_download_concurrency: BuilderValue<usize>,
+
+    ingest_batch_size: BuilderValue<u64>,
 }

 impl Default for PageServerConfigBuilder {
@@ -386,6 +403,9 @@ impl Default for PageServerConfigBuilder {
            control_plane_emergency_mode: Set(false),

            heatmap_upload_concurrency: Set(DEFAULT_HEATMAP_UPLOAD_CONCURRENCY),
+            secondary_download_concurrency: Set(DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY),
+
+            ingest_batch_size: Set(DEFAULT_INGEST_BATCH_SIZE),
        }
    }
 }
@@ -534,6 +554,14 @@ impl PageServerConfigBuilder {
        self.heatmap_upload_concurrency = BuilderValue::Set(value)
    }

+    pub fn secondary_download_concurrency(&mut self, value: usize) {
+        self.secondary_download_concurrency = BuilderValue::Set(value)
+    }
+
+    pub fn ingest_batch_size(&mut self, ingest_batch_size: u64) {
+        self.ingest_batch_size = BuilderValue::Set(ingest_batch_size)
+    }
+
    pub fn build(self) -> anyhow::Result<PageServerConf> {
        let concurrent_tenant_warmup = self
            .concurrent_tenant_warmup
@@ -632,10 +660,15 @@ impl PageServerConfigBuilder {
            control_plane_emergency_mode: self
                .control_plane_emergency_mode
                .ok_or(anyhow!("missing control_plane_emergency_mode"))?,
-
            heatmap_upload_concurrency: self
                .heatmap_upload_concurrency
                .ok_or(anyhow!("missing heatmap_upload_concurrency"))?,
+            secondary_download_concurrency: self
+                .secondary_download_concurrency
+                .ok_or(anyhow!("missing secondary_download_concurrency"))?,
+            ingest_batch_size: self
+                .ingest_batch_size
+                .ok_or(anyhow!("missing ingest_batch_size"))?,
        })
    }
 }
@@ -693,6 +726,11 @@ impl PageServerConf {
            .join(TENANT_LOCATION_CONFIG_NAME)
    }

+    pub(crate) fn tenant_heatmap_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
+            .join(TENANT_HEATMAP_BASENAME)
+    }
+
    pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
        self.tenant_path(tenant_shard_id)
            .join(TIMELINES_SEGMENT_NAME)
@@ -878,6 +916,10 @@ impl PageServerConf {
                "heatmap_upload_concurrency" => {
                    builder.heatmap_upload_concurrency(parse_toml_u64(key, item)? as usize)
                },
+                "secondary_download_concurrency" => {
+                    builder.secondary_download_concurrency(parse_toml_u64(key, item)? as usize)
+                },
+                "ingest_batch_size" => builder.ingest_batch_size(parse_toml_u64(key, item)?),
                _ => bail!("unrecognized pageserver option '{key}'"),
            }
        }
@@ -949,6 +991,8 @@ impl PageServerConf {
            control_plane_api_token: None,
            control_plane_emergency_mode: false,
            heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+            secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
+            ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
        }
    }
 }
@@ -1082,11 +1126,12 @@ mod tests {
    };

    use camino_tempfile::{tempdir, Utf8TempDir};
+    use pageserver_api::models::EvictionPolicy;
    use remote_storage::{RemoteStorageKind, S3Config};
    use utils::serde_percent::Percent;

    use super::*;
-    use crate::{tenant::config::EvictionPolicy, DEFAULT_PG_VERSION};
+    use crate::DEFAULT_PG_VERSION;

    const ALL_BASE_VALUES_TOML: &str = r#"
 # Initial configuration file created by 'pageserver --init'
@@ -1177,7 +1222,9 @@ background_task_maximum_delay = '334 s'
                control_plane_api: None,
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
+                ingest_batch_size: defaults::DEFAULT_INGEST_BATCH_SIZE,
            },
            "Correct defaults should be used when no config values are provided"
        );
@@ -1238,7 +1285,9 @@ background_task_maximum_delay = '334 s'
                control_plane_api: None,
                control_plane_api_token: None,
                control_plane_emergency_mode: false,
-                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY
+                heatmap_upload_concurrency: defaults::DEFAULT_HEATMAP_UPLOAD_CONCURRENCY,
+                secondary_download_concurrency: defaults::DEFAULT_SECONDARY_DOWNLOAD_CONCURRENCY,
+                ingest_batch_size: 100,
            },
            "Should be able to parse all basic config values correctly"
        );
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -267,7 +267,7 @@ async fn calculate_synthetic_size_worker(
            }
        };

-        for (tenant_shard_id, tenant_state) in tenants {
+        for (tenant_shard_id, tenant_state, _gen) in tenants {
            if tenant_state != TenantState::Active {
                continue;
            }
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -196,7 +196,7 @@ pub(super) async fn collect_all_metrics(
        }
    };

-    let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
+    let tenants = futures::stream::iter(tenants).filter_map(|(id, state, _)| async move {
        if state != TenantState::Active || !id.is_zero() {
            None
        } else {
--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,5 +1,6 @@
 use std::collections::HashMap;

+use futures::Future;
 use pageserver_api::{
    control_api::{
        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
@@ -28,13 +29,14 @@ pub enum RetryForeverError {
    ShuttingDown,
 }

-#[async_trait::async_trait]
 pub trait ControlPlaneGenerationsApi {
-    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
-    async fn validate(
+    fn re_attach(
+        &self,
+    ) -> impl Future<Output = Result<HashMap<TenantShardId, Generation>, RetryForeverError>> + Send;
+    fn validate(
        &self,
        tenants: Vec<(TenantShardId, Generation)>,
-    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
+    ) -> impl Future<Output = Result<HashMap<TenantShardId, bool>, RetryForeverError>> + Send;
 }

 impl ControlPlaneClient {
@@ -123,7 +125,6 @@ impl ControlPlaneClient {
    }
 }

-#[async_trait::async_trait]
 impl ControlPlaneGenerationsApi for ControlPlaneClient {
    /// Block until we get a successful response, or error out if we are shut down
    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -831,7 +831,6 @@ mod test {
        }
    }

-    #[async_trait::async_trait]
    impl ControlPlaneGenerationsApi for MockControlPlane {
        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -47,21 +47,24 @@ use std::{
 };

 use anyhow::Context;
-use camino::Utf8Path;
+use pageserver_api::shard::TenantShardId;
 use remote_storage::GenericRemoteStorage;
 use serde::{Deserialize, Serialize};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument};
-use utils::completion;
 use utils::serde_percent::Percent;
+use utils::{completion, id::TimelineId};

 use crate::{
    config::PageServerConf,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
        self,
-        storage_layer::{AsLayerDesc, EvictionError, Layer},
+        mgr::TenantManager,
+        remote_timeline_client::LayerFileMetadata,
+        secondary::SecondaryTenant,
+        storage_layer::{AsLayerDesc, EvictionError, Layer, LayerFileName},
        Timeline,
    },
 };
@@ -125,6 +128,7 @@ pub fn launch_disk_usage_global_eviction_task(
    conf: &'static PageServerConf,
    storage: GenericRemoteStorage,
    state: Arc<State>,
+    tenant_manager: Arc<TenantManager>,
    background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
    let Some(task_config) = &conf.disk_usage_based_eviction else {
@@ -150,8 +154,7 @@ pub fn launch_disk_usage_global_eviction_task(
                _ = background_jobs_barrier.wait() => { }
            };

-            disk_usage_eviction_task(&state, task_config, &storage, &conf.tenants_path(), cancel)
-                .await;
+            disk_usage_eviction_task(&state, task_config, &storage, tenant_manager, cancel).await;
            Ok(())
        },
    );
@@ -164,7 +167,7 @@ async fn disk_usage_eviction_task(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
-    tenants_dir: &Utf8Path,
+    tenant_manager: Arc<TenantManager>,
    cancel: CancellationToken,
 ) {
    scopeguard::defer! {
@@ -191,7 +194,7 @@ async fn disk_usage_eviction_task(
                state,
                task_config,
                storage,
-                tenants_dir,
+                &tenant_manager,
                &cancel,
            )
            .await;
@@ -226,15 +229,17 @@ async fn disk_usage_eviction_task_iteration(
    state: &State,
    task_config: &DiskUsageEvictionTaskConfig,
    storage: &GenericRemoteStorage,
-    tenants_dir: &Utf8Path,
+    tenant_manager: &Arc<TenantManager>,
    cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
-    let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
+    let tenants_dir = tenant_manager.get_conf().tenants_path();
+    let usage_pre = filesystem_level_usage::get(&tenants_dir, task_config)
        .context("get filesystem-level disk usage before evictions")?;
    let res = disk_usage_eviction_task_iteration_impl(
        state,
        storage,
        usage_pre,
+        tenant_manager,
        task_config.eviction_order,
        cancel,
    )
@@ -248,7 +253,7 @@ async fn disk_usage_eviction_task_iteration(
                }
                IterationOutcome::Finished(outcome) => {
                    // Verify with statvfs whether we made any real progress
-                    let after = filesystem_level_usage::get(tenants_dir, task_config)
+                    let after = filesystem_level_usage::get(&tenants_dir, task_config)
                        // It's quite unlikely to hit the error here. Keep the code simple and bail out.
                        .context("get filesystem-level disk usage after evictions")?;

@@ -324,6 +329,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    state: &State,
    _storage: &GenericRemoteStorage,
    usage_pre: U,
+    tenant_manager: &Arc<TenantManager>,
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -344,29 +350,29 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
        "running disk usage based eviction due to pressure"
    );

-    let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
-        EvictionCandidates::Cancelled => {
-            return Ok(IterationOutcome::Cancelled);
-        }
-        EvictionCandidates::Finished(partitioned) => partitioned,
-    };
+    let candidates =
+        match collect_eviction_candidates(tenant_manager, eviction_order, cancel).await? {
+            EvictionCandidates::Cancelled => {
+                return Ok(IterationOutcome::Cancelled);
+            }
+            EvictionCandidates::Finished(partitioned) => partitioned,
+        };

    // Debug-log the list of candidates
    let now = SystemTime::now();
    for (i, (partition, candidate)) in candidates.iter().enumerate() {
        let nth = i + 1;
-        let desc = candidate.layer.layer_desc();
        let total_candidates = candidates.len();
-        let size = desc.file_size;
+        let size = candidate.layer.get_file_size();
        let rel = candidate.relative_last_activity;
        debug!(
            "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
            now.duration_since(candidate.last_activity_ts)
                .unwrap()
                .as_micros(),
-            desc.tenant_shard_id,
-            desc.timeline_id,
-            candidate.layer,
+            candidate.layer.get_tenant_shard_id(),
+            candidate.layer.get_timeline_id(),
+            candidate.layer.get_name(),
        );
    }

@@ -380,39 +386,56 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    // If we get far enough in the list that we start to evict layers that are below
    // the tenant's min-resident-size threshold, print a warning, and memorize the disk
    // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-    let mut warned = None;
-    let mut usage_planned = usage_pre;
-    let mut evicted_amount = 0;

-    for (i, (partition, candidate)) in candidates.iter().enumerate() {
-        if !usage_planned.has_pressure() {
-            debug!(
-                no_candidates_evicted = i,
-                "took enough candidates for pressure to be relieved"
-            );
-            break;
+    let selection = select_victims(&candidates, usage_pre);
+
+    let mut candidates = candidates;
+
+    let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
+        // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
+        // for comparison here. this is a temporary measure to develop alternatives.
+        use std::fmt::Write;
+
+        let mut summary_buf = String::with_capacity(256);
+
+        {
+            let absolute_summary = candidates
+                .iter()
+                .take(selection.amount)
+                .map(|(_, candidate)| candidate)
+                .collect::<summary::EvictionSummary>();
+
+            write!(summary_buf, "{absolute_summary}").expect("string grows");
+
+            info!("absolute accessed selection summary: {summary_buf}");
        }

-        if partition == &MinResidentSizePartition::Below && warned.is_none() {
-            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
-            warned = Some(usage_planned);
+        candidates.sort_unstable_by_key(|(partition, candidate)| {
+            (*partition, candidate.relative_last_activity)
+        });
+
+        let selection = select_victims(&candidates, usage_pre);
+
+        {
+            summary_buf.clear();
+
+            let relative_summary = candidates
+                .iter()
+                .take(selection.amount)
+                .map(|(_, candidate)| candidate)
+                .collect::<summary::EvictionSummary>();
+
+            write!(summary_buf, "{relative_summary}").expect("string grows");
+
+            info!("relative accessed selection summary: {summary_buf}");
        }

-        usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
-        evicted_amount += 1;
-    }
-
-    let usage_planned = match warned {
-        Some(respecting_tenant_min_resident_size) => PlannedUsage {
-            respecting_tenant_min_resident_size,
-            fallback_to_global_lru: Some(usage_planned),
-        },
-        None => PlannedUsage {
-            respecting_tenant_min_resident_size: usage_planned,
-            fallback_to_global_lru: None,
-        },
+        selection
+    } else {
+        selection
    };
-    debug!(?usage_planned, "usage planned");
+
+    let (evicted_amount, usage_planned) = selection.into_amount_and_planned();

    // phase2: evict layers

@@ -463,19 +486,30 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                continue;
            };

-            js.spawn(async move {
-                let rtc = candidate.timeline.remote_client.as_ref().expect(
-                    "holding the witness, all timelines must have a remote timeline client",
-                );
-                let file_size = candidate.layer.layer_desc().file_size;
-                candidate
-                    .layer
-                    .evict_and_wait(rtc)
-                    .await
-                    .map(|()| file_size)
-                    .map_err(|e| (file_size, e))
-            });
+            match candidate.layer {
+                EvictionLayer::Attached(layer) => {
+                    let file_size = layer.layer_desc().file_size;
+                    js.spawn(async move {
+                        layer
+                            .evict_and_wait()
+                            .await
+                            .map(|()| file_size)
+                            .map_err(|e| (file_size, e))
+                    });
+                }
+                EvictionLayer::Secondary(layer) => {
+                    let file_size = layer.metadata.file_size();
+                    let tenant_manager = tenant_manager.clone();

+                    js.spawn(async move {
+                        layer
+                            .secondary_tenant
+                            .evict_layer(tenant_manager.get_conf(), layer.timeline_id, layer.name)
+                            .await;
+                        Ok(file_size)
+                    });
+                }
+            }
            tokio::task::yield_now().await;
        }

@@ -502,11 +536,100 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 }

 #[derive(Clone)]
-struct EvictionCandidate {
-    timeline: Arc<Timeline>,
-    layer: Layer,
-    last_activity_ts: SystemTime,
-    relative_last_activity: finite_f32::FiniteF32,
+pub(crate) struct EvictionSecondaryLayer {
+    pub(crate) secondary_tenant: Arc<SecondaryTenant>,
+    pub(crate) timeline_id: TimelineId,
+    pub(crate) name: LayerFileName,
+    pub(crate) metadata: LayerFileMetadata,
+}
+
+/// Full [`Layer`] objects are specific to tenants in attached mode.  This type is a layer
+/// of indirection to store either a `Layer`, or a reference to a secondary tenant and a layer name.
+#[derive(Clone)]
+pub(crate) enum EvictionLayer {
+    Attached(Layer),
+    #[allow(dead_code)]
+    Secondary(EvictionSecondaryLayer),
+}
+
+impl From<Layer> for EvictionLayer {
+    fn from(value: Layer) -> Self {
+        Self::Attached(value)
+    }
+}
+
+impl EvictionLayer {
+    pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
+        match self {
+            Self::Attached(l) => &l.layer_desc().tenant_shard_id,
+            Self::Secondary(sl) => sl.secondary_tenant.get_tenant_shard_id(),
+        }
+    }
+
+    pub(crate) fn get_timeline_id(&self) -> &TimelineId {
+        match self {
+            Self::Attached(l) => &l.layer_desc().timeline_id,
+            Self::Secondary(sl) => &sl.timeline_id,
+        }
+    }
+
+    pub(crate) fn get_name(&self) -> LayerFileName {
+        match self {
+            Self::Attached(l) => l.layer_desc().filename(),
+            Self::Secondary(sl) => sl.name.clone(),
+        }
+    }
+
+    pub(crate) fn get_file_size(&self) -> u64 {
+        match self {
+            Self::Attached(l) => l.layer_desc().file_size,
+            Self::Secondary(sl) => sl.metadata.file_size(),
+        }
+    }
+}
+
+#[derive(Clone)]
+pub(crate) struct EvictionCandidate {
+    pub(crate) layer: EvictionLayer,
+    pub(crate) last_activity_ts: SystemTime,
+    pub(crate) relative_last_activity: finite_f32::FiniteF32,
+}
+
+impl std::fmt::Display for EvictionLayer {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        match self {
+            Self::Attached(l) => l.fmt(f),
+            Self::Secondary(sl) => {
+                write!(f, "{}/{}", sl.timeline_id, sl.name)
+            }
+        }
+    }
+}
+
+pub(crate) struct DiskUsageEvictionInfo {
+    /// Timeline's largest layer (remote or resident)
+    pub max_layer_size: Option<u64>,
+    /// Timeline's resident layers
+    pub resident_layers: Vec<EvictionCandidate>,
+}
+
+impl std::fmt::Debug for EvictionCandidate {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
+        // having to allocate a string for this is bad, but it will rarely be formatted
+        let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
+        let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
+        struct DisplayIsDebug<'a, T>(&'a T);
+        impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
+            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+                write!(f, "{}", self.0)
+            }
+        }
+        f.debug_struct("LocalLayerInfoForDiskUsageEviction")
+            .field("layer", &DisplayIsDebug(&self.layer))
+            .field("last_activity", &ts)
+            .finish()
+    }
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -623,6 +746,7 @@ enum EvictionCandidates {
 /// - tenant B 1 layer
 /// - tenant C 8 layers
 async fn collect_eviction_candidates(
+    tenant_manager: &Arc<TenantManager>,
    eviction_order: EvictionOrder,
    cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
@@ -631,13 +755,16 @@ async fn collect_eviction_candidates(
        .await
        .context("get list of tenants")?;

+    // TODO: avoid listing every layer in every tenant: this loop can block the executor,
+    // and the resulting data structure can be huge.
+    // (https://github.com/neondatabase/neon/issues/6224)
    let mut candidates = Vec::new();

-    for (tenant_id, _state) in &tenants {
+    for (tenant_id, _state, _gen) in tenants {
        if cancel.is_cancelled() {
            return Ok(EvictionCandidates::Cancelled);
        }
-        let tenant = match tenant::mgr::get_tenant(*tenant_id, true) {
+        let tenant = match tenant::mgr::get_tenant(tenant_id, true) {
            Ok(tenant) => tenant,
            Err(e) => {
                // this can happen if tenant has lifecycle transition after we fetched it
@@ -665,11 +792,7 @@ async fn collect_eviction_candidates(
            }
            let info = tl.get_local_layers_for_disk_usage_eviction().await;
            debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
-            tenant_candidates.extend(
-                info.resident_layers
-                    .into_iter()
-                    .map(|layer_infos| (tl.clone(), layer_infos)),
-            );
+            tenant_candidates.extend(info.resident_layers.into_iter());
            max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0));

            if cancel.is_cancelled() {
@@ -690,14 +813,16 @@ async fn collect_eviction_candidates(
        // A default override can be put in the default tenant conf in the pageserver.toml.
        let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
            debug!(
-                tenant_id=%tenant.tenant_id(),
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
                overridden_size=s,
                "using overridden min resident size for tenant"
            );
            s
        } else {
            debug!(
-                tenant_id=%tenant.tenant_id(),
+                tenant_id=%tenant.tenant_shard_id().tenant_id,
+                shard_id=%tenant.tenant_shard_id().shard_slug(),
                max_layer_size,
                "using max layer size as min_resident_size for tenant",
            );
@@ -707,7 +832,7 @@ async fn collect_eviction_candidates(
        // Sort layers most-recently-used first, then partition by
        // cumsum above/below min_resident_size.
        tenant_candidates
-            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
+            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
        let mut cumsum: i128 = 0;

        // keeping the -1 or not decides if every tenant should lose their least recently accessed
@@ -741,12 +866,10 @@ async fn collect_eviction_candidates(
            .unwrap_or(1);
        let divider = total as f32;

-        for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
-            let file_size = layer_info.file_size();
-
+        for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
            // as we iterate this reverse sorted list, the most recently accessed layer will always
            // be 1.0; this is for us to evict it last.
-            let relative_last_activity = if matches!(
+            candidate.relative_last_activity = if matches!(
                eviction_order,
                EvictionOrder::RelativeAccessed { .. }
            ) {
@@ -761,41 +884,123 @@ async fn collect_eviction_candidates(
                finite_f32::FiniteF32::ZERO
            };

-            let candidate = EvictionCandidate {
-                timeline,
-                last_activity_ts: layer_info.last_activity_ts,
-                layer: layer_info.layer,
-                relative_last_activity,
-            };
            let partition = if cumsum > min_resident_size as i128 {
                MinResidentSizePartition::Above
            } else {
                MinResidentSizePartition::Below
            };
+            cumsum += i128::from(candidate.layer.get_file_size());
            candidates.push((partition, candidate));
-            cumsum += i128::from(file_size);
        }
    }

+    // Note: the same tenant ID might be hit twice, if it transitions from attached to
+    // secondary while we run.  That is okay: when we eventually try and run the eviction,
+    // the `Gate` on the object will ensure that whichever one has already been shut down
+    // will not delete anything.
+
+    let mut secondary_tenants = Vec::new();
+    tenant_manager.foreach_secondary_tenants(
+        |_tenant_shard_id: &TenantShardId, state: &Arc<SecondaryTenant>| {
+            secondary_tenants.push(state.clone());
+        },
+    );
+
+    for secondary_tenant in secondary_tenants {
+        let mut layer_info = secondary_tenant.get_layers_for_eviction();
+
+        layer_info
+            .resident_layers
+            .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
+
+        candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| {
+            (
+                // Secondary locations' layers are always considered above the min resident size,
+                // i.e. secondary locations are permitted to be trimmed to zero layers if all
+                // the layers have sufficiently old access times.
+                MinResidentSizePartition::Above,
+                candidate,
+            )
+        }));
+    }
+
    debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
        "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");

-    match eviction_order {
-        EvictionOrder::AbsoluteAccessed => {
-            candidates.sort_unstable_by_key(|(partition, candidate)| {
-                (*partition, candidate.last_activity_ts)
-            });
-        }
-        EvictionOrder::RelativeAccessed { .. } => {
-            candidates.sort_unstable_by_key(|(partition, candidate)| {
-                (*partition, candidate.relative_last_activity)
-            });
-        }
-    }
+    // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
+    // will sort later by candidate.relative_last_activity to compare the evictions.
+    candidates
+        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));

    Ok(EvictionCandidates::Finished(candidates))
 }

+/// Given a pre-sorted vec of all layers in the system, select the first N which are enough to
+/// relieve pressure.
+///
+/// Returns the number of candidates selected, together with the planned usage.
+fn select_victims<U: Usage>(
+    candidates: &[(MinResidentSizePartition, EvictionCandidate)],
+    usage_pre: U,
+) -> VictimSelection<U> {
+    let mut usage_when_switched = None;
+    let mut usage_planned = usage_pre;
+    let mut evicted_amount = 0;
+
+    for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        if !usage_planned.has_pressure() {
+            break;
+        }
+
+        if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
+            usage_when_switched = Some((usage_planned, i));
+        }
+
+        usage_planned.add_available_bytes(candidate.layer.get_file_size());
+        evicted_amount += 1;
+    }
+
+    VictimSelection {
+        amount: evicted_amount,
+        usage_pre,
+        usage_when_switched,
+        usage_planned,
+    }
+}
+
+struct VictimSelection<U> {
+    amount: usize,
+    usage_pre: U,
+    usage_when_switched: Option<(U, usize)>,
+    usage_planned: U,
+}
+
+impl<U: Usage> VictimSelection<U> {
+    fn into_amount_and_planned(self) -> (usize, PlannedUsage<U>) {
+        debug!(
+            evicted_amount=%self.amount,
+            "took enough candidates for pressure to be relieved"
+        );
+
+        if let Some((usage_planned, candidate_no)) = self.usage_when_switched.as_ref() {
+            warn!(usage_pre=?self.usage_pre, ?usage_planned, candidate_no, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
+        }
+
+        let planned = match self.usage_when_switched {
+            Some((respecting_tenant_min_resident_size, _)) => PlannedUsage {
+                respecting_tenant_min_resident_size,
+                fallback_to_global_lru: Some(self.usage_planned),
+            },
+            None => PlannedUsage {
+                respecting_tenant_min_resident_size: self.usage_planned,
+                fallback_to_global_lru: None,
+            },
+        };
+
+        (self.amount, planned)
+    }
+}
+
 struct TimelineKey(Arc<Timeline>);

 impl PartialEq for TimelineKey {
@@ -821,7 +1026,7 @@ impl std::ops::Deref for TimelineKey {
 }

 /// A totally ordered f32 subset we can use with sorting functions.
-mod finite_f32 {
+pub(crate) mod finite_f32 {

    /// A totally ordered f32 subset we can use with sorting functions.
    #[derive(Clone, Copy, PartialEq)]
@@ -880,6 +1085,155 @@ mod finite_f32 {
     }
 }
 
+/// Aggregates a victim selection into per-tenant and total figures and renders them as
+/// text, so that the selection made by an eviction order can be written to the log.
+mod summary {
+    use super::finite_f32::FiniteF32;
+    use super::{EvictionCandidate, LayerCount};
+    use pageserver_api::shard::TenantShardId;
+    use std::collections::{BTreeMap, HashMap};
+    use std::time::SystemTime;
+
+    /// Maximum number of per-layer-count groups the `Display` output lists individually.
+    const MAX_GROUPS_SHOWN: usize = 10;
+
+    /// Aggregate of a sequence of eviction candidates: layer counts and file sizes per
+    /// tenant shard and in total, plus the access times of the last candidate seen.
+    #[derive(Debug, Default)]
+    pub(super) struct EvictionSummary {
+        evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
+        total: LayerCount,
+
+        last_absolute: Option<SystemTime>,
+        last_relative: Option<FiniteF32>,
+    }
+
+    impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
+        /// Folds candidates into a summary; `last_absolute` and `last_relative` end up
+        /// holding the values of the final item, or stay `None` for an empty iterator.
+        fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
+            let mut summary = EvictionSummary::default();
+            for item in iter {
+                let counts = summary
+                    .evicted_per_tenant
+                    .entry(*item.layer.get_tenant_shard_id())
+                    .or_default();
+
+                let sz = item.layer.get_file_size();
+
+                counts.file_sizes += sz;
+                counts.count += 1;
+
+                summary.total.file_sizes += sz;
+                summary.total.count += 1;
+
+                summary.last_absolute = Some(item.last_activity_ts);
+                summary.last_relative = Some(item.relative_last_activity);
+            }
+
+            summary
+        }
+    }
+
+    /// Byte count that displays with a 1024-based unit suffix (`B`, `KiB`, `MiB`, ...).
+    struct SiBytesAmount(u64);
+
+    impl std::fmt::Display for SiBytesAmount {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            if self.0 < 1024 {
+                return write!(f, "{}B", self.0);
+            }
+
+            let mut tmp = self.0;
+            let mut ch = 0;
+            let suffixes = b"KMGTPE";
+
+            // `tmp` is kept one unit finer than the displayed one, so the final division
+            // keeps a fraction. `>=` makes an exact power of 1024 print as 1.0 of the next unit.
+            while tmp >= 1024 * 1024 && ch < suffixes.len() - 1 {
+                tmp /= 1024;
+                ch += 1;
+            }
+
+            let ch = suffixes[ch] as char;
+
+            write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
+        }
+    }
+
+    impl std::fmt::Display for EvictionSummary {
+        /// Writes a header line with the totals, then one line per group of tenants that
+        /// have the same selected layer count (largest counts first), and finally a
+        /// single line accounting for the groups beyond `MAX_GROUPS_SHOWN`.
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            // wasteful, but it's for testing
+
+            let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
+
+            for (tenant_shard_id, count) in &self.evicted_per_tenant {
+                sorted
+                    .entry(count.count)
+                    .or_default()
+                    .push((*tenant_shard_id, count.file_sizes));
+            }
+
+            let total_file_sizes = SiBytesAmount(self.total.file_sizes);
+
+            writeln!(
+                f,
+                "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
+                self.total.count, self.last_absolute, self.last_relative,
+            )?;
+
+            for (count, per_tenant) in sorted.iter().rev().take(MAX_GROUPS_SHOWN) {
+                write!(f, "- {count} layers: ")?;
+
+                if per_tenant.len() < 3 {
+                    for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
+                        if i > 0 {
+                            write!(f, ", ")?;
+                        }
+                        let bytes = SiBytesAmount(*bytes);
+                        write!(f, "{tenant_shard_id} ({bytes})")?;
+                    }
+                } else {
+                    let num_tenants = per_tenant.len();
+                    let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
+                    let total_bytes = SiBytesAmount(total_bytes);
+                    let layers = num_tenants * count;
+
+                    write!(
+                        f,
+                        "{num_tenants} tenants {total_bytes} in total {layers} layers",
+                    )?;
+                }
+
+                writeln!(f)?;
+            }
+
+            if sorted.len() > MAX_GROUPS_SHOWN {
+                // Only the groups that were not listed above count as "the rest"; each
+                // group contributes all of its tenants and `count` layers per tenant.
+                let (rem_tenants, rem_count, rem_bytes) = sorted
+                    .iter()
+                    .rev()
+                    .skip(MAX_GROUPS_SHOWN)
+                    .fold((0usize, 0usize, 0u64), |acc, (count, per_tenant)| {
+                        (
+                            acc.0 + per_tenant.len(),
+                            acc.1 + count * per_tenant.len(),
+                            acc.2 + per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
+                        )
+                    });
+                let rem_bytes = SiBytesAmount(rem_bytes);
+                writeln!(f, "- rest of tenants ({rem_tenants}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
+            }
+
+            Ok(())
+        }
+    }
+}
+
 mod filesystem_level_usage {
    use anyhow::Context;
    use camino::Utf8Path;
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -14,7 +14,10 @@ use hyper::header;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
+use pageserver_api::models::LocationConfigListResponse;
+use pageserver_api::models::ShardParameters;
 use pageserver_api::models::TenantDetails;
+use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskSpawnRequest, LocationConfigMode, TenantAttachRequest,
    TenantLoadRequest, TenantLocationConfigRequest,
@@ -41,12 +44,14 @@ use crate::tenant::mgr::{
    GetTenantError, SetNewTenantConfigError, TenantManager, TenantMapError, TenantMapInsertError,
    TenantSlotError, TenantSlotUpsertError, TenantStateError,
 };
+use crate::tenant::mgr::{TenantSlot, UpsertLocationError};
 use crate::tenant::secondary::SecondaryController;
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::timeline::CompactFlags;
 use crate::tenant::timeline::Timeline;
-use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
+use crate::tenant::SpawnMode;
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError};
 use crate::{config::PageServerConf, tenant::mgr};
 use crate::{disk_usage_eviction_task, tenant};
 use pageserver_api::models::{
@@ -112,14 +117,6 @@ impl State {
            secondary_controller,
        })
    }
-
-    fn tenant_resources(&self) -> TenantSharedResources {
-        TenantSharedResources {
-            broker_client: self.broker_client.clone(),
-            remote_storage: self.remote_storage.clone(),
-            deletion_queue_client: self.deletion_queue_client.clone(),
-        }
-    }
 }

 #[inline(always)]
@@ -152,6 +149,7 @@ impl From<PageReconstructError> for ApiError {
            PageReconstructError::AncestorStopping(_) => {
                ApiError::ResourceUnavailable(format!("{pre}").into())
            }
+            PageReconstructError::AncestorLsnTimeout(e) => ApiError::Timeout(format!("{e}").into()),
            PageReconstructError::WalRedo(pre) => ApiError::InternalServerError(pre),
        }
    }
@@ -174,7 +172,7 @@ impl From<TenantSlotError> for ApiError {
            NotFound(tenant_id) => {
                ApiError::NotFound(anyhow::anyhow!("NotFound: tenant {tenant_id}").into())
            }
-            e @ (AlreadyExists(_, _) | Conflict(_)) => ApiError::Conflict(format!("{e}")),
+            e @ AlreadyExists(_, _) => ApiError::Conflict(format!("{e}")),
            InProgress => {
                ApiError::ResourceUnavailable("Tenant is being modified concurrently".into())
            }
@@ -193,6 +191,18 @@ impl From<TenantSlotUpsertError> for ApiError {
    }
 }

+impl From<UpsertLocationError> for ApiError {
+    fn from(e: UpsertLocationError) -> ApiError {
+        use UpsertLocationError::*;
+        match e {
+            BadRequest(e) => ApiError::BadRequest(e),
+            Unavailable(_) => ApiError::ShuttingDown,
+            e @ InProgress => ApiError::Conflict(format!("{e}")),
+            Flush(e) | Other(e) => ApiError::InternalServerError(e),
+        }
+    }
+}
+
 impl From<TenantMapError> for ApiError {
    fn from(e: TenantMapError) -> ApiError {
        use TenantMapError::*;
@@ -257,7 +267,7 @@ impl From<SetNewTenantConfigError> for ApiError {
            SetNewTenantConfigError::GetTenant(tid) => {
                ApiError::NotFound(anyhow!("tenant {}", tid).into())
            }
-            e @ SetNewTenantConfigError::Persist(_) => {
+            e @ (SetNewTenantConfigError::Persist(_) | SetNewTenantConfigError::Other(_)) => {
                ApiError::InternalServerError(anyhow::Error::new(e))
            }
        }
@@ -315,11 +325,21 @@ impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
    include_non_incremental_logical_size: bool,
+    force_await_initial_logical_size: bool,
    ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();

-    let mut info = build_timeline_info_common(timeline, ctx).await?;
+    if force_await_initial_logical_size {
+        timeline.clone().await_initial_logical_size().await
+    }
+
+    let mut info = build_timeline_info_common(
+        timeline,
+        ctx,
+        tenant::timeline::GetLogicalSizePriority::Background,
+    )
+    .await?;
    if include_non_incremental_logical_size {
        // XXX we should be using spawn_ondemand_logical_size_calculation here.
        // Otherwise, if someone deletes the timeline / detaches the tenant while
@@ -336,6 +356,7 @@ async fn build_timeline_info(
 async fn build_timeline_info_common(
    timeline: &Arc<Timeline>,
    ctx: &RequestContext,
+    logical_size_task_priority: tenant::timeline::GetLogicalSizePriority,
 ) -> anyhow::Result<TimelineInfo> {
    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
    let initdb_lsn = timeline.initdb_lsn;
@@ -358,8 +379,7 @@ async fn build_timeline_info_common(
        Lsn(0) => None,
        lsn @ Lsn(_) => Some(lsn),
    };
-    let current_logical_size =
-        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
+    let current_logical_size = timeline.get_current_logical_size(logical_size_task_priority, ctx);
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn_projected = timeline
@@ -470,7 +490,7 @@ async fn timeline_create_handler(
        .await {
            Ok(new_timeline) => {
                // Created. Construct a TimelineInfo for it.
-                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
+                let timeline_info = build_timeline_info_common(&new_timeline, &ctx, tenant::timeline::GetLogicalSizePriority::User)
                    .await
                    .map_err(ApiError::InternalServerError)?;
                json_response(StatusCode::CREATED, timeline_info)
@@ -506,6 +526,8 @@ async fn timeline_list_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
+    let force_await_initial_logical_size: Option<bool> =
+        parse_query_param(&request, "force-await-initial-logical-size")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
@@ -519,6 +541,7 @@ async fn timeline_list_handler(
            let timeline_info = build_timeline_info(
                &timeline,
                include_non_incremental_logical_size.unwrap_or(false),
+                force_await_initial_logical_size.unwrap_or(false),
                &ctx,
            )
            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
@@ -546,6 +569,8 @@ async fn timeline_detail_handler(
    let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
    let include_non_incremental_logical_size: Option<bool> =
        parse_query_param(&request, "include-non-incremental-logical-size")?;
+    let force_await_initial_logical_size: Option<bool> =
+        parse_query_param(&request, "force-await-initial-logical-size")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    // Logical size calculation needs downloading.
@@ -561,6 +586,7 @@ async fn timeline_detail_handler(
        let timeline_info = build_timeline_info(
            &timeline,
            include_non_incremental_logical_size.unwrap_or(false),
+            force_await_initial_logical_size.unwrap_or(false),
            &ctx,
        )
        .await
@@ -679,16 +705,39 @@ async fn tenant_attach_handler(
        )));
    }

-    mgr::attach_tenant(
-        state.conf,
-        tenant_id,
-        generation,
-        tenant_conf,
-        state.tenant_resources(),
-        &ctx,
-    )
-    .instrument(info_span!("tenant_attach", %tenant_id))
-    .await?;
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+    let shard_params = ShardParameters::default();
+    let location_conf = LocationConf::attached_single(tenant_conf, generation, &shard_params);
+
+    let tenant = state
+        .tenant_manager
+        .upsert_location(
+            tenant_shard_id,
+            location_conf,
+            None,
+            SpawnMode::Normal,
+            &ctx,
+        )
+        .await?;
+
+    let Some(tenant) = tenant else {
+        // This should never happen: indicates a bug in upsert_location
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "Upsert succeeded but didn't return tenant!"
+        )));
+    };
+
+    // We might have successfully constructed a Tenant, but it could still
+    // end up in a broken state:
+    if let TenantState::Broken {
+        reason,
+        backtrace: _,
+    } = tenant.current_state()
+    {
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "Tenant state is Broken: {reason}"
+        )));
+    }

    json_response(StatusCode::ACCEPTED, ())
 }
@@ -829,11 +878,12 @@ async fn tenant_list_handler(
            ApiError::ResourceUnavailable("Tenant map is initializing or shutting down".into())
        })?
        .iter()
-        .map(|(id, state)| TenantInfo {
+        .map(|(id, state, gen)| TenantInfo {
            id: *id,
            state: state.clone(),
            current_physical_size: None,
            attachment_status: state.attachment_status(),
+            generation: (*gen).into(),
        })
        .collect::<Vec<TenantInfo>>();

@@ -863,6 +913,7 @@ async fn tenant_status(
                state: state.clone(),
                current_physical_size: Some(current_physical_size),
                attachment_status: state.attachment_status(),
+                generation: tenant.generation().into(),
            },
            timelines: tenant.list_timeline_ids(),
        })
@@ -1147,17 +1198,26 @@ async fn tenant_create_handler(

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);

-    let new_tenant = mgr::create_tenant(
-        state.conf,
-        tenant_conf,
-        target_tenant_id,
-        generation,
-        state.tenant_resources(),
-        &ctx,
-    )
-    .instrument(info_span!("tenant_create", tenant_id = %target_tenant_id))
-    .await?;
+    let location_conf =
+        LocationConf::attached_single(tenant_conf, generation, &request_data.shard_parameters);

+    let new_tenant = state
+        .tenant_manager
+        .upsert_location(
+            target_tenant_id,
+            location_conf,
+            None,
+            SpawnMode::Create,
+            &ctx,
+        )
+        .await?;
+
+    let Some(new_tenant) = new_tenant else {
+        // This should never happen: indicates a bug in upsert_location
+        return Err(ApiError::InternalServerError(anyhow::anyhow!(
+            "Upsert succeeded but didn't return tenant!"
+        )));
+    };
    // We created the tenant. Existing API semantics are that the tenant
    // is Active when this function returns.
    if let res @ Err(_) = new_tenant
@@ -1165,7 +1225,7 @@ async fn tenant_create_handler(
        .await
    {
        // This shouldn't happen because we just created the tenant directory
-        // in tenant::mgr::create_tenant, and there aren't any remote timelines
+        // in upsert_location, and there aren't any remote timelines
        // to load, so, nothing can really fail during load.
        // Don't do cleanup because we don't know how we got here.
        // The tenant will likely be in `Broken` state and subsequent
@@ -1176,7 +1236,7 @@ async fn tenant_create_handler(

    json_response(
        StatusCode::CREATED,
-        TenantCreateResponse(new_tenant.tenant_id()),
+        TenantCreateResponse(new_tenant.tenant_shard_id().tenant_id),
    )
 }

@@ -1266,16 +1326,57 @@ async fn put_tenant_location_config_handler(

    state
        .tenant_manager
-        .upsert_location(tenant_shard_id, location_conf, flush, &ctx)
-        .await
-        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
-        // principle we might have hit something like concurrent API calls to the same tenant,
-        // which is not a 400 but a 409.
-        .map_err(ApiError::BadRequest)?;
+        .upsert_location(
+            tenant_shard_id,
+            location_conf,
+            flush,
+            tenant::SpawnMode::Normal,
+            &ctx,
+        )
+        .await?;
+
+    if let Some(_flush_ms) = flush {
+        match state
+            .secondary_controller
+            .upload_tenant(tenant_shard_id)
+            .await
+        {
+            Ok(()) => {
+                tracing::info!("Uploaded heatmap during flush");
+            }
+            Err(e) => {
+                tracing::warn!("Failed to flush heatmap: {e}");
+            }
+        }
+    } else {
+        tracing::info!("No flush requested when configuring");
+    }

    json_response(StatusCode::OK, ())
 }

+/// Lists every slot returned by the tenant manager; `InProgress` slots carry `None`.
+async fn list_location_config_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shards = get_state(&request)
+        .tenant_manager
+        .list()
+        .into_iter()
+        .map(|(tenant_shard_id, slot)| {
+            let location_conf = match slot {
+                TenantSlot::Attached(tenant) => Some(tenant.get_location_conf()),
+                TenantSlot::Secondary(secondary) => Some(secondary.get_location_conf()),
+                TenantSlot::InProgress(_) => None,
+            };
+            (tenant_shard_id, location_conf)
+        })
+        .collect();
+
+    json_response(StatusCode::OK, LocationConfigListResponse { tenant_shards })
+}
+
 /// Testing helper to transition a tenant to [`crate::tenant::TenantState::Broken`].
 async fn handle_tenant_break(
    r: Request<Body>,
@@ -1577,12 +1678,13 @@ async fn disk_usage_eviction_run(
        )));
    };

-    let state = state.disk_usage_eviction_state.clone();
+    let eviction_state = state.disk_usage_eviction_state.clone();

    let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-        &state,
+        &eviction_state,
        storage,
        usage,
+        &state.tenant_manager,
        config.eviction_order,
        &cancel,
    )
@@ -1610,6 +1712,21 @@ async fn secondary_upload_handler(
    json_response(StatusCode::OK, ())
 }

+/// Runs `download_tenant` for the shard in the path; failures become `InternalServerError`.
+async fn secondary_download_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let state = get_state(&request);
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    let download = state.secondary_controller.download_tenant(tenant_shard_id);
+    if let Err(e) = download.await {
+        return Err(ApiError::InternalServerError(e));
+    }
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
    json_response(
        StatusCode::NOT_FOUND,
@@ -1802,6 +1919,9 @@ pub fn make_router(
        .put("/v1/tenant/:tenant_shard_id/location_config", |r| {
            api_handler(r, put_tenant_location_config_handler)
        })
+        .get("/v1/location_config", |r| {
+            api_handler(r, list_location_config_handler)
+        })
        .get("/v1/tenant/:tenant_shard_id/timeline", |r| {
            api_handler(r, timeline_list_handler)
        })
@@ -1878,6 +1998,9 @@ pub fn make_router(
        .put("/v1/deletion_queue/flush", |r| {
            api_handler(r, deletion_queue_flush)
        })
+        .post("/v1/tenant/:tenant_shard_id/secondary/download", |r| {
+            api_handler(r, secondary_download_handler)
+        })
        .put("/v1/tenant/:tenant_shard_id/break", |r| {
            testing_api_handler("set tenant state to broken", r, handle_tenant_break)
        })
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -21,6 +21,7 @@ use tracing::*;
 use walkdir::WalkDir;

 use crate::context::RequestContext;
+use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::Timeline;
@@ -312,13 +313,16 @@ async fn import_wal(
        waldecoder.feed_bytes(&buf);

        let mut nrecords = 0;
-        let mut modification = tline.begin_modification(endpoint);
+        let mut modification = tline.begin_modification(last_lsn);
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= endpoint {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                walingest
                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
+                WAL_INGEST.records_committed.inc();
+
+                modification.commit(ctx).await?;
                last_lsn = lsn;

                nrecords += 1;
@@ -448,13 +452,14 @@ pub async fn import_wal_from_tar(

        waldecoder.feed_bytes(&bytes[offset..]);

-        let mut modification = tline.begin_modification(end_lsn);
+        let mut modification = tline.begin_modification(last_lsn);
        let mut decoded = DecodedWALRecord::default();
        while last_lsn <= end_lsn {
            if let Some((lsn, recdata)) = waldecoder.poll_decode()? {
                walingest
                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, ctx)
                    .await?;
+                modification.commit(ctx).await?;
                last_lsn = lsn;

                debug!("imported record at {} (end {})", lsn, end_lsn);
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -117,6 +117,10 @@ pub const TENANT_CONFIG_NAME: &str = "config";
 /// Full path: `tenants/<tenant_id>/config`.
 pub const TENANT_LOCATION_CONFIG_NAME: &str = "config-v1";

+/// Per-tenant copy of their remote heatmap, downloaded into the local
+/// tenant path while in secondary mode.
+pub const TENANT_HEATMAP_BASENAME: &str = "heatmap-v1.json";
+
 /// A suffix used for various temporary files. Any temporary files found in the
 /// data directory at pageserver startup can be automatically removed.
 pub const TEMP_FILE_SUFFIX: &str = "___temp";
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -11,7 +11,7 @@ use once_cell::sync::Lazy;
 use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;

 /// Prometheus histogram buckets (in seconds) for operations in the critical
 /// path. In other words, operations that directly affect that latency of user
@@ -29,7 +29,7 @@ const CRITICAL_OP_BUCKETS: &[f64] = &[
 // Metrics collected on operations on the storage repository.
 #[derive(Debug, EnumVariantNames, IntoStaticStr)]
 #[strum(serialize_all = "kebab_case")]
-pub enum StorageTimeOperation {
+pub(crate) enum StorageTimeOperation {
    #[strum(serialize = "layer flush")]
    LayerFlush,

@@ -55,20 +55,20 @@ pub enum StorageTimeOperation {
    CreateTenant,
 }

-pub static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
+pub(crate) static STORAGE_TIME_SUM_PER_TIMELINE: Lazy<CounterVec> = Lazy::new(|| {
    register_counter_vec!(
        "pageserver_storage_operations_seconds_sum",
        "Total time spent on storage operations with operation, tenant and timeline dimensions",
-        &["operation", "tenant_id", "timeline_id"],
+        &["operation", "tenant_id", "shard_id", "timeline_id"],
    )
    .expect("failed to define a metric")
 });

-pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
+pub(crate) static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_storage_operations_seconds_count",
        "Count of storage operations with operation, tenant and timeline dimensions",
-        &["operation", "tenant_id", "timeline_id"],
+        &["operation", "tenant_id", "shard_id", "timeline_id"],
    )
    .expect("failed to define a metric")
 });
@@ -150,7 +150,7 @@ pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy<IntCounter> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub struct PageCacheMetricsForTaskKind {
+pub(crate) struct PageCacheMetricsForTaskKind {
    pub read_accesses_materialized_page: IntCounter,
    pub read_accesses_immutable: IntCounter,

@@ -159,7 +159,7 @@ pub struct PageCacheMetricsForTaskKind {
    pub read_hits_materialized_page_older_lsn: IntCounter,
 }

-pub struct PageCacheMetrics {
+pub(crate) struct PageCacheMetrics {
    map: EnumMap<TaskKind, EnumMap<PageContentKind, PageCacheMetricsForTaskKind>>,
 }

@@ -181,7 +181,7 @@ static PAGE_CACHE_READ_ACCESSES: Lazy<IntCounterVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
+pub(crate) static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
    map: EnumMap::from_array(std::array::from_fn(|task_kind| {
        let task_kind = <TaskKind as enum_map::Enum>::from_usize(task_kind);
        let task_kind: &'static str = task_kind.into();
@@ -243,10 +243,9 @@ impl PageCacheMetrics {
    }
 }

-pub struct PageCacheSizeMetrics {
+pub(crate) struct PageCacheSizeMetrics {
    pub max_bytes: UIntGauge,

-    pub current_bytes_ephemeral: UIntGauge,
    pub current_bytes_immutable: UIntGauge,
    pub current_bytes_materialized_page: UIntGauge,
 }
@@ -260,31 +259,26 @@ static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define a metric")
 });

-pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
-    max_bytes: {
-        register_uint_gauge!(
-            "pageserver_page_cache_size_max_bytes",
-            "Maximum size of the page cache in bytes"
-        )
-        .expect("failed to define a metric")
-    },
-
-    current_bytes_ephemeral: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
-    current_bytes_immutable: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
-    current_bytes_materialized_page: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
-});
+pub(crate) static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> =
+    Lazy::new(|| PageCacheSizeMetrics {
+        max_bytes: {
+            register_uint_gauge!(
+                "pageserver_page_cache_size_max_bytes",
+                "Maximum size of the page cache in bytes"
+            )
+            .expect("failed to define a metric")
+        },
+        current_bytes_immutable: {
+            PAGE_CACHE_SIZE_CURRENT_BYTES
+                .get_metric_with_label_values(&["immutable"])
+                .unwrap()
+        },
+        current_bytes_materialized_page: {
+            PAGE_CACHE_SIZE_CURRENT_BYTES
+                .get_metric_with_label_values(&["materialized_page"])
+                .unwrap()
+        },
+    });

 pub(crate) mod page_cache_eviction_metrics {
    use std::num::NonZeroUsize;
@@ -343,15 +337,6 @@ pub(crate) mod page_cache_eviction_metrics {
    }
 }

-pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "pageserver_page_cache_acquire_pinned_slot_seconds",
-        "Time spent acquiring a pinned slot in the page cache",
-        CRITICAL_OP_BUCKETS.into(),
-    )
-    .expect("failed to define a metric")
-});
-
 static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "page_cache_errors_total",
@@ -388,7 +373,7 @@ static LAST_RECORD_LSN: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_last_record_lsn",
        "Last record LSN grouped by timeline",
-        &["tenant_id", "timeline_id"]
+        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -397,7 +382,7 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_resident_physical_size",
        "The size of the layer files present in the pageserver's filesystem.",
-        &["tenant_id", "timeline_id"]
+        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -415,7 +400,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
        "pageserver_remote_physical_size",
        "The size of the layer files present in the remote storage that are listed in the the remote index_part.json.",
        // Corollary: If any files are missing from the index part, they won't be included here.
-        &["tenant_id", "timeline_id"]
+        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -448,7 +433,7 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_current_logical_size",
        "Current logical size grouped by timeline",
-        &["tenant_id", "timeline_id"]
+        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define current logical size metric")
 });
@@ -597,7 +582,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_broken_tenants_count",
        "Set of broken tenants",
-        &["tenant_id"]
+        &["tenant_id", "shard_id"]
    )
    .expect("Failed to register pageserver_tenant_states_count metric")
 });
@@ -617,7 +602,7 @@ static NUM_PERSISTENT_FILES_CREATED: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_created_persistent_files_total",
        "Number of files created that are meant to be uploaded to cloud storage",
-        &["tenant_id", "timeline_id"]
+        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -626,7 +611,7 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_written_persistent_bytes_total",
        "Total bytes written that are meant to be uploaded to cloud storage",
-        &["tenant_id", "timeline_id"]
+        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -645,7 +630,7 @@ static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_evictions",
        "Number of layers evicted from the pageserver",
-        &["tenant_id", "timeline_id"]
+        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -740,13 +725,13 @@ pub(crate) static TENANT: Lazy<TenantMetrics> = Lazy::new(|| {

 /// Each `Timeline`'s  [`EVICTIONS_WITH_LOW_RESIDENCE_DURATION`] metric.
 #[derive(Debug)]
-pub struct EvictionsWithLowResidenceDuration {
+pub(crate) struct EvictionsWithLowResidenceDuration {
    data_source: &'static str,
    threshold: Duration,
    counter: Option<IntCounter>,
 }

-pub struct EvictionsWithLowResidenceDurationBuilder {
+pub(crate) struct EvictionsWithLowResidenceDurationBuilder {
    data_source: &'static str,
    threshold: Duration,
 }
@@ -942,7 +927,7 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_io_operations_bytes_total",
        "Total amount of bytes read/written in IO operations",
-        &["operation", "tenant_id", "timeline_id"]
+        &["operation", "tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
 });
@@ -1009,7 +994,7 @@ pub enum SmgrQueryType {
 }

 #[derive(Debug)]
-pub struct SmgrQueryTimePerTimeline {
+pub(crate) struct SmgrQueryTimePerTimeline {
    metrics: [GlobalAndPerTimelineHistogram; SmgrQueryType::COUNT],
 }

@@ -1017,7 +1002,7 @@ static SMGR_QUERY_TIME_PER_TENANT_TIMELINE: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_smgr_query_seconds",
        "Time spent on smgr query handling, aggegated by query type and tenant/timeline.",
-        &["smgr_query_type", "tenant_id", "timeline_id"],
+        &["smgr_query_type", "tenant_id", "shard_id", "timeline_id"],
        CRITICAL_OP_BUCKETS.into(),
    )
    .expect("failed to define a metric")
@@ -1084,8 +1069,9 @@ static SMGR_QUERY_TIME_GLOBAL: Lazy<HistogramVec> = Lazy::new(|| {
 });

 impl SmgrQueryTimePerTimeline {
-    pub(crate) fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
-        let tenant_id = tenant_id.to_string();
+    pub(crate) fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
+        let tenant_id = tenant_shard_id.tenant_id.to_string();
+        let shard_slug = format!("{}", tenant_shard_id.shard_slug());
        let timeline_id = timeline_id.to_string();
        let metrics = std::array::from_fn(|i| {
            let op = SmgrQueryType::from_repr(i).unwrap();
@@ -1093,7 +1079,7 @@ impl SmgrQueryTimePerTimeline {
                .get_metric_with_label_values(&[op.into()])
                .unwrap();
            let per_tenant_timeline = SMGR_QUERY_TIME_PER_TENANT_TIMELINE
-                .get_metric_with_label_values(&[op.into(), &tenant_id, &timeline_id])
+                .get_metric_with_label_values(&[op.into(), &tenant_id, &shard_slug, &timeline_id])
                .unwrap();
            GlobalAndPerTimelineHistogram {
                global,
@@ -1113,6 +1099,7 @@ impl SmgrQueryTimePerTimeline {

 #[cfg(test)]
 mod smgr_query_time_tests {
+    use pageserver_api::shard::TenantShardId;
    use strum::IntoEnumIterator;
    use utils::id::{TenantId, TimelineId};

@@ -1139,7 +1126,10 @@ mod smgr_query_time_tests {
        for op in &ops {
            let tenant_id = TenantId::generate();
            let timeline_id = TimelineId::generate();
-            let metrics = super::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
+            let metrics = super::SmgrQueryTimePerTimeline::new(
+                &TenantShardId::unsharded(tenant_id),
+                &timeline_id,
+            );

            let get_counts = || {
                let global: u64 = ops
@@ -1181,8 +1171,8 @@ static COMPUTE_STARTUP_BUCKETS: Lazy<[f64; 28]> = Lazy::new(|| {
    .map(|ms| (ms as f64) / 1000.0)
 });

-pub struct BasebackupQueryTime(HistogramVec);
-pub static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
+pub(crate) struct BasebackupQueryTime(HistogramVec);
+pub(crate) static BASEBACKUP_QUERY_TIME: Lazy<BasebackupQueryTime> = Lazy::new(|| {
    BasebackupQueryTime({
        register_histogram_vec!(
            "pageserver_basebackup_query_seconds",
@@ -1202,7 +1192,7 @@ impl DurationResultObserver for BasebackupQueryTime {
    }
 }

-pub static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
+pub(crate) static LIVE_CONNECTIONS_COUNT: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "pageserver_live_connections",
        "Number of live network connections",
@@ -1220,7 +1210,13 @@ static REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE: Lazy<IntGaugeVec> = Lazy::
        "Number of ongoing calls to remote timeline client. \
         Used to populate pageserver_remote_timeline_client_calls_started. \
         This metric is not useful for sampling from Prometheus, but useful in tests.",
-        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        &[
+            "tenant_id",
+            "shard_id",
+            "timeline_id",
+            "file_kind",
+            "op_kind"
+        ],
    )
    .expect("failed to define a metric")
 });
@@ -1241,22 +1237,23 @@ static REMOTE_TIMELINE_CLIENT_CALLS_STARTED_HIST: Lazy<HistogramVec> = Lazy::new
    .expect("failed to define a metric")
 });

-static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
+static REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER: Lazy<IntCounterVec> =
+    Lazy::new(|| {
+        register_int_counter_vec!(
        "pageserver_remote_timeline_client_bytes_started",
        "Incremented by the number of bytes associated with a remote timeline client operation. \
         The increment happens when the operation is scheduled.",
-        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
    )
-    .expect("failed to define a metric")
-});
+        .expect("failed to define a metric")
+    });

 static REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "pageserver_remote_timeline_client_bytes_finished",
        "Incremented by the number of bytes associated with a remote timeline client operation. \
         The increment happens when the operation finishes (regardless of success/failure/shutdown).",
-        &["tenant_id", "timeline_id", "file_kind", "op_kind"],
+        &["tenant_id", "shard_id", "timeline_id", "file_kind", "op_kind"],
    )
    .expect("failed to define a metric")
 });
@@ -1369,6 +1366,8 @@ pub(crate) struct SecondaryModeMetrics {
    pub(crate) upload_heatmap: IntCounter,
    pub(crate) upload_heatmap_errors: IntCounter,
    pub(crate) upload_heatmap_duration: Histogram,
+    pub(crate) download_heatmap: IntCounter,
+    pub(crate) download_layer: IntCounter,
 }
 pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| SecondaryModeMetrics {
    upload_heatmap: register_int_counter!(
@@ -1386,6 +1385,16 @@ pub(crate) static SECONDARY_MODE: Lazy<SecondaryModeMetrics> = Lazy::new(|| Seco
        "Time to build and upload a heatmap, including any waiting inside the S3 client"
    )
    .expect("failed to define a metric"),
+    download_heatmap: register_int_counter!(
+        "pageserver_secondary_download_heatmap",
+        "Number of downloads of heatmaps by secondary mode locations"
+    )
+    .expect("failed to define a metric"),
+    download_layer: register_int_counter!(
+        "pageserver_secondary_download_layer",
+        "Number of downloads of layers by secondary mode locations"
+    )
+    .expect("failed to define a metric"),
 });

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
@@ -1655,7 +1664,7 @@ pub(crate) static WAL_REDO_PROCESS_COUNTERS: Lazy<WalRedoProcessCounters> =
    Lazy::new(WalRedoProcessCounters::default);

 /// Similar to `prometheus::HistogramTimer` but does not record on drop.
-pub struct StorageTimeMetricsTimer {
+pub(crate) struct StorageTimeMetricsTimer {
    metrics: StorageTimeMetrics,
    start: Instant,
 }
@@ -1680,7 +1689,7 @@ impl StorageTimeMetricsTimer {
 /// Timing facilities for an globally histogrammed metric, which is supported by per tenant and
 /// timeline total sum and count.
 #[derive(Clone, Debug)]
-pub struct StorageTimeMetrics {
+pub(crate) struct StorageTimeMetrics {
    /// Sum of f64 seconds, per operation, tenant_id and timeline_id
    timeline_sum: Counter,
    /// Number of oeprations, per operation, tenant_id and timeline_id
@@ -1690,14 +1699,19 @@ pub struct StorageTimeMetrics {
 }

 impl StorageTimeMetrics {
-    pub fn new(operation: StorageTimeOperation, tenant_id: &str, timeline_id: &str) -> Self {
+    pub fn new(
+        operation: StorageTimeOperation,
+        tenant_id: &str,
+        shard_id: &str,
+        timeline_id: &str,
+    ) -> Self {
        let operation: &'static str = operation.into();

        let timeline_sum = STORAGE_TIME_SUM_PER_TIMELINE
-            .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
+            .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
            .unwrap();
        let timeline_count = STORAGE_TIME_COUNT_PER_TIMELINE
-            .get_metric_with_label_values(&[operation, tenant_id, timeline_id])
+            .get_metric_with_label_values(&[operation, tenant_id, shard_id, timeline_id])
            .unwrap();
        let global_histogram = STORAGE_TIME_GLOBAL
            .get_metric_with_label_values(&[operation])
@@ -1719,7 +1733,7 @@ impl StorageTimeMetrics {
 }

 #[derive(Debug)]
-pub struct TimelineMetrics {
+pub(crate) struct TimelineMetrics {
    tenant_id: String,
    shard_id: String,
    timeline_id: String,
@@ -1749,40 +1763,66 @@ impl TimelineMetrics {
        let tenant_id = tenant_shard_id.tenant_id.to_string();
        let shard_id = format!("{}", tenant_shard_id.shard_slug());
        let timeline_id = timeline_id.to_string();
-        let flush_time_histo =
-            StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
-        let compact_time_histo =
-            StorageTimeMetrics::new(StorageTimeOperation::Compact, &tenant_id, &timeline_id);
-        let create_images_time_histo =
-            StorageTimeMetrics::new(StorageTimeOperation::CreateImages, &tenant_id, &timeline_id);
-        let logical_size_histo =
-            StorageTimeMetrics::new(StorageTimeOperation::LogicalSize, &tenant_id, &timeline_id);
+        let flush_time_histo = StorageTimeMetrics::new(
+            StorageTimeOperation::LayerFlush,
+            &tenant_id,
+            &shard_id,
+            &timeline_id,
+        );
+        let compact_time_histo = StorageTimeMetrics::new(
+            StorageTimeOperation::Compact,
+            &tenant_id,
+            &shard_id,
+            &timeline_id,
+        );
+        let create_images_time_histo = StorageTimeMetrics::new(
+            StorageTimeOperation::CreateImages,
+            &tenant_id,
+            &shard_id,
+            &timeline_id,
+        );
+        let logical_size_histo = StorageTimeMetrics::new(
+            StorageTimeOperation::LogicalSize,
+            &tenant_id,
+            &shard_id,
+            &timeline_id,
+        );
        let imitate_logical_size_histo = StorageTimeMetrics::new(
            StorageTimeOperation::ImitateLogicalSize,
            &tenant_id,
+            &shard_id,
+            &timeline_id,
+        );
+        let load_layer_map_histo = StorageTimeMetrics::new(
+            StorageTimeOperation::LoadLayerMap,
+            &tenant_id,
+            &shard_id,
+            &timeline_id,
+        );
+        let garbage_collect_histo = StorageTimeMetrics::new(
+            StorageTimeOperation::Gc,
+            &tenant_id,
+            &shard_id,
            &timeline_id,
        );
-        let load_layer_map_histo =
-            StorageTimeMetrics::new(StorageTimeOperation::LoadLayerMap, &tenant_id, &timeline_id);
-        let garbage_collect_histo =
-            StorageTimeMetrics::new(StorageTimeOperation::Gc, &tenant_id, &timeline_id);
        let last_record_gauge = LAST_RECORD_LSN
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
        let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
+        // TODO: we shouldn't expose this metric
        let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
        let num_persistent_files_created = NUM_PERSISTENT_FILES_CREATED
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
        let persistent_bytes_written = PERSISTENT_BYTES_WRITTEN
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
        let evictions = EVICTIONS
-            .get_metric_with_label_values(&[&tenant_id, &timeline_id])
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
            .unwrap();
        let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
            .build(&tenant_id, &shard_id, &timeline_id);
@@ -1836,15 +1876,17 @@ impl Drop for TimelineMetrics {
        let tenant_id = &self.tenant_id;
        let timeline_id = &self.timeline_id;
        let shard_id = &self.shard_id;
-        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
        {
            RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
-            let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+            let _ =
+                RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
        }
-        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, timeline_id]);
-        let _ = EVICTIONS.remove_label_values(&[tenant_id, timeline_id]);
+        let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ =
+            NUM_PERSISTENT_FILES_CREATED.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = PERSISTENT_BYTES_WRITTEN.remove_label_values(&[tenant_id, &shard_id, timeline_id]);
+        let _ = EVICTIONS.remove_label_values(&[tenant_id, &shard_id, timeline_id]);

        self.evictions_with_low_residence_duration
            .write()
@@ -1857,29 +1899,42 @@ impl Drop for TimelineMetrics {
        // outlive an individual smgr connection, but not the timeline.

        for op in StorageTimeOperation::VARIANTS {
-            let _ =
-                STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
-            let _ =
-                STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[op, tenant_id, timeline_id]);
+            let _ = STORAGE_TIME_SUM_PER_TIMELINE.remove_label_values(&[
+                op,
+                tenant_id,
+                shard_id,
+                timeline_id,
+            ]);
+            let _ = STORAGE_TIME_COUNT_PER_TIMELINE.remove_label_values(&[
+                op,
+                tenant_id,
+                shard_id,
+                timeline_id,
+            ]);
        }

        for op in STORAGE_IO_SIZE_OPERATIONS {
-            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, timeline_id]);
+            let _ = STORAGE_IO_SIZE.remove_label_values(&[op, tenant_id, shard_id, timeline_id]);
        }

        for op in SmgrQueryType::iter() {
            let _ = SMGR_QUERY_TIME_PER_TENANT_TIMELINE.remove_label_values(&[
                op.into(),
                tenant_id,
+                shard_id,
                timeline_id,
            ]);
        }
    }
 }

-pub fn remove_tenant_metrics(tenant_id: &TenantId) {
-    let tid = tenant_id.to_string();
-    let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
+pub(crate) fn remove_tenant_metrics(tenant_shard_id: &TenantShardId) {
+    // Only shard zero deals in synthetic sizes
+    if tenant_shard_id.is_zero() {
+        let tid = tenant_shard_id.tenant_id.to_string();
+        let _ = TENANT_SYNTHETIC_SIZE_METRIC.remove_label_values(&[&tid]);
+    }
+
    // we leave the BROKEN_TENANTS_SET entry if any
 }

@@ -1927,8 +1982,9 @@ impl Drop for PerTimelineRemotePhysicalSizeGauge {
    }
 }

-pub struct RemoteTimelineClientMetrics {
+pub(crate) struct RemoteTimelineClientMetrics {
    tenant_id: String,
+    shard_id: String,
    timeline_id: String,
    remote_physical_size_gauge: Mutex<Option<PerTimelineRemotePhysicalSizeGauge>>,
    calls_unfinished_gauge: Mutex<HashMap<(&'static str, &'static str), IntGauge>>,
@@ -1940,6 +1996,7 @@ impl RemoteTimelineClientMetrics {
    pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
        RemoteTimelineClientMetrics {
            tenant_id: tenant_shard_id.tenant_id.to_string(),
+            shard_id: format!("{}", tenant_shard_id.shard_slug()),
            timeline_id: timeline_id.to_string(),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
@@ -1954,8 +2011,9 @@ impl RemoteTimelineClientMetrics {
            PerTimelineRemotePhysicalSizeGauge::new(
                REMOTE_PHYSICAL_SIZE
                    .get_metric_with_label_values(&[
-                        &self.tenant_id.to_string(),
-                        &self.timeline_id.to_string(),
+                        &self.tenant_id,
+                        &self.shard_id,
+                        &self.timeline_id,
                    ])
                    .unwrap(),
            )
@@ -1990,8 +2048,9 @@ impl RemoteTimelineClientMetrics {
        let metric = guard.entry(key).or_insert_with(move || {
            REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE
                .get_metric_with_label_values(&[
-                    &self.tenant_id.to_string(),
-                    &self.timeline_id.to_string(),
+                    &self.tenant_id,
+                    &self.shard_id,
+                    &self.timeline_id,
                    key.0,
                    key.1,
                ])
@@ -2021,8 +2080,9 @@ impl RemoteTimelineClientMetrics {
        let metric = guard.entry(key).or_insert_with(move || {
            REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER
                .get_metric_with_label_values(&[
-                    &self.tenant_id.to_string(),
-                    &self.timeline_id.to_string(),
+                    &self.tenant_id,
+                    &self.shard_id,
+                    &self.timeline_id,
                    key.0,
                    key.1,
                ])
@@ -2041,8 +2101,9 @@ impl RemoteTimelineClientMetrics {
        let metric = guard.entry(key).or_insert_with(move || {
            REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER
                .get_metric_with_label_values(&[
-                    &self.tenant_id.to_string(),
-                    &self.timeline_id.to_string(),
+                    &self.tenant_id,
+                    &self.shard_id,
+                    &self.timeline_id,
                    key.0,
                    key.1,
                ])
@@ -2186,6 +2247,7 @@ impl Drop for RemoteTimelineClientMetrics {
    fn drop(&mut self) {
        let RemoteTimelineClientMetrics {
            tenant_id,
+            shard_id,
            timeline_id,
            remote_physical_size_gauge,
            calls_unfinished_gauge,
@@ -2195,6 +2257,7 @@ impl Drop for RemoteTimelineClientMetrics {
        for ((a, b), _) in calls_unfinished_gauge.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_CALLS_UNFINISHED_GAUGE.remove_label_values(&[
                tenant_id,
+                shard_id,
                timeline_id,
                a,
                b,
@@ -2203,6 +2266,7 @@ impl Drop for RemoteTimelineClientMetrics {
        for ((a, b), _) in bytes_started_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_STARTED_COUNTER.remove_label_values(&[
                tenant_id,
+                shard_id,
                timeline_id,
                a,
                b,
@@ -2211,6 +2275,7 @@ impl Drop for RemoteTimelineClientMetrics {
        for ((a, b), _) in bytes_finished_counter.get_mut().unwrap().drain() {
            let _ = REMOTE_TIMELINE_CLIENT_BYTES_FINISHED_COUNTER.remove_label_values(&[
                tenant_id,
+                shard_id,
                timeline_id,
                a,
                b,
@@ -2218,18 +2283,16 @@ impl Drop for RemoteTimelineClientMetrics {
        }
        {
            let _ = remote_physical_size_gauge; // use to avoid 'unused' warning in desctructuring above
-            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, timeline_id]);
+            let _ = REMOTE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
        }
    }
 }

 /// Wrapper future that measures the time spent by a remote storage operation,
 /// and records the time and success/failure as a prometheus metric.
-pub trait MeasureRemoteOp: Sized {
+pub(crate) trait MeasureRemoteOp: Sized {
    fn measure_remote_op(
        self,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
        file_kind: RemoteOpFileKind,
        op: RemoteOpKind,
        metrics: Arc<RemoteTimelineClientMetrics>,
@@ -2237,8 +2300,6 @@ pub trait MeasureRemoteOp: Sized {
        let start = Instant::now();
        MeasuredRemoteOp {
            inner: self,
-            tenant_id,
-            timeline_id,
            file_kind,
            op,
            start,
@@ -2250,12 +2311,10 @@ pub trait MeasureRemoteOp: Sized {
 impl<T: Sized> MeasureRemoteOp for T {}

 pin_project! {
-    pub struct MeasuredRemoteOp<F>
+    pub(crate) struct MeasuredRemoteOp<F>
    {
        #[pin]
        inner: F,
-        tenant_id: TenantId,
-        timeline_id: TimelineId,
        file_kind: RemoteOpFileKind,
        op: RemoteOpKind,
        start: Instant,
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -550,7 +550,6 @@ impl PageCache {
    // not require changes.

    async fn try_get_pinned_slot_permit(&self) -> anyhow::Result<PinnedSlotsPermit> {
-        let timer = crate::metrics::PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME.start_timer();
        match tokio::time::timeout(
            // Choose small timeout, neon_smgr does its own retries.
            // https://neondb.slack.com/archives/C04DGM6SMTM/p1694786876476869
@@ -563,7 +562,6 @@ impl PageCache {
                res.expect("this semaphore is never closed"),
            )),
            Err(_timeout) => {
-                timer.stop_and_discard();
                crate::metrics::page_cache_errors_inc(
                    crate::metrics::PageCacheErrorKind::AcquirePinnedSlotTimeout,
                );
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -13,7 +13,10 @@ use anyhow::Context;
 use async_compression::tokio::write::GzipEncoder;
 use bytes::Buf;
 use bytes::Bytes;
+use futures::stream::FuturesUnordered;
 use futures::Stream;
+use futures::StreamExt;
+use pageserver_api::key::Key;
 use pageserver_api::models::TenantState;
 use pageserver_api::models::{
    PagestreamBeMessage, PagestreamDbSizeRequest, PagestreamDbSizeResponse,
@@ -21,10 +24,14 @@ use pageserver_api::models::{
    PagestreamFeMessage, PagestreamGetPageRequest, PagestreamGetPageResponse,
    PagestreamNblocksRequest, PagestreamNblocksResponse,
 };
+use pageserver_api::shard::ShardIndex;
+use pageserver_api::shard::{ShardCount, ShardNumber};
 use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
 use pq_proto::{BeMessage, FeMessage, RowDescriptor};
+use std::borrow::Cow;
+use std::collections::HashMap;
 use std::io;
 use std::net::TcpListener;
 use std::pin::pin;
@@ -39,6 +46,7 @@ use tokio_util::sync::CancellationToken;
 use tracing::field;
 use tracing::*;
 use utils::id::ConnectionId;
+use utils::sync::gate::GateGuard;
 use utils::{
    auth::{Claims, Scope, SwappableJwtAuth},
    id::{TenantId, TimelineId},
@@ -53,7 +61,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::rel_block_to_key;
+use crate::pgdatadir_mapping::{rel_block_to_key, Version};
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
@@ -61,6 +69,9 @@ use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
 use crate::tenant::mgr::ShardSelector;
+use crate::tenant::timeline::WaitLsnError;
+use crate::tenant::GetTimelineError;
+use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

@@ -270,6 +281,13 @@ async fn page_service_conn_main(
    }
 }

+/// While a handler holds a reference to a Timeline, it also holds the
+/// timeline's Gate open.
+struct HandlerTimeline {
+    timeline: Arc<Timeline>,
+    _guard: GateGuard,
+}
+
 struct PageServerHandler {
    _conf: &'static PageServerConf,
    broker_client: storage_broker::BrokerClientChannel,
@@ -281,6 +299,72 @@ struct PageServerHandler {
    /// For each query received over the connection,
    /// `process_query` creates a child context from this one.
    connection_ctx: RequestContext,
+
+    /// See [`Self::cache_timeline`] for usage.
+    ///
+    /// Note on size: the typical size of this map is 1.  The largest size we expect
+    /// to see is the number of shards divided by the number of pageservers (typically < 2),
+    /// or the ratio used when splitting shards (i.e. how many children are created from one
+    /// parent shard), where a "large" number might be ~8.
+    shard_timelines: HashMap<ShardIndex, HandlerTimeline>,
+}
+
+#[derive(thiserror::Error, Debug)]
+enum PageStreamError {
+    /// We encountered an error that should prompt the client to reconnect:
+    /// in practice this means we drop the connection without sending a response.
+    #[error("Reconnect required: {0}")]
+    Reconnect(Cow<'static, str>),
+
+    /// We were instructed to shutdown while processing the query
+    #[error("Shutting down")]
+    Shutdown,
+
+    /// Something went wrong reading a page: this likely indicates a pageserver bug
+    #[error("Read error: {0}")]
+    Read(PageReconstructError),
+
+    /// Ran out of time waiting for an LSN
+    #[error("LSN timeout: {0}")]
+    LsnTimeout(WaitLsnError),
+
+    /// The entity required to serve the request (tenant or timeline) is not found,
+    /// or is not found in a suitable state to serve a request.
+    #[error("Not found: {0}")]
+    NotFound(Cow<'static, str>),
+
+    /// Request asked for something that doesn't make sense, like an invalid LSN
+    #[error("Bad request: {0}")]
+    BadRequest(Cow<'static, str>),
+}
+
+impl From<PageReconstructError> for PageStreamError {
+    fn from(value: PageReconstructError) -> Self {
+        match value {
+            PageReconstructError::Cancelled => Self::Shutdown,
+            e => Self::Read(e),
+        }
+    }
+}
+
+impl From<GetActiveTimelineError> for PageStreamError {
+    fn from(value: GetActiveTimelineError) -> Self {
+        match value {
+            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => Self::Shutdown,
+            GetActiveTimelineError::Tenant(e) => Self::NotFound(format!("{e}").into()),
+            GetActiveTimelineError::Timeline(e) => Self::NotFound(format!("{e}").into()),
+        }
+    }
+}
+
+impl From<WaitLsnError> for PageStreamError {
+    fn from(value: WaitLsnError) -> Self {
+        match value {
+            e @ WaitLsnError::Timeout(_) => Self::LsnTimeout(e),
+            WaitLsnError::Shutdown => Self::Shutdown,
+            WaitLsnError::BadState => Self::Reconnect("Timeline is not active".into()),
+        }
+    }
 }

 impl PageServerHandler {
@@ -296,13 +380,57 @@ impl PageServerHandler {
            auth,
            claims: None,
            connection_ctx,
+            shard_timelines: HashMap::new(),
        }
    }

-    /// Wrap PostgresBackend::flush to respect our CancellationToken: it is important to use
-    /// this rather than naked flush() in order to shut down promptly.  Without this, we would
-    /// block shutdown of a tenant if a postgres client was failing to consume bytes we send
-    /// in the flush.
+    /// Future that completes when we need to shut down the connection.
+    ///
+    /// Reasons for need to shut down are:
+    /// - any of the timelines we hold GateGuards for in `shard_timelines` is cancelled
+    /// - task_mgr requests shutdown of the connection
+    ///
+    /// The need to check for `task_mgr` cancellation arises mainly from `handle_pagerequests`
+    /// where, at first, `shard_timelines` is empty, see <https://github.com/neondatabase/neon/pull/6388>
+    ///
+    /// NB: keep in sync with [`Self::is_connection_cancelled`]
+    async fn await_connection_cancelled(&self) {
+        // A short wait before we expend the cycles to walk our timeline map.  This avoids incurring
+        // that cost every time we check for cancellation.
+        tokio::time::sleep(Duration::from_millis(10)).await;
+
+        // This function is never called concurrently with code that adds timelines to shard_timelines,
+        // which is enforced by the borrow checker (the future returned by this function carries the
+        // immutable &self).  So it's fine to evaluate shard_timelines after the sleep, we don't risk
+        // missing any inserts to the map.
+
+        let mut futs = self
+            .shard_timelines
+            .values()
+            .map(|ht| ht.timeline.cancel.cancelled())
+            .collect::<FuturesUnordered<_>>();
+        // An empty `futs` yields `None` immediately; matching `Some(_)` disables that branch instead.
+        tokio::select! {
+            _ = task_mgr::shutdown_watcher() => { }
+            Some(_) = futs.next() => {}
+        }
+    }
+
+    /// Checking variant of [`Self::await_connection_cancelled`].
+    fn is_connection_cancelled(&self) -> bool {
+        task_mgr::is_shutdown_requested()
+            || self
+                .shard_timelines
+                .values()
+                .any(|ht| ht.timeline.cancel.is_cancelled() || ht.timeline.is_stopping())
+    }
+
+    /// This function always respects cancellation of any timeline in [`Self::shard_timelines`].  Pass in
+    /// a cancellation token at the next scope up (such as a tenant cancellation token) to ensure we respect
+    /// cancellation if there aren't any timelines in the cache.
+    ///
+    /// If calling from a function that doesn't use the [`Self::shard_timelines`] cache, then pass in the
+    /// timeline cancellation token.
    async fn flush_cancellable<IO>(
        &self,
        pgb: &mut PostgresBackend<IO>,
@@ -315,6 +443,9 @@ impl PageServerHandler {
            flush_r = pgb.flush() => {
                Ok(flush_r?)
            },
+            _ = self.await_connection_cancelled() => {
+                Err(QueryError::Shutdown)
+            }
            _ = cancel.cancelled() => {
                Err(QueryError::Shutdown)
            }
@@ -390,7 +521,7 @@ impl PageServerHandler {

    #[instrument(skip_all)]
    async fn handle_pagerequests<IO>(
-        &self,
+        &mut self,
        pgb: &mut PostgresBackend<IO>,
        tenant_id: TenantId,
        timeline_id: TimelineId,
@@ -401,10 +532,6 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Note that since one connection may contain getpage requests that target different
-        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
-        // that we look up here may not be the one that serves all the actual requests: we will double
-        // check the mapping of key->shard later before calling into Timeline for getpage requests.
        let tenant = mgr::get_active_tenant_with_timeout(
            tenant_id,
            ShardSelector::First,
@@ -425,27 +552,15 @@ impl PageServerHandler {
            None
        };

-        // Check that the timeline exists
-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| anyhow::anyhow!(e))?;
-
-        // Avoid starting new requests if the timeline has already started shutting down,
-        // and block timeline shutdown until this request is complete, or drops out due
-        // to cancellation.
-        let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-
        // switch client to COPYBOTH
        pgb.write_message_noflush(&BeMessage::CopyBothResponse)?;
-        self.flush_cancellable(pgb, &timeline.cancel).await?;
-
-        let metrics = metrics::SmgrQueryTimePerTimeline::new(&tenant_id, &timeline_id);
+        self.flush_cancellable(pgb, &tenant.cancel).await?;

        loop {
            let msg = tokio::select! {
                biased;

-                _ = timeline.cancel.cancelled() => {
+                _ = self.await_connection_cancelled() => {
                    // We were requested to shut down.
                    info!("shutdown request received in page handler");
                    return Err(QueryError::Shutdown)
@@ -479,40 +594,36 @@ impl PageServerHandler {

            let (response, span) = match neon_fe_msg {
                PagestreamFeMessage::Exists(req) => {
-                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelExists);
                    let span = tracing::info_span!("handle_get_rel_exists_request", rel = %req.rel, req_lsn = %req.lsn);
                    (
-                        self.handle_get_rel_exists_request(&timeline, &req, &ctx)
+                        self.handle_get_rel_exists_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
                            .await,
                        span,
                    )
                }
                PagestreamFeMessage::Nblocks(req) => {
-                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetRelSize);
                    let span = tracing::info_span!("handle_get_nblocks_request", rel = %req.rel, req_lsn = %req.lsn);
                    (
-                        self.handle_get_nblocks_request(&timeline, &req, &ctx)
+                        self.handle_get_nblocks_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
                            .await,
                        span,
                    )
                }
                PagestreamFeMessage::GetPage(req) => {
-                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetPageAtLsn);
                    let span = tracing::info_span!("handle_get_page_at_lsn_request", rel = %req.rel, blkno = %req.blkno, req_lsn = %req.lsn);
                    (
-                        self.handle_get_page_at_lsn_request(&timeline, &req, &ctx)
+                        self.handle_get_page_at_lsn_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
                            .await,
                        span,
                    )
                }
                PagestreamFeMessage::DbSize(req) => {
-                    let _timer = metrics.start_timer(metrics::SmgrQueryType::GetDbSize);
                    let span = tracing::info_span!("handle_db_size_request", dbnode = %req.dbnode, req_lsn = %req.lsn);
                    (
-                        self.handle_db_size_request(&timeline, &req, &ctx)
+                        self.handle_db_size_request(tenant_id, timeline_id, &req, &ctx)
                            .instrument(span.clone())
                            .await,
                        span,
@@ -520,32 +631,44 @@ impl PageServerHandler {
                }
            };

-            if let Err(e) = &response {
-                // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
-                // because wait_lsn etc will drop out
-                // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
-                // is_canceled(): [`Timeline::shutdown`]` has entered
-                if timeline.cancel.is_cancelled() || timeline.is_stopping() {
+            match response {
+                Err(PageStreamError::Shutdown) => {
                    // If we fail to fulfil a request during shutdown, which may be _because_ of
                    // shutdown, then do not send the error to the client.  Instead just drop the
                    // connection.
-                    span.in_scope(|| info!("dropped response during shutdown: {e:#}"));
+                    span.in_scope(|| info!("dropping connection due to shutdown"));
                    return Err(QueryError::Shutdown);
                }
+                Err(PageStreamError::Reconnect(reason)) => {
+                    span.in_scope(|| info!("handler requested reconnect: {reason}"));
+                    return Err(QueryError::Reconnect);
+                }
+                Err(e) if self.is_connection_cancelled() => {
+                    // This branch accommodates code within request handlers that returns an anyhow::Error instead of a clean
+                    // shutdown error, this may be buried inside a PageReconstructError::Other for example.
+                    //
+                    // Requests may fail as soon as we are Stopping, even if the Timeline's cancellation token wasn't fired yet,
+                    // because wait_lsn etc will drop out
+                    // is_stopping(): [`Timeline::flush_and_shutdown`] has entered
+                    // is_cancelled(): [`Timeline::shutdown`] has entered
+                    span.in_scope(|| info!("dropped error response during shutdown: {e:#}"));
+                    return Err(QueryError::Shutdown);
+                }
+                r => {
+                    let response_msg = r.unwrap_or_else(|e| {
+                        // print all the details to the log with {:#}, but for the client the
+                        // error message is enough.  Do not log if shutting down, as the anyhow::Error
+                        // here includes cancellation which is not an error.
+                        span.in_scope(|| error!("error reading relation or page version: {:#}", e));
+                        PagestreamBeMessage::Error(PagestreamErrorResponse {
+                            message: e.to_string(),
+                        })
+                    });
+
+                    pgb.write_message_noflush(&BeMessage::CopyData(&response_msg.serialize()))?;
+                    self.flush_cancellable(pgb, &tenant.cancel).await?;
+                }
            }
-
-            let response = response.unwrap_or_else(|e| {
-                // print the all details to the log with {:#}, but for the client the
-                // error message is enough.  Do not log if shutting down, as the anyhow::Error
-                // here includes cancellation which is not an error.
-                span.in_scope(|| error!("error reading relation or page version: {:#}", e));
-                PagestreamBeMessage::Error(PagestreamErrorResponse {
-                    message: e.to_string(),
-                })
-            });
-
-            pgb.write_message_noflush(&BeMessage::CopyData(&response.serialize()))?;
-            self.flush_cancellable(pgb, &timeline.cancel).await?;
        }
        Ok(())
    }
@@ -692,7 +815,7 @@ impl PageServerHandler {
        latest: bool,
        latest_gc_cutoff_lsn: &RcuReadGuard<Lsn>,
        ctx: &RequestContext,
-    ) -> anyhow::Result<Lsn> {
+    ) -> Result<Lsn, PageStreamError> {
        if latest {
            // Latest page version was requested. If LSN is given, it is a hint
            // to the page server that there have been no modifications to the
@@ -723,31 +846,41 @@ impl PageServerHandler {
            }
        } else {
            if lsn == Lsn(0) {
-                anyhow::bail!("invalid LSN(0) in request");
+                return Err(PageStreamError::BadRequest(
+                    "invalid LSN(0) in request".into(),
+                ));
            }
            timeline.wait_lsn(lsn, ctx).await?;
        }
-        anyhow::ensure!(
-            lsn >= **latest_gc_cutoff_lsn,
-            "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
-            lsn, **latest_gc_cutoff_lsn
-        );
+
+        if lsn < **latest_gc_cutoff_lsn {
+            return Err(PageStreamError::BadRequest(format!(
+                "tried to request a page version that was garbage collected. requested at {} gc cutoff {}",
+                lsn, **latest_gc_cutoff_lsn
+            ).into()));
+        }
        Ok(lsn)
    }

    async fn handle_get_rel_exists_request(
-        &self,
-        timeline: &Timeline,
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
        req: &PagestreamExistsRequest,
        ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
+        let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
+        let _timer = timeline
+            .query_metrics
+            .start_timer(metrics::SmgrQueryType::GetRelExists);
+
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

        let exists = timeline
-            .get_rel_exists(req.rel, lsn, req.latest, ctx)
+            .get_rel_exists(req.rel, Version::Lsn(lsn), req.latest, ctx)
            .await?;

        Ok(PagestreamBeMessage::Exists(PagestreamExistsResponse {
@@ -756,17 +889,26 @@ impl PageServerHandler {
    }

    async fn handle_get_nblocks_request(
-        &self,
-        timeline: &Timeline,
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
        req: &PagestreamNblocksRequest,
        ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
+        let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
+
+        let _timer = timeline
+            .query_metrics
+            .start_timer(metrics::SmgrQueryType::GetRelSize);
+
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

-        let n_blocks = timeline.get_rel_size(req.rel, lsn, req.latest, ctx).await?;
+        let n_blocks = timeline
+            .get_rel_size(req.rel, Version::Lsn(lsn), req.latest, ctx)
+            .await?;

        Ok(PagestreamBeMessage::Nblocks(PagestreamNblocksResponse {
            n_blocks,
@@ -774,18 +916,31 @@ impl PageServerHandler {
    }

    async fn handle_db_size_request(
-        &self,
-        timeline: &Timeline,
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
        req: &PagestreamDbSizeRequest,
        ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
+        let timeline = self.get_timeline_shard_zero(tenant_id, timeline_id).await?;
+
+        let _timer = timeline
+            .query_metrics
+            .start_timer(metrics::SmgrQueryType::GetDbSize);
+
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;

        let total_blocks = timeline
-            .get_db_size(DEFAULTTABLESPACE_OID, req.dbnode, lsn, req.latest, ctx)
+            .get_db_size(
+                DEFAULTTABLESPACE_OID,
+                req.dbnode,
+                Version::Lsn(lsn),
+                req.latest,
+                ctx,
+            )
            .await?;
        let db_size = total_blocks as i64 * BLCKSZ as i64;

@@ -794,68 +949,167 @@ impl PageServerHandler {
        }))
    }

+    /// For most getpage requests, we will already have a Timeline to serve the request: this function
+    /// looks up such a Timeline synchronously and without touching any global state.
+    fn get_cached_timeline_for_page(
+        &mut self,
+        req: &PagestreamGetPageRequest,
+    ) -> Result<&Arc<Timeline>, Key> {
+        let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() {
+            // Fastest path: single sharded case
+            if first_idx.shard_count < ShardCount(2) {
+                return Ok(&first_timeline.timeline);
+            }
+
+            let key = rel_block_to_key(req.rel, req.blkno);
+            let shard_num = first_timeline
+                .timeline
+                .get_shard_identity()
+                .get_shard_number(&key);
+
+            // Fast path: matched the first timeline in our local handler map.  This case is common if
+            // only one shard per tenant is attached to this pageserver.
+            if first_timeline.timeline.get_shard_identity().number == shard_num {
+                return Ok(&first_timeline.timeline);
+            }
+
+            let shard_index = ShardIndex {
+                shard_number: shard_num,
+                shard_count: first_timeline.timeline.get_shard_identity().count,
+            };
+
+            // Fast-ish path: timeline is in the connection handler's local cache
+            if let Some(found) = self.shard_timelines.get(&shard_index) {
+                return Ok(&found.timeline);
+            }
+
+            key
+        } else {
+            rel_block_to_key(req.rel, req.blkno)
+        };
+
+        Err(key)
+    }
+
+    /// Having looked up the [`Timeline`] instance for a particular shard, cache it to enable
+    /// use in future requests without having to traverse [`crate::tenant::mgr::TenantManager`]
+    /// again.
+    ///
+    /// Note that all the Timelines in this cache are for the same timeline_id: they differ
+    /// in which shard they belong to.  When we serve a getpage@lsn request, we choose a shard
+    /// based on key.
+    ///
+    /// The typical size of this cache is 1, as we generally create shards to distribute work
+    /// across pageservers, so don't tend to have multiple shards for the same tenant on the
+    /// same pageserver.
+    fn cache_timeline(
+        &mut self,
+        timeline: Arc<Timeline>,
+    ) -> Result<&Arc<Timeline>, GetActiveTimelineError> {
+        let gate_guard = timeline
+            .gate
+            .enter()
+            .map_err(|_| GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled))?;
+
+        let shard_index = timeline.tenant_shard_id.to_index();
+        let entry = self
+            .shard_timelines
+            .entry(shard_index)
+            .or_insert(HandlerTimeline {
+                timeline,
+                _guard: gate_guard,
+            });
+
+        Ok(&entry.timeline)
+    }
+
+    /// If [`Self::get_cached_timeline_for_page`] missed, then this function is used to populate the cache with
+    /// a Timeline to serve requests for this key, if such a Timeline is present on this pageserver.  If no such
+    /// Timeline is found, then we will return an error (this indicates that the client is talking to the wrong node).
+    async fn load_timeline_for_page(
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+        key: Key,
+    ) -> anyhow::Result<&Arc<Timeline>, GetActiveTimelineError> {
+        // Slow path: we must call out to the TenantManager to find the timeline for this Key
+        let timeline = self
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Page(key))
+            .await?;
+
+        self.cache_timeline(timeline)
+    }
+
+    async fn get_timeline_shard_zero(
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> anyhow::Result<&Arc<Timeline>, GetActiveTimelineError> {
+        // This is a borrow-checker workaround: we can't return from inside of the `if let Some` because
+        // that would be an immutable-borrow-self return, whereas later in the function we will use a mutable
+        // ref to self.  So instead, we first build a bool, and then return while not borrowing self.
+        let have_cached = if let Some((idx, _tl)) = self.shard_timelines.iter().next() {
+            idx.shard_number == ShardNumber(0)
+        } else {
+            false
+        };
+
+        if have_cached {
+            let entry = self.shard_timelines.iter().next().unwrap();
+            Ok(&entry.1.timeline)
+        } else {
+            let timeline = self
+                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+                .await?;
+            Ok(self.cache_timeline(timeline)?)
+        }
+    }
+
    async fn handle_get_page_at_lsn_request(
-        &self,
-        timeline: &Timeline,
+        &mut self,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
        req: &PagestreamGetPageRequest,
        ctx: &RequestContext,
-    ) -> anyhow::Result<PagestreamBeMessage> {
+    ) -> Result<PagestreamBeMessage, PageStreamError> {
+        let timeline = match self.get_cached_timeline_for_page(req) {
+            Ok(tl) => tl,
+            Err(key) => {
+                match self
+                    .load_timeline_for_page(tenant_id, timeline_id, key)
+                    .await
+                {
+                    Ok(t) => t,
+                    Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
+                        // We already know this tenant exists in general, because we resolved it at
+                        // start of connection.  Getting a NotFound here indicates that the shard containing
+                        // the requested page is not present on this node: the client's knowledge of shard->pageserver
+                        // mapping is out of date.
+                        //
+                        // Closing the connection by returning `PageStreamError::Reconnect` has the side effect of rate-limiting the above message, via
+                        // client's reconnect backoff, as well as hopefully prompting the client to load its updated configuration
+                        // and talk to a different pageserver.
+                        return Err(PageStreamError::Reconnect(
+                            "getpage@lsn request routed to wrong shard".into(),
+                        ));
+                    }
+                    Err(e) => return Err(e.into()),
+                }
+            }
+        };
+
+        let _timer = timeline
+            .query_metrics
+            .start_timer(metrics::SmgrQueryType::GetPageAtLsn);
+
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        let lsn =
            Self::wait_or_get_last_lsn(timeline, req.lsn, req.latest, &latest_gc_cutoff_lsn, ctx)
                .await?;
-        /*
-        // Add a 1s delay to some requests. The delay helps the requests to
-        // hit the race condition from github issue #1047 more easily.
-        use rand::Rng;
-        if rand::thread_rng().gen::<u8>() < 5 {
-            std::thread::sleep(std::time::Duration::from_millis(1000));
-        }
-        */

-        let key = rel_block_to_key(req.rel, req.blkno);
-        let page = if timeline.get_shard_identity().is_key_local(&key) {
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
-        } else {
-            // The Tenant shard we looked up at connection start does not hold this particular
-            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
-            // has multiple shards for the same tenant.
-            //
-            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
-            let timeline = match self
-                .get_active_tenant_timeline(
-                    timeline.tenant_shard_id.tenant_id,
-                    timeline.timeline_id,
-                    ShardSelector::Page(key),
-                )
-                .await
-            {
-                Ok(t) => t,
-                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
-                    // We already know this tenant exists in general, because we resolved it at
-                    // start of connection.  Getting a NotFound here indicates that the shard containing
-                    // the requested page is not present on this node.
-
-                    // TODO: this should be some kind of structured error that the client will understand,
-                    // so that it can block until its config is updated: this error is expected in the case
-                    // that the Tenant's shards' placements are being updated and the client hasn't been
-                    // informed yet.
-                    //
-                    // https://github.com/neondatabase/neon/issues/6038
-                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
-                }
-                Err(e) => return Err(e.into()),
-            };
-
-            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
-            // the GateGuard was already held over the whole connection.
-            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
-        };
+        let page = timeline
+            .get_rel_page_at_lsn(req.rel, req.blkno, Version::Lsn(lsn), req.latest, ctx)
+            .await?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
@@ -1000,9 +1254,7 @@ impl PageServerHandler {
        )
        .await
        .map_err(GetActiveTimelineError::Tenant)?;
-        let timeline = tenant
-            .get_timeline(timeline_id, true)
-            .map_err(|e| GetActiveTimelineError::Timeline(anyhow::anyhow!(e)))?;
+        let timeline = tenant.get_timeline(timeline_id, true)?;
        Ok(timeline)
    }
 }
@@ -1411,7 +1663,8 @@ impl From<GetActiveTenantError> for QueryError {
            GetActiveTenantError::WaitForActiveTimeout { .. } => QueryError::Disconnected(
                ConnectionError::Io(io::Error::new(io::ErrorKind::TimedOut, e.to_string())),
            ),
-            GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
+            GetActiveTenantError::Cancelled
+            | GetActiveTenantError::WillNotBecomeActive(TenantState::Stopping { .. }) => {
                QueryError::Shutdown
            }
            e => QueryError::Other(anyhow::anyhow!(e)),
@@ -1424,14 +1677,15 @@ enum GetActiveTimelineError {
    #[error(transparent)]
    Tenant(GetActiveTenantError),
    #[error(transparent)]
-    Timeline(anyhow::Error),
+    Timeline(#[from] GetTimelineError),
 }

 impl From<GetActiveTimelineError> for QueryError {
    fn from(e: GetActiveTimelineError) -> Self {
        match e {
+            GetActiveTimelineError::Tenant(GetActiveTenantError::Cancelled) => QueryError::Shutdown,
            GetActiveTimelineError::Tenant(e) => e.into(),
-            GetActiveTimelineError::Timeline(e) => QueryError::Other(e),
+            GetActiveTimelineError::Timeline(e) => QueryError::NotFound(format!("{e}").into()),
        }
    }
 }
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -11,10 +11,10 @@ use crate::context::RequestContext;
 use crate::keyspace::{KeySpace, KeySpaceAccum};
 use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
-use anyhow::Context;
+use anyhow::{ensure, Context};
 use bytes::{Buf, Bytes};
 use pageserver_api::key::is_rel_block_key;
-use pageserver_api::reltag::{RelTag, SlruKind};
+use pageserver_api::reltag::{BlockNumber, RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
 use postgres_ffi::{Oid, TimestampTz, TransactionId};
@@ -27,9 +27,6 @@ use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::{bin_ser::BeSer, lsn::Lsn};

-/// Block number within a relation or SLRU. This matches PostgreSQL's BlockNumber type.
-pub type BlockNumber = u32;
-
 #[derive(Debug)]
 pub enum LsnForTimestamp {
    /// Found commits both before and after the given timestamp
@@ -147,6 +144,7 @@ impl Timeline {
    {
        DatadirModification {
            tline: self,
+            pending_lsns: Vec::new(),
            pending_updates: HashMap::new(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
@@ -159,11 +157,11 @@ impl Timeline {
    //------------------------------------------------------------------------------

    /// Look up given page version.
-    pub async fn get_rel_page_at_lsn(
+    pub(crate) async fn get_rel_page_at_lsn(
        &self,
        tag: RelTag,
        blknum: BlockNumber,
-        lsn: Lsn,
+        version: Version<'_>,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
@@ -173,44 +171,47 @@ impl Timeline {
            ));
        }

-        let nblocks = self.get_rel_size(tag, lsn, latest, ctx).await?;
+        let nblocks = self.get_rel_size(tag, version, latest, ctx).await?;
        if blknum >= nblocks {
            debug!(
                "read beyond EOF at {} blk {} at {}, size is {}: returning all-zeros page",
-                tag, blknum, lsn, nblocks
+                tag,
+                blknum,
+                version.get_lsn(),
+                nblocks
            );
            return Ok(ZERO_PAGE.clone());
        }

        let key = rel_block_to_key(tag, blknum);
-        self.get(key, lsn, ctx).await
+        version.get(self, key, ctx).await
    }

    // Get size of a database in blocks
-    pub async fn get_db_size(
+    pub(crate) async fn get_db_size(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        lsn: Lsn,
+        version: Version<'_>,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<usize, PageReconstructError> {
        let mut total_blocks = 0;

-        let rels = self.list_rels(spcnode, dbnode, lsn, ctx).await?;
+        let rels = self.list_rels(spcnode, dbnode, version, ctx).await?;

        for rel in rels {
-            let n_blocks = self.get_rel_size(rel, lsn, latest, ctx).await?;
+            let n_blocks = self.get_rel_size(rel, version, latest, ctx).await?;
            total_blocks += n_blocks as usize;
        }
        Ok(total_blocks)
    }

    /// Get size of a relation file
-    pub async fn get_rel_size(
+    pub(crate) async fn get_rel_size(
        &self,
        tag: RelTag,
-        lsn: Lsn,
+        version: Version<'_>,
        latest: bool,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
@@ -220,12 +221,12 @@ impl Timeline {
            ));
        }

-        if let Some(nblocks) = self.get_cached_rel_size(&tag, lsn) {
+        if let Some(nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
            return Ok(nblocks);
        }

        if (tag.forknum == FSM_FORKNUM || tag.forknum == VISIBILITYMAP_FORKNUM)
-            && !self.get_rel_exists(tag, lsn, latest, ctx).await?
+            && !self.get_rel_exists(tag, version, latest, ctx).await?
        {
            // FIXME: Postgres sometimes calls smgrcreate() to create
            // FSM, and smgrnblocks() on it immediately afterwards,
@@ -235,7 +236,7 @@ impl Timeline {
        }

        let key = rel_size_to_key(tag);
-        let mut buf = self.get(key, lsn, ctx).await?;
+        let mut buf = version.get(self, key, ctx).await?;
        let nblocks = buf.get_u32_le();

        if latest {
@@ -246,16 +247,16 @@ impl Timeline {
            // latest=true, then it can not cause cache corruption, because with latest=true
            // pageserver choose max(request_lsn, last_written_lsn) and so cached value will be
            // associated with most recent value of LSN.
-            self.update_cached_rel_size(tag, lsn, nblocks);
+            self.update_cached_rel_size(tag, version.get_lsn(), nblocks);
        }
        Ok(nblocks)
    }

    /// Does relation exist?
-    pub async fn get_rel_exists(
+    pub(crate) async fn get_rel_exists(
        &self,
        tag: RelTag,
-        lsn: Lsn,
+        version: Version<'_>,
        _latest: bool,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
@@ -266,12 +267,12 @@ impl Timeline {
        }

        // first try to lookup relation in cache
-        if let Some(_nblocks) = self.get_cached_rel_size(&tag, lsn) {
+        if let Some(_nblocks) = self.get_cached_rel_size(&tag, version.get_lsn()) {
            return Ok(true);
        }
        // fetch directory listing
        let key = rel_dir_to_key(tag.spcnode, tag.dbnode);
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;

        match RelDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -287,16 +288,16 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub async fn list_rels(
+    pub(crate) async fn list_rels(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        lsn: Lsn,
+        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<HashSet<RelTag>, PageReconstructError> {
        // fetch directory listing
        let key = rel_dir_to_key(spcnode, dbnode);
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;

        match RelDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -315,7 +316,7 @@ impl Timeline {
    }

    /// Look up given SLRU page version.
-    pub async fn get_slru_page_at_lsn(
+    pub(crate) async fn get_slru_page_at_lsn(
        &self,
        kind: SlruKind,
        segno: u32,
@@ -328,29 +329,29 @@ impl Timeline {
    }

    /// Get size of an SLRU segment
-    pub async fn get_slru_segment_size(
+    pub(crate) async fn get_slru_segment_size(
        &self,
        kind: SlruKind,
        segno: u32,
-        lsn: Lsn,
+        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<BlockNumber, PageReconstructError> {
        let key = slru_segment_size_to_key(kind, segno);
-        let mut buf = self.get(key, lsn, ctx).await?;
+        let mut buf = version.get(self, key, ctx).await?;
        Ok(buf.get_u32_le())
    }

    /// Get size of an SLRU segment
-    pub async fn get_slru_segment_exists(
+    pub(crate) async fn get_slru_segment_exists(
        &self,
        kind: SlruKind,
        segno: u32,
-        lsn: Lsn,
+        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<bool, PageReconstructError> {
        // fetch directory listing
        let key = slru_dir_to_key(kind);
-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;

        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => {
@@ -368,7 +369,7 @@ impl Timeline {
    /// so it's not well defined which LSN you get if there were multiple commits
    /// "in flight" at that point in time.
    ///
-    pub async fn find_lsn_for_timestamp(
+    pub(crate) async fn find_lsn_for_timestamp(
        &self,
        search_timestamp: TimestampTz,
        cancel: &CancellationToken,
@@ -448,7 +449,7 @@ impl Timeline {
    /// Additionally, sets 'found_smaller'/'found_Larger, if encounters any commits
    /// with a smaller/larger timestamp.
    ///
-    pub async fn is_latest_commit_timestamp_ge_than(
+    pub(crate) async fn is_latest_commit_timestamp_ge_than(
        &self,
        search_timestamp: TimestampTz,
        probe_lsn: Lsn,
@@ -471,7 +472,7 @@ impl Timeline {
    /// Obtain the possible timestamp range for the given lsn.
    ///
    /// If the lsn has no timestamps, returns None. returns `(min, max, median)` if it has timestamps.
-    pub async fn get_timestamp_for_lsn(
+    pub(crate) async fn get_timestamp_for_lsn(
        &self,
        probe_lsn: Lsn,
        ctx: &RequestContext,
@@ -501,11 +502,11 @@ impl Timeline {
        mut f: impl FnMut(TimestampTz) -> ControlFlow<T>,
    ) -> Result<T, PageReconstructError> {
        for segno in self
-            .list_slru_segments(SlruKind::Clog, probe_lsn, ctx)
+            .list_slru_segments(SlruKind::Clog, Version::Lsn(probe_lsn), ctx)
            .await?
        {
            let nblocks = self
-                .get_slru_segment_size(SlruKind::Clog, segno, probe_lsn, ctx)
+                .get_slru_segment_size(SlruKind::Clog, segno, Version::Lsn(probe_lsn), ctx)
                .await?;
            for blknum in (0..nblocks).rev() {
                let clog_page = self
@@ -528,36 +529,36 @@ impl Timeline {
    }

    /// Get a list of SLRU segments
-    pub async fn list_slru_segments(
+    pub(crate) async fn list_slru_segments(
        &self,
        kind: SlruKind,
-        lsn: Lsn,
+        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<HashSet<u32>, PageReconstructError> {
        // fetch directory entry
        let key = slru_dir_to_key(kind);

-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;
        match SlruSegmentDirectory::des(&buf).context("deserialization failure") {
            Ok(dir) => Ok(dir.segments),
            Err(e) => Err(PageReconstructError::from(e)),
        }
    }

-    pub async fn get_relmap_file(
+    pub(crate) async fn get_relmap_file(
        &self,
        spcnode: Oid,
        dbnode: Oid,
-        lsn: Lsn,
+        version: Version<'_>,
        ctx: &RequestContext,
    ) -> Result<Bytes, PageReconstructError> {
        let key = relmap_file_key(spcnode, dbnode);

-        let buf = self.get(key, lsn, ctx).await?;
+        let buf = version.get(self, key, ctx).await?;
        Ok(buf)
    }

-    pub async fn list_dbdirs(
+    pub(crate) async fn list_dbdirs(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -571,7 +572,7 @@ impl Timeline {
        }
    }

-    pub async fn get_twophase_file(
+    pub(crate) async fn get_twophase_file(
        &self,
        xid: TransactionId,
        lsn: Lsn,
@@ -582,7 +583,7 @@ impl Timeline {
        Ok(buf)
    }

-    pub async fn list_twophase_files(
+    pub(crate) async fn list_twophase_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -596,7 +597,7 @@ impl Timeline {
        }
    }

-    pub async fn get_control_file(
+    pub(crate) async fn get_control_file(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -604,7 +605,7 @@ impl Timeline {
        self.get(CONTROLFILE_KEY, lsn, ctx).await
    }

-    pub async fn get_checkpoint(
+    pub(crate) async fn get_checkpoint(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -612,7 +613,7 @@ impl Timeline {
        self.get(CHECKPOINT_KEY, lsn, ctx).await
    }

-    pub async fn list_aux_files(
+    pub(crate) async fn list_aux_files(
        &self,
        lsn: Lsn,
        ctx: &RequestContext,
@@ -652,7 +653,10 @@ impl Timeline {

        let mut total_size: u64 = 0;
        for (spcnode, dbnode) in dbdir.dbdirs.keys() {
-            for rel in self.list_rels(*spcnode, *dbnode, lsn, ctx).await? {
+            for rel in self
+                .list_rels(*spcnode, *dbnode, Version::Lsn(lsn), ctx)
+                .await?
+            {
                if self.cancel.is_cancelled() {
                    return Err(CalculateLogicalSizeError::Cancelled);
                }
@@ -692,7 +696,7 @@ impl Timeline {
            result.add_key(rel_dir_to_key(spcnode, dbnode));

            let mut rels: Vec<RelTag> = self
-                .list_rels(spcnode, dbnode, lsn, ctx)
+                .list_rels(spcnode, dbnode, Version::Lsn(lsn), ctx)
                .await?
                .into_iter()
                .collect();
@@ -799,18 +803,39 @@ pub struct DatadirModification<'a> {
    /// in the state in 'tline' yet.
    pub tline: &'a Timeline,

-    /// Lsn assigned by begin_modification
-    pub lsn: Lsn,
+    /// Current LSN of the modification
+    lsn: Lsn,

    // The modifications are not applied directly to the underlying key-value store.
    // The put-functions add the modifications here, and they are flushed to the
    // underlying key-value store by the 'finish' function.
-    pending_updates: HashMap<Key, Value>,
-    pending_deletions: Vec<Range<Key>>,
+    pending_lsns: Vec<Lsn>,
+    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
+    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,
 }

 impl<'a> DatadirModification<'a> {
+    /// Get the current lsn
+    pub(crate) fn get_lsn(&self) -> Lsn {
+        self.lsn
+    }
+
+    /// Set the current lsn. Moving backwards is an error; advancing queues the old lsn in `pending_lsns`.
+    pub(crate) fn set_lsn(&mut self, lsn: Lsn) -> anyhow::Result<()> {
+        ensure!(
+            lsn >= self.lsn,
+            "setting an older lsn {} than {} is not allowed",
+            lsn,
+            self.lsn
+        );
+        if lsn > self.lsn {
+            self.pending_lsns.push(self.lsn);
+            self.lsn = lsn;
+        }
+        Ok(())
+    }
+
    /// Initialize a completely new repository.
    ///
    /// This inserts the directory metadata entries that are assumed to
@@ -984,11 +1009,9 @@ impl<'a> DatadirModification<'a> {
        dbnode: Oid,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let req_lsn = self.tline.get_last_record_lsn();
-
        let total_blocks = self
            .tline
-            .get_db_size(spcnode, dbnode, req_lsn, true, ctx)
+            .get_db_size(spcnode, dbnode, Version::Modified(self), true, ctx)
            .await?;

        // Remove entry from dbdir
@@ -1077,8 +1100,11 @@ impl<'a> DatadirModification<'a> {
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        anyhow::ensure!(rel.relnode != 0, RelationError::InvalidRelnode);
-        let last_lsn = self.tline.get_last_record_lsn();
-        if self.tline.get_rel_exists(rel, last_lsn, true, ctx).await? {
+        if self
+            .tline
+            .get_rel_exists(rel, Version::Modified(self), true, ctx)
+            .await?
+        {
            let size_key = rel_size_to_key(rel);
            // Fetch the old size first
            let old_size = self.get(size_key, ctx).await?.get_u32_le();
@@ -1323,17 +1349,23 @@ impl<'a> DatadirModification<'a> {
        let writer = self.tline.writer().await;

        // Flush relation and  SLRU data blocks, keep metadata.
-        let mut retained_pending_updates = HashMap::new();
-        for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(&key) || is_slru_block_key(key) {
-                // This bails out on first error without modifying pending_updates.
-                // That's Ok, cf this function's doc comment.
-                writer.put(key, self.lsn, &value, ctx).await?;
-            } else {
-                retained_pending_updates.insert(key, value);
+        let mut retained_pending_updates = HashMap::<_, Vec<_>>::new();
+        for (key, values) in self.pending_updates.drain() {
+            for (lsn, value) in values {
+                if is_rel_block_key(&key) || is_slru_block_key(key) {
+                    // This bails out on first error without modifying pending_updates.
+                    // That's Ok, cf this function's doc comment.
+                    writer.put(key, lsn, &value, ctx).await?;
+                } else {
+                    retained_pending_updates
+                        .entry(key)
+                        .or_default()
+                        .push((lsn, value));
+                }
            }
        }
-        self.pending_updates.extend(retained_pending_updates);
+
+        self.pending_updates = retained_pending_updates;

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1350,18 +1382,28 @@ impl<'a> DatadirModification<'a> {
    ///
    pub async fn commit(&mut self, ctx: &RequestContext) -> anyhow::Result<()> {
        let writer = self.tline.writer().await;
-        let lsn = self.lsn;
+
        let pending_nblocks = self.pending_nblocks;
        self.pending_nblocks = 0;

-        for (key, value) in self.pending_updates.drain() {
-            writer.put(key, lsn, &value, ctx).await?;
-        }
-        for key_range in self.pending_deletions.drain(..) {
-            writer.delete(key_range, lsn).await?;
+        if !self.pending_updates.is_empty() {
+            writer.put_batch(&self.pending_updates, ctx).await?;
+            self.pending_updates.clear();
        }

-        writer.finish_write(lsn);
+        if !self.pending_deletions.is_empty() {
+            writer.delete_batch(&self.pending_deletions).await?;
+            self.pending_deletions.clear();
+        }
+
+        self.pending_lsns.push(self.lsn);
+        for pending_lsn in self.pending_lsns.drain(..) {
+            // Ideally, we should be able to call writer.finish_write() only once
+            // with the highest LSN. However, the last_record_lsn variable in the
+            // timeline keeps track of the latest LSN and the immediate previous LSN
+            // so we need to record every LSN to not leave a gap between them.
+            writer.finish_write(pending_lsn);
+        }

        if pending_nblocks != 0 {
            writer.update_current_logical_size(pending_nblocks * i64::from(BLCKSZ));
@@ -1370,44 +1412,86 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

-    pub(crate) fn is_empty(&self) -> bool {
-        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
+    pub(crate) fn len(&self) -> usize {
+        self.pending_updates.len() + self.pending_deletions.len()
    }

    // Internal helper functions to batch the modifications

    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
-        // Have we already updated the same key? Read the pending updated
+        // Have we already updated the same key? Read the latest pending updated
        // version in that case.
        //
        // Note: we don't check pending_deletions. It is an error to request a
        // value that has been removed, deletion only avoids leaking storage.
-        if let Some(value) = self.pending_updates.get(&key) {
-            if let Value::Image(img) = value {
-                Ok(img.clone())
-            } else {
-                // Currently, we never need to read back a WAL record that we
-                // inserted in the same "transaction". All the metadata updates
-                // work directly with Images, and we never need to read actual
-                // data pages. We could handle this if we had to, by calling
-                // the walredo manager, but let's keep it simple for now.
-                Err(PageReconstructError::from(anyhow::anyhow!(
-                    "unexpected pending WAL record"
-                )))
+        if let Some(values) = self.pending_updates.get(&key) {
+            if let Some((_, value)) = values.last() {
+                return if let Value::Image(img) = value {
+                    Ok(img.clone())
+                } else {
+                    // Currently, we never need to read back a WAL record that we
+                    // inserted in the same "transaction". All the metadata updates
+                    // work directly with Images, and we never need to read actual
+                    // data pages. We could handle this if we had to, by calling
+                    // the walredo manager, but let's keep it simple for now.
+                    Err(PageReconstructError::from(anyhow::anyhow!(
+                        "unexpected pending WAL record"
+                    )))
+                };
            }
-        } else {
-            let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
-            self.tline.get(key, lsn, ctx).await
        }
+        let lsn = Lsn::max(self.tline.get_last_record_lsn(), self.lsn);
+        self.tline.get(key, lsn, ctx).await
    }

    fn put(&mut self, key: Key, val: Value) {
-        self.pending_updates.insert(key, val);
+        let values = self.pending_updates.entry(key).or_default();
+        // Replace the previous value if it exists at the same lsn
+        if let Some((last_lsn, last_value)) = values.last_mut() {
+            if *last_lsn == self.lsn {
+                *last_value = val;
+                return;
+            }
+        }
+        values.push((self.lsn, val));
    }

    fn delete(&mut self, key_range: Range<Key>) {
        trace!("DELETE {}-{}", key_range.start, key_range.end);
-        self.pending_deletions.push(key_range);
+        self.pending_deletions.push((key_range, self.lsn));
+    }
+}
+
+/// This struct facilitates accessing either a committed key from the timeline at a
+/// specific LSN, or the latest uncommitted key from a pending modification.
+/// During WAL ingestion, the records from multiple LSNs may be batched in the same
+/// modification before being flushed to the timeline. Hence, the routines in WalIngest
+/// need to look up the keys in the modification first before looking them up in the
+/// timeline to not miss the latest updates.
+#[derive(Clone, Copy)]
+pub enum Version<'a> {
+    Lsn(Lsn),
+    Modified(&'a DatadirModification<'a>),
+}
+
+impl<'a> Version<'a> {
+    async fn get(
+        &self,
+        timeline: &Timeline,
+        key: Key,
+        ctx: &RequestContext,
+    ) -> Result<Bytes, PageReconstructError> {
+        match self {
+            Version::Lsn(lsn) => timeline.get(key, *lsn, ctx).await,
+            Version::Modified(modification) => modification.get(key, ctx).await,
+        }
+    }
+
+    fn get_lsn(&self) -> Lsn {
+        match self {
+            Version::Lsn(lsn) => *lsn,
+            Version::Modified(modification) => modification.lsn,
+        }
    }
 }

@@ -1776,21 +1860,6 @@ pub fn is_inherited_key(key: Key) -> bool {
    key != AUX_FILES_KEY
 }

-/// Guaranteed to return `Ok()` if [[is_rel_block_key]] returns `true` for `key`.
-pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
-    Ok(match key.field1 {
-        0x00 => (
-            RelTag {
-                spcnode: key.field2,
-                dbnode: key.field3,
-                relnode: key.field4,
-                forknum: key.field5,
-            },
-            key.field6,
-        ),
-        _ => anyhow::bail!("unexpected value kind 0x{:02x}", key.field1),
-    })
-}
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }
--- a/pageserver/src/statvfs.rs
+++ b/pageserver/src/statvfs.rs
@@ -23,7 +23,7 @@ impl Statvfs {
    }

    // NB: allow() because the block count type is u32 on macOS.
-    #[allow(clippy::useless_conversion)]
+    #[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
    pub fn blocks(&self) -> u64 {
        match self {
            Statvfs::Real(stat) => u64::try_from(stat.blocks()).unwrap(),
@@ -32,7 +32,7 @@ impl Statvfs {
    }

    // NB: allow() because the block count type is u32 on macOS.
-    #[allow(clippy::useless_conversion)]
+    #[allow(clippy::useless_conversion, clippy::unnecessary_fallible_conversions)]
    pub fn blocks_available(&self) -> u64 {
        match self {
            Statvfs::Real(stat) => u64::try_from(stat.blocks_available()).unwrap(),
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -147,7 +147,7 @@ pub(crate) static BACKGROUND_RUNTIME_WORKER_THREADS: Lazy<usize> = Lazy::new(||
    // else, but that has not been needed in a long time.
    std::env::var("TOKIO_WORKER_THREADS")
        .map(|s| s.parse::<usize>().unwrap())
-        .unwrap_or_else(|_e| usize::max(1, num_cpus::get()))
+        .unwrap_or_else(|_e| usize::max(2, num_cpus::get()))
 });

 #[derive(Debug, Clone, Copy)]
@@ -258,6 +258,9 @@ pub enum TaskKind {
    /// See [`crate::disk_usage_eviction_task`].
    DiskUsageEviction,

+    /// See [`crate::tenant::secondary`].
+    SecondaryDownloads,
+
    /// See [`crate::tenant::secondary`].
    SecondaryUploads,

--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -12,11 +12,13 @@
 //!

 use anyhow::{bail, Context};
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8Path;
+use camino::Utf8PathBuf;
 use enumset::EnumSet;
 use futures::stream::FuturesUnordered;
 use futures::FutureExt;
 use futures::StreamExt;
+use pageserver_api::models;
 use pageserver_api::models::TimelineState;
 use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::TenantShardId;
@@ -56,6 +58,7 @@ use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use self::timeline::TimelineResources;
+use self::timeline::WaitLsnError;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::deletion_queue::DeletionQueueClient;
@@ -71,6 +74,7 @@ use crate::tenant::config::LocationMode;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 pub use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::remote_timeline_client::remote_initdb_archive_path;
 use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
 use crate::tenant::remote_timeline_client::INITDB_PATH;
 use crate::tenant::storage_layer::DeltaLayer;
@@ -108,7 +112,7 @@ use toml_edit;
 use utils::{
    crashsafe,
    generation::Generation,
-    id::{TenantId, TimelineId},
+    id::TimelineId,
    lsn::{Lsn, RecordLsn},
 };

@@ -129,6 +133,13 @@ macro_rules! pausable_failpoint {
            .expect("spawn_blocking");
        }
    };
+    ($name:literal, $cond:expr) => {
+        if cfg!(feature = "testing") {
+            if $cond {
+                pausable_failpoint!($name)
+            }
+        }
+    };
 }

 pub mod blob_io;
@@ -360,13 +371,13 @@ impl WalRedoManager {
 pub enum GetTimelineError {
    #[error("Timeline {tenant_id}/{timeline_id} is not active, state: {state:?}")]
    NotActive {
-        tenant_id: TenantId,
+        tenant_id: TenantShardId,
        timeline_id: TimelineId,
        state: TimelineState,
    },
    #[error("Timeline {tenant_id}/{timeline_id} was not found")]
    NotFound {
-        tenant_id: TenantId,
+        tenant_id: TenantShardId,
        timeline_id: TimelineId,
    },
 }
@@ -595,10 +606,9 @@ impl Tenant {
        mode: SpawnMode,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Tenant>> {
-        // TODO(sharding): make WalRedoManager shard-aware
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
            conf,
-            tenant_shard_id.tenant_id,
+            tenant_shard_id,
        )));

        let TenantSharedResources {
@@ -1003,7 +1013,7 @@ impl Tenant {
        // IndexPart is the source of truth.
        self.clean_up_timelines(&existent_timelines)?;

-        failpoint_support::sleep_millis_async!("attach-before-activate");
+        failpoint_support::sleep_millis_async!("attach-before-activate", &self.cancel);

        info!("Done");

@@ -1145,10 +1155,9 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        reason: String,
    ) -> Arc<Tenant> {
-        // TODO(sharding): make WalRedoManager shard-aware
        let wal_redo_manager = Arc::new(WalRedoManager::from(PostgresRedoManager::new(
            conf,
-            tenant_shard_id.tenant_id,
+            tenant_shard_id,
        )));
        Arc::new(Tenant::new(
            TenantState::Broken {
@@ -1508,10 +1517,6 @@ impl Tenant {
            .map_err(LoadLocalTimelineError::Load)
    }

-    pub(crate) fn tenant_id(&self) -> TenantId {
-        self.tenant_shard_id.tenant_id
-    }
-
    pub(crate) fn tenant_shard_id(&self) -> TenantShardId {
        self.tenant_shard_id
    }
@@ -1527,13 +1532,13 @@ impl Tenant {
        let timeline = timelines_accessor
            .get(&timeline_id)
            .ok_or(GetTimelineError::NotFound {
-                tenant_id: self.tenant_shard_id.tenant_id,
+                tenant_id: self.tenant_shard_id,
                timeline_id,
            })?;

        if active_only && !timeline.is_active() {
            Err(GetTimelineError::NotActive {
-                tenant_id: self.tenant_shard_id.tenant_id,
+                tenant_id: self.tenant_shard_id,
                timeline_id,
                state: timeline.current_state(),
            })
@@ -1760,7 +1765,15 @@ impl Tenant {
                    // decoding the new WAL might need to look up previous pages, relation
                    // sizes etc. and that would get confused if the previous page versions
                    // are not in the repository yet.
-                    ancestor_timeline.wait_lsn(*lsn, ctx).await?;
+                    ancestor_timeline
+                        .wait_lsn(*lsn, ctx)
+                        .await
+                        .map_err(|e| match e {
+                            e @ (WaitLsnError::Timeout(_) | WaitLsnError::BadState) => {
+                                CreateTimelineError::AncestorLsn(anyhow::anyhow!(e))
+                            }
+                            WaitLsnError::Shutdown => CreateTimelineError::ShuttingDown,
+                        })?;
                }

                self.branch_timeline(
@@ -1916,6 +1929,10 @@ impl Tenant {
        self.current_state() == TenantState::Active
    }

+    pub fn generation(&self) -> Generation {
+        self.generation
+    }
+
    /// Changes tenant status to active, unless shutdown was already requested.
    ///
    /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
@@ -2029,6 +2046,13 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

+        // If we're still attaching, fire the cancellation token early to drop out: this
+        // will prevent us flushing, but ensures timely shutdown if some I/O during attach
+        // is very slow.
+        if matches!(self.current_state(), TenantState::Attaching) {
+            self.cancel.cancel();
+        }
+
        match self.set_stopping(shutdown_progress, false, false).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
@@ -2298,6 +2322,32 @@ impl Tenant {
            .clone()
    }

+    /// For API access: generate a LocationConfig equivalent to the one that would be used to
+    /// create a Tenant in the same state.  Do not use this in hot paths: it's for relatively
+    /// rare external API calls, like a reconciliation at startup.
+    pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
+        let conf = self.tenant_conf.read().unwrap();
+
+        let location_config_mode = match conf.location.attach_mode {
+            AttachmentMode::Single => models::LocationConfigMode::AttachedSingle,
+            AttachmentMode::Multi => models::LocationConfigMode::AttachedMulti,
+            AttachmentMode::Stale => models::LocationConfigMode::AttachedStale,
+        };
+
+        // We have a pageserver TenantConf, we need the API-facing TenantConfig.
+        let tenant_config: models::TenantConfig = conf.tenant_conf.into();
+
+        models::LocationConfig {
+            mode: location_config_mode,
+            generation: self.generation.into(),
+            secondary_conf: None,
+            shard_number: self.shard_identity.number.0,
+            shard_count: self.shard_identity.count.0,
+            shard_stripe_size: self.shard_identity.stripe_size.0,
+            tenant_conf: tenant_config,
+        }
+    }
+
    pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
        &self.tenant_shard_id
    }
@@ -2543,7 +2593,9 @@ impl Tenant {
        let (state, mut rx) = watch::channel(state);

        tokio::spawn(async move {
+            // Strings for metric labels
            let tid = tenant_shard_id.to_string();
+            let shard_id_str = format!("{}", tenant_shard_id.shard_slug());

            fn inspect_state(state: &TenantState) -> ([&'static str; 1], bool) {
                ([state.into()], matches!(state, TenantState::Broken { .. }))
@@ -2556,13 +2608,15 @@ impl Tenant {
                // the tenant might be ignored and reloaded, so first remove any previous set
                // element. it most likely has already been scraped, as these are manual operations
                // right now. most likely we will add it back very soon.
-                drop(crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid]));
+                drop(
+                    crate::metrics::BROKEN_TENANTS_SET.remove_label_values(&[&tid, &shard_id_str]),
+                );
                false
            } else {
                // add the id to the set right away, there should not be any updates on the channel
                // after
                crate::metrics::BROKEN_TENANTS_SET
-                    .with_label_values(&[&tid])
+                    .with_label_values(&[&tid, &shard_id_str])
                    .set(1);
                true
            };
@@ -2588,7 +2642,7 @@ impl Tenant {
                    counted_broken = true;
                    // insert the tenant_id (back) into the set
                    crate::metrics::BROKEN_TENANTS_SET
-                        .with_label_values(&[&tid])
+                        .with_label_values(&[&tid, &shard_id_str])
                        .inc();
                }
            }
@@ -2648,10 +2702,11 @@ impl Tenant {
                }
            }

-            // Legacy configs are implicitly in attached state
+            // Legacy configs are implicitly in attached state, and do not support sharding
            Ok(LocationConf::attached_single(
                tenant_conf,
                Generation::none(),
+                &models::ShardParameters::default(),
            ))
        } else {
            // FIXME If the config file is not found, assume that we're attaching
@@ -2727,6 +2782,10 @@ impl Tenant {
 "#
        .to_string();

+        fail::fail_point!("tenant-config-before-write", |_| {
+            anyhow::bail!("tenant-config-before-write");
+        });
+
        // Convert the config to a toml file.
        conf_content += &toml_edit::ser::to_string_pretty(&location_conf)?;

@@ -3152,6 +3211,55 @@ impl Tenant {
        .await
    }

+    async fn upload_initdb(
+        &self,
+        timelines_path: &Utf8PathBuf,
+        pgdata_path: &Utf8PathBuf,
+        timeline_id: &TimelineId,
+    ) -> anyhow::Result<()> {
+        let Some(storage) = &self.remote_storage else {
+            // No remote storage?  No upload.
+            return Ok(());
+        };
+
+        let temp_path = timelines_path.join(format!(
+            "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
+        ));
+
+        scopeguard::defer! {
+            if let Err(e) = fs::remove_file(&temp_path) {
+                error!("Failed to remove temporary initdb archive '{temp_path}': {e}");
+            }
+        }
+
+        let (pgdata_zstd, tar_zst_size) =
+            import_datadir::create_tar_zst(pgdata_path, &temp_path).await?;
+
+        pausable_failpoint!("before-initdb-upload");
+
+        backoff::retry(
+            || async {
+                self::remote_timeline_client::upload_initdb_dir(
+                    storage,
+                    &self.tenant_shard_id.tenant_id,
+                    timeline_id,
+                    pgdata_zstd.try_clone().await?,
+                    tar_zst_size,
+                    &self.cancel,
+                )
+                .await
+            },
+            |_| false,
+            3,
+            u32::MAX,
+            "persist_initdb_tar_zst",
+            backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
+        )
+        .await?;
+
+        Ok(())
+    }
+
    /// - run initdb to init temporary instance and get bootstrap data
    /// - after initialization completes, tar up the temp dir and upload it to S3.
    ///
@@ -3191,6 +3299,18 @@ impl Tenant {
            let Some(storage) = &self.remote_storage else {
                bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
            };
+            if existing_initdb_timeline_id != timeline_id {
+                let source_path = &remote_initdb_archive_path(
+                    &self.tenant_shard_id.tenant_id,
+                    &existing_initdb_timeline_id,
+                );
+                let dest_path =
+                    &remote_initdb_archive_path(&self.tenant_shard_id.tenant_id, &timeline_id);
+                storage
+                    .copy_object(source_path, dest_path)
+                    .await
+                    .context("copy initdb tar")?;
+            }
            let (initdb_tar_zst_path, initdb_tar_zst) =
                self::remote_timeline_client::download_initdb_tar_zst(
                    self.conf,
@@ -3201,66 +3321,26 @@ impl Tenant {
                )
                .await
                .context("download initdb tar")?;
+
+            scopeguard::defer! {
+                if let Err(e) = fs::remove_file(&initdb_tar_zst_path) {
+                    error!("Failed to remove temporary initdb archive '{initdb_tar_zst_path}': {e}");
+                }
+            }
+
            let buf_read =
                BufReader::with_capacity(remote_timeline_client::BUFFER_SIZE, initdb_tar_zst);
            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
                .await
                .context("extract initdb tar")?;
-
-            tokio::fs::remove_file(&initdb_tar_zst_path)
-                .await
-                .or_else(|e| {
-                    if e.kind() == std::io::ErrorKind::NotFound {
-                        // If something else already removed the file, ignore the error
-                        Ok(())
-                    } else {
-                        Err(e)
-                    }
-                })
-                .with_context(|| format!("tempfile removal {initdb_tar_zst_path}"))?;
        } else {
-            // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
+            // Initialize a temporary repo to get bootstrap data; this creates a directory at `pgdata_path`
            run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;

            // Upload the created data dir to S3
-            if let Some(storage) = &self.remote_storage {
-                let temp_path = timelines_path.join(format!(
-                    "{INITDB_PATH}.upload-{timeline_id}.{TEMP_FILE_SUFFIX}"
-                ));
-
-                let (pgdata_zstd, tar_zst_size) =
-                    import_datadir::create_tar_zst(&pgdata_path, &temp_path).await?;
-                backoff::retry(
-                    || async {
-                        self::remote_timeline_client::upload_initdb_dir(
-                            storage,
-                            &self.tenant_shard_id.tenant_id,
-                            &timeline_id,
-                            pgdata_zstd.try_clone().await?,
-                            tar_zst_size,
-                            &self.cancel,
-                        )
-                        .await
-                    },
-                    |_| false,
-                    3,
-                    u32::MAX,
-                    "persist_initdb_tar_zst",
-                    backoff::Cancel::new(self.cancel.clone(), || anyhow::anyhow!("Cancelled")),
-                )
-                .await?;
-
-                tokio::fs::remove_file(&temp_path)
-                    .await
-                    .or_else(|e| {
-                        if e.kind() == std::io::ErrorKind::NotFound {
-                            // If something else already removed the file, ignore the error
-                            Ok(())
-                        } else {
-                            Err(e)
-                        }
-                    })
-                    .with_context(|| format!("tempfile removal {temp_path}"))?;
+            if self.tenant_shard_id().is_zero() {
+                self.upload_initdb(&timelines_path, &pgdata_path, &timeline_id)
+                    .await?;
            }
        }
        let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();
@@ -3549,6 +3629,9 @@ impl Tenant {
        self.cached_synthetic_tenant_size
            .store(size, Ordering::Relaxed);

+        // Only shard zero should be calculating synthetic sizes
+        debug_assert!(self.shard_identity.is_zero());
+
        TENANT_SYNTHETIC_SIZE_METRIC
            .get_metric_with_label_values(&[&self.tenant_shard_id.tenant_id.to_string()])
            .unwrap()
@@ -3643,140 +3726,6 @@ fn remove_timeline_and_uninit_mark(
    Ok(())
 }

-pub(crate) async fn create_tenant_files(
-    conf: &'static PageServerConf,
-    location_conf: &LocationConf,
-    tenant_shard_id: &TenantShardId,
-) -> anyhow::Result<Utf8PathBuf> {
-    let target_tenant_directory = conf.tenant_path(tenant_shard_id);
-    anyhow::ensure!(
-        !target_tenant_directory
-            .try_exists()
-            .context("check existence of tenant directory")?,
-        "tenant directory already exists",
-    );
-
-    let temporary_tenant_dir =
-        path_with_suffix_extension(&target_tenant_directory, TEMP_FILE_SUFFIX);
-    debug!("Creating temporary directory structure in {temporary_tenant_dir}");
-
-    // top-level dir may exist if we are creating it through CLI
-    crashsafe::create_dir_all(&temporary_tenant_dir).with_context(|| {
-        format!("could not create temporary tenant directory {temporary_tenant_dir}")
-    })?;
-
-    let creation_result = try_create_target_tenant_dir(
-        conf,
-        location_conf,
-        tenant_shard_id,
-        &temporary_tenant_dir,
-        &target_tenant_directory,
-    )
-    .await;
-
-    if creation_result.is_err() {
-        error!(
-            "Failed to create directory structure for tenant {tenant_shard_id}, cleaning tmp data"
-        );
-        if let Err(e) = fs::remove_dir_all(&temporary_tenant_dir) {
-            error!("Failed to remove temporary tenant directory {temporary_tenant_dir:?}: {e}")
-        } else if let Err(e) = crashsafe::fsync(&temporary_tenant_dir) {
-            error!(
-                "Failed to fsync removed temporary tenant directory {temporary_tenant_dir:?}: {e}"
-            )
-        }
-    }
-
-    creation_result?;
-
-    Ok(target_tenant_directory)
-}
-
-async fn try_create_target_tenant_dir(
-    conf: &'static PageServerConf,
-    location_conf: &LocationConf,
-    tenant_shard_id: &TenantShardId,
-    temporary_tenant_dir: &Utf8Path,
-    target_tenant_directory: &Utf8Path,
-) -> Result<(), anyhow::Error> {
-    let temporary_tenant_timelines_dir = rebase_directory(
-        &conf.timelines_path(tenant_shard_id),
-        target_tenant_directory,
-        temporary_tenant_dir,
-    )
-    .with_context(|| format!("resolve tenant {tenant_shard_id} temporary timelines dir"))?;
-    let temporary_legacy_tenant_config_path = rebase_directory(
-        &conf.tenant_config_path(tenant_shard_id),
-        target_tenant_directory,
-        temporary_tenant_dir,
-    )
-    .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?;
-    let temporary_tenant_config_path = rebase_directory(
-        &conf.tenant_location_config_path(tenant_shard_id),
-        target_tenant_directory,
-        temporary_tenant_dir,
-    )
-    .with_context(|| format!("resolve tenant {tenant_shard_id} temporary config path"))?;
-
-    Tenant::persist_tenant_config_at(
-        tenant_shard_id,
-        &temporary_tenant_config_path,
-        &temporary_legacy_tenant_config_path,
-        location_conf,
-    )
-    .await?;
-
-    crashsafe::create_dir(&temporary_tenant_timelines_dir).with_context(|| {
-        format!(
-            "create tenant {} temporary timelines directory {}",
-            tenant_shard_id, temporary_tenant_timelines_dir,
-        )
-    })?;
-    fail::fail_point!("tenant-creation-before-tmp-rename", |_| {
-        anyhow::bail!("failpoint tenant-creation-before-tmp-rename");
-    });
-
-    // Make sure the current tenant directory entries are durable before renaming.
-    // Without this, a crash may reorder any of the directory entry creations above.
-    crashsafe::fsync(temporary_tenant_dir)
-        .with_context(|| format!("sync temporary tenant directory {temporary_tenant_dir:?}"))?;
-
-    fs::rename(temporary_tenant_dir, target_tenant_directory).with_context(|| {
-        format!(
-            "move tenant {} temporary directory {} into the permanent one {}",
-            tenant_shard_id, temporary_tenant_dir, target_tenant_directory
-        )
-    })?;
-    let target_dir_parent = target_tenant_directory.parent().with_context(|| {
-        format!(
-            "get tenant {} dir parent for {}",
-            tenant_shard_id, target_tenant_directory,
-        )
-    })?;
-    crashsafe::fsync(target_dir_parent).with_context(|| {
-        format!(
-            "fsync renamed directory's parent {} for tenant {}",
-            target_dir_parent, tenant_shard_id,
-        )
-    })?;
-
-    Ok(())
-}
-
-fn rebase_directory(
-    original_path: &Utf8Path,
-    base: &Utf8Path,
-    new_base: &Utf8Path,
-) -> anyhow::Result<Utf8PathBuf> {
-    let relative_path = original_path.strip_prefix(base).with_context(|| {
-        format!(
-            "Failed to strip base prefix '{}' off path '{}'",
-            base, original_path
-        )
-    })?;
-    Ok(new_base.join(relative_path))
-}
-
 /// Create the cluster temporarily in 'initdbpath' directory inside the repository
 /// to get bootstrap data for timeline initialization.
 async fn run_initdb(
@@ -3834,7 +3783,7 @@ async fn run_initdb(

 impl Drop for Tenant {
    fn drop(&mut self) {
-        remove_tenant_metrics(&self.tenant_shard_id.tenant_id);
+        remove_tenant_metrics(&self.tenant_shard_id);
    }
 }
 /// Dump contents of a layer file to stdout.
@@ -3871,7 +3820,9 @@ pub async fn dump_layerfile_from_path(
 #[cfg(test)]
 pub(crate) mod harness {
    use bytes::{Bytes, BytesMut};
+    use camino::Utf8PathBuf;
    use once_cell::sync::OnceCell;
+    use pageserver_api::models::ShardParameters;
    use pageserver_api::shard::ShardIndex;
    use std::fs;
    use std::sync::Arc;
@@ -3938,8 +3889,6 @@ pub(crate) mod harness {
    pub struct TenantHarness {
        pub conf: &'static PageServerConf,
        pub tenant_conf: TenantConf,
-        // TODO(sharding): remove duplicative `tenant_id` in favor of access to tenant_shard_id
-        pub(crate) tenant_id: TenantId,
        pub tenant_shard_id: TenantShardId,
        pub generation: Generation,
        pub shard: ShardIndex,
@@ -4001,7 +3950,6 @@ pub(crate) mod harness {
            Ok(Self {
                conf,
                tenant_conf,
-                tenant_id,
                tenant_shard_id,
                generation: Generation::new(0xdeadbeef),
                shard: ShardIndex::unsharded(),
@@ -4059,6 +4007,7 @@ pub(crate) mod harness {
                AttachedTenantConf::try_from(LocationConf::attached_single(
                    TenantConfOpt::from(self.tenant_conf),
                    self.generation,
+                    &ShardParameters::default(),
                ))
                .unwrap(),
                // This is a legacy/test code path: sharding isn't supported here.
@@ -5262,7 +5211,7 @@ mod tests {
                assert_eq!(
                    e,
                    GetTimelineError::NotFound {
-                        tenant_id: tenant.tenant_shard_id.tenant_id,
+                        tenant_id: tenant.tenant_shard_id,
                        timeline_id: TIMELINE_ID,
                    }
                )
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -10,6 +10,7 @@
 //!
 use anyhow::bail;
 use pageserver_api::models;
+use pageserver_api::models::EvictionPolicy;
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -46,6 +47,8 @@ pub mod defaults {
    pub const DEFAULT_WALRECEIVER_LAGGING_WAL_TIMEOUT: &str = "10 seconds";
    pub const DEFAULT_MAX_WALRECEIVER_LSN_WAL_LAG: u64 = 10 * 1024 * 1024;
    pub const DEFAULT_EVICTIONS_LOW_RESIDENCE_DURATION_METRIC_THRESHOLD: &str = "24 hour";
+
+    pub const DEFAULT_INGEST_BATCH_SIZE: u64 = 100;
 }

 #[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
@@ -165,14 +168,17 @@ impl LocationConf {
    /// For use when loading from a legacy configuration: presence of a tenant
    /// implies it is in AttachmentMode::Single, which used to be the only
    /// possible state.  This function should eventually be removed.
-    pub(crate) fn attached_single(tenant_conf: TenantConfOpt, generation: Generation) -> Self {
+    pub(crate) fn attached_single(
+        tenant_conf: TenantConfOpt,
+        generation: Generation,
+        shard_params: &models::ShardParameters,
+    ) -> Self {
        Self {
            mode: LocationMode::Attached(AttachedLocationConfig {
                generation,
                attach_mode: AttachmentMode::Single,
            }),
-            // Legacy configuration loads are always from tenants created before sharding existed.
-            shard: ShardIdentity::unsharded(),
+            shard: ShardIdentity::from_params(ShardNumber(0), shard_params),
            tenant_conf,
        }
    }
@@ -426,30 +432,6 @@ pub struct TenantConfOpt {
    pub heatmap_period: Option<Duration>,
 }

-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-#[serde(tag = "kind")]
-pub enum EvictionPolicy {
-    NoEviction,
-    LayerAccessThreshold(EvictionPolicyLayerAccessThreshold),
-}
-
-impl EvictionPolicy {
-    pub fn discriminant_str(&self) -> &'static str {
-        match self {
-            EvictionPolicy::NoEviction => "NoEviction",
-            EvictionPolicy::LayerAccessThreshold(_) => "LayerAccessThreshold",
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
-pub struct EvictionPolicyLayerAccessThreshold {
-    #[serde(with = "humantime_serde")]
-    pub period: Duration,
-    #[serde(with = "humantime_serde")]
-    pub threshold: Duration,
-}
-
 impl TenantConfOpt {
    pub fn merge(&self, global_conf: TenantConf) -> TenantConf {
        TenantConf {
@@ -574,6 +556,38 @@ impl TryFrom<toml_edit::Item> for TenantConfOpt {
    }
 }

+/// This is a conversion from our internal tenant config object to the one used
+/// in external APIs.
+impl From<TenantConfOpt> for models::TenantConfig {
+    fn from(value: TenantConfOpt) -> Self {
+        fn humantime(d: Duration) -> String {
+            format!("{}s", d.as_secs())
+        }
+        Self {
+            checkpoint_distance: value.checkpoint_distance,
+            checkpoint_timeout: value.checkpoint_timeout.map(humantime),
+            compaction_target_size: value.compaction_target_size,
+            compaction_period: value.compaction_period.map(humantime),
+            compaction_threshold: value.compaction_threshold,
+            gc_horizon: value.gc_horizon,
+            gc_period: value.gc_period.map(humantime),
+            image_creation_threshold: value.image_creation_threshold,
+            pitr_interval: value.pitr_interval.map(humantime),
+            walreceiver_connect_timeout: value.walreceiver_connect_timeout.map(humantime),
+            lagging_wal_timeout: value.lagging_wal_timeout.map(humantime),
+            max_lsn_wal_lag: value.max_lsn_wal_lag,
+            trace_read_requests: value.trace_read_requests,
+            eviction_policy: value.eviction_policy,
+            min_resident_size_override: value.min_resident_size_override,
+            evictions_low_residence_duration_metric_threshold: value
+                .evictions_low_residence_duration_metric_threshold
+                .map(humantime),
+            gc_feedback: value.gc_feedback,
+            heatmap_period: value.heatmap_period.map(humantime),
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -542,6 +542,7 @@ impl DeleteTenantFlow {
        )
        .await?;

+        pausable_failpoint!("tenant-delete-before-cleanup-remaining-fs-traces-pausable");
        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
            Err(anyhow::anyhow!(
                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
@@ -588,7 +589,7 @@ impl DeleteTenantFlow {
                            }
                            break;
                        }
-                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary(_)) => {
                            // This is unexpected: this secondary tenants should not have been created, and we
                            // are not in a position to shut it down from here.
                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -3,7 +3,8 @@

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
 use pageserver_api::key::Key;
-use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
+use pageserver_api::models::ShardParameters;
+use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
@@ -35,7 +36,7 @@ use crate::tenant::config::{
 };
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
-use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
+use crate::tenant::{AttachedTenantConf, SpawnMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME, TEMP_FILE_SUFFIX};

 use utils::crashsafe::path_with_suffix_extension;
@@ -44,6 +45,7 @@ use utils::generation::Generation;
 use utils::id::{TenantId, TimelineId};

 use super::delete::DeleteTenantError;
+use super::secondary::SecondaryTenant;
 use super::TenantSharedResources;

 /// For a tenant that appears in TenantsMap, it may either be
@@ -55,9 +57,10 @@ use super::TenantSharedResources;
 /// that way we avoid having to carefully switch a tenant's ingestion etc on and off during
 /// its lifetime, and we can preserve some important safety invariants like `Tenant` always
 /// having a properly acquired generation (Secondary doesn't need a generation)
+#[derive(Clone)]
 pub(crate) enum TenantSlot {
    Attached(Arc<Tenant>),
-    Secondary,
+    Secondary(Arc<SecondaryTenant>),
    /// In this state, other administrative operations acting on the TenantId should
    /// block, or return a retry indicator equivalent to HTTP 503.
    InProgress(utils::completion::Barrier),
@@ -67,7 +70,7 @@ impl std::fmt::Debug for TenantSlot {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Attached(tenant) => write!(f, "Attached({})", tenant.current_state()),
-            Self::Secondary => write!(f, "Secondary"),
+            Self::Secondary(_) => write!(f, "Secondary"),
            Self::InProgress(_) => write!(f, "InProgress"),
        }
    }
@@ -78,7 +81,7 @@ impl TenantSlot {
    fn get_attached(&self) -> Option<&Arc<Tenant>> {
        match self {
            Self::Attached(t) => Some(t),
-            Self::Secondary => None,
+            Self::Secondary(_) => None,
            Self::InProgress(_) => None,
        }
    }
@@ -130,7 +133,7 @@ impl TenantsMap {

    /// A page service client sends a TenantId, and to look up the correct Tenant we must
    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_shard(
+    fn resolve_attached_shard(
        &self,
        tenant_id: &TenantId,
        selector: ShardSelector,
@@ -140,25 +143,27 @@ impl TenantsMap {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    // Ignore all slots that don't contain an attached tenant
+                    let tenant = match &slot.1 {
+                        TenantSlot::Attached(t) => t,
+                        _ => continue,
+                    };
+
                    match selector {
                        ShardSelector::First => return Some(*slot.0),
                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
                            return Some(*slot.0)
                        }
                        ShardSelector::Page(key) => {
-                            if let Some(tenant) = slot.1.get_attached() {
-                                // First slot we see for this tenant, calculate the expected shard number
-                                // for the key: we will use this for checking if this and subsequent
-                                // slots contain the key, rather than recalculating the hash each time.
-                                if want_shard.is_none() {
-                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                                }
+                            // First slot we see for this tenant, calculate the expected shard number
+                            // for the key: we will use this for checking if this and subsequent
+                            // slots contain the key, rather than recalculating the hash each time.
+                            if want_shard.is_none() {
+                                want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                            }

-                                if Some(tenant.shard_identity.number) == want_shard {
-                                    return Some(*slot.0);
-                                }
-                            } else {
-                                continue;
+                            if Some(tenant.shard_identity.number) == want_shard {
+                                return Some(*slot.0);
                            }
                        }
                        _ => continue,
@@ -464,12 +469,20 @@ pub async fn init_tenant_mgr(
                *gen
            } else {
                match &location_conf.mode {
-                    LocationMode::Secondary(_) => {
+                    LocationMode::Secondary(secondary_config) => {
                        // We do not require the control plane's permission for secondary mode
                        // tenants, because they do no remote writes and hence require no
                        // generation number
                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
-                        tenants.insert(tenant_shard_id, TenantSlot::Secondary);
+                        tenants.insert(
+                            tenant_shard_id,
+                            TenantSlot::Secondary(SecondaryTenant::new(
+                                tenant_shard_id,
+                                location_conf.shard,
+                                location_conf.tenant_conf,
+                                secondary_config,
+                            )),
+                        );
                    }
                    LocationMode::Attached(_) => {
                        // TODO: augment re-attach API to enable the control plane to
@@ -661,8 +674,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {

                            total_attached += 1;
                        }
-                        TenantSlot::Secondary => {
-                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary);
+                        TenantSlot::Secondary(state) => {
+                            // We don't need to wait for this individually per-tenant: the
+                            // downloader task will be waited on eventually, this cancel
+                            // is just to encourage it to drop out if it is doing work
+                            // for this tenant right now.
+                            state.cancel.cancel();
+
+                            shutdown_state.insert(tenant_shard_id, TenantSlot::Secondary(state));
                        }
                        TenantSlot::InProgress(notify) => {
                            // InProgress tenants are not visible in TenantsMap::ShuttingDown: we will
@@ -739,51 +758,14 @@ async fn shutdown_all_tenants0(tenants: &std::sync::RwLock<TenantsMap>) {
    // caller will log how long we took
 }

-pub(crate) async fn create_tenant(
-    conf: &'static PageServerConf,
-    tenant_conf: TenantConfOpt,
-    tenant_shard_id: TenantShardId,
-    generation: Generation,
-    resources: TenantSharedResources,
-    ctx: &RequestContext,
-) -> Result<Arc<Tenant>, TenantMapInsertError> {
-    let location_conf = LocationConf::attached_single(tenant_conf, generation);
-    info!("Creating tenant at location {location_conf:?}");
-
-    let slot_guard =
-        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
-    let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
-
-    let shard_identity = location_conf.shard;
-    let created_tenant = tenant_spawn(
-        conf,
-        tenant_shard_id,
-        &tenant_path,
-        resources,
-        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
-        None,
-        &TENANTS,
-        SpawnMode::Create,
-        ctx,
-    )?;
-    // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
-    //      See https://github.com/neondatabase/neon/issues/4233
-
-    let created_tenant_id = created_tenant.tenant_id();
-    debug_assert_eq!(created_tenant_id, tenant_shard_id.tenant_id);
-
-    slot_guard.upsert(TenantSlot::Attached(created_tenant.clone()))?;
-
-    Ok(created_tenant)
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum SetNewTenantConfigError {
    #[error(transparent)]
    GetTenant(#[from] GetTenantError),
    #[error(transparent)]
    Persist(anyhow::Error),
+    #[error(transparent)]
+    Other(anyhow::Error),
 }

 pub(crate) async fn set_new_tenant_config(
@@ -797,10 +779,21 @@ pub(crate) async fn set_new_tenant_config(
    info!("configuring tenant {tenant_id}");
    let tenant = get_tenant(tenant_shard_id, true)?;

+    if tenant.tenant_shard_id().shard_count > ShardCount(0) {
+        // Note that we use ShardParameters::default below.
+        return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
+            "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
+        )));
+    }
+
    // This is a legacy API that only operates on attached tenants: the preferred
    // API to use is the location_config/ endpoint, which lets the caller provide
    // the full LocationConf.
-    let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
+    let location_conf = LocationConf::attached_single(
+        new_tenant_conf,
+        tenant.generation,
+        &ShardParameters::default(),
+    );

    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
        .await
@@ -809,6 +802,24 @@ pub(crate) async fn set_new_tenant_config(
    Ok(())
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum UpsertLocationError {
+    #[error("Bad config request: {0}")]
+    BadRequest(anyhow::Error),
+
+    #[error("Cannot change config in this state: {0}")]
+    Unavailable(#[from] TenantMapError),
+
+    #[error("Tenant is already being modified")]
+    InProgress,
+
+    #[error("Failed to flush: {0}")]
+    Flush(anyhow::Error),
+
+    #[error("Internal error: {0}")]
+    Other(#[from] anyhow::Error),
+}
+
 impl TenantManager {
    /// Convenience function so that anyone with a TenantManager can get at the global configuration, without
    /// having to pass it around everywhere as a separate object.
@@ -836,36 +847,56 @@ impl TenantManager {
                TenantState::Active => Ok(Arc::clone(tenant)),
                _ => {
                    if active_only {
-                        Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
+                        Err(GetTenantError::NotActive(tenant_shard_id))
                    } else {
                        Ok(Arc::clone(tenant))
                    }
                }
            },
-            Some(TenantSlot::InProgress(_)) => {
-                Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
-            }
-            None | Some(TenantSlot::Secondary) => {
+            Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
+            None | Some(TenantSlot::Secondary(_)) => {
                Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
            }
        }
    }

+    pub(crate) fn get_secondary_tenant_shard(
+        &self,
+        tenant_shard_id: TenantShardId,
+    ) -> Option<Arc<SecondaryTenant>> {
+        let locked = self.tenants.read().unwrap();
+
+        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
+            .ok()
+            .flatten();
+
+        match peek_slot {
+            Some(TenantSlot::Secondary(s)) => Some(s.clone()),
+            _ => None,
+        }
+    }
+
    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
        tenant_shard_id: TenantShardId,
        new_location_config: LocationConf,
        flush: Option<Duration>,
+        spawn_mode: SpawnMode,
        ctx: &RequestContext,
-    ) -> Result<(), anyhow::Error> {
+    ) -> Result<Option<Arc<Tenant>>, UpsertLocationError> {
        debug_assert_current_span_has_tenant_id();
        info!("configuring tenant location to state {new_location_config:?}");

-        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
+        enum FastPathModified {
+            Attached(Arc<Tenant>),
+            Secondary(Arc<SecondaryTenant>),
+        }
+
+        // Special case fast-path for updates to existing slots: if our upsert is only updating configuration,
        // then we do not need to set the slot to InProgress, we can just call into the
        // existng tenant.
-        let modify_tenant = {
+        let fast_path_taken = {
            let locked = self.tenants.read().unwrap();
            let peek_slot =
                tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -875,16 +906,25 @@ impl TenantManager {
                        // A transition from Attached to Attached in the same generation, we may
                        // take our fast path and just provide the updated configuration
                        // to the tenant.
-                        tenant.set_new_location_config(AttachedTenantConf::try_from(
-                            new_location_config.clone(),
-                        )?);
+                        tenant.set_new_location_config(
+                            AttachedTenantConf::try_from(new_location_config.clone())
+                                .map_err(UpsertLocationError::BadRequest)?,
+                        );

-                        Some(tenant.clone())
+                        Some(FastPathModified::Attached(tenant.clone()))
                    } else {
                        // Different generations, fall through to general case
                        None
                    }
                }
+                (
+                    LocationMode::Secondary(secondary_conf),
+                    Some(TenantSlot::Secondary(secondary_tenant)),
+                ) => {
+                    secondary_tenant.set_config(secondary_conf);
+                    secondary_tenant.set_tenant_conf(&new_location_config.tenant_conf);
+                    Some(FastPathModified::Secondary(secondary_tenant.clone()))
+                }
                _ => {
                    // Not an Attached->Attached transition, fall through to general case
                    None
@@ -893,69 +933,107 @@ impl TenantManager {
        };

        // Fast-path continued: having dropped out of the self.tenants lock, do the async
-        // phase of waiting for flush, before returning.
-        if let Some(tenant) = modify_tenant {
-            // Transition to AttachedStale means we may well hold a valid generation
-            // still, and have been requested to go stale as part of a migration.  If
-            // the caller set `flush`, then flush to remote storage.
-            if let LocationMode::Attached(AttachedLocationConfig {
-                generation: _,
-                attach_mode: AttachmentMode::Stale,
-            }) = &new_location_config.mode
-            {
-                if let Some(flush_timeout) = flush {
-                    match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
-                        Ok(Err(e)) => {
-                            return Err(e);
-                        }
-                        Ok(Ok(_)) => return Ok(()),
-                        Err(_) => {
-                            tracing::warn!(
+        // phase of writing config and/or waiting for flush, before returning.
+        match fast_path_taken {
+            Some(FastPathModified::Attached(tenant)) => {
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await?;
+
+                // Transition to AttachedStale means we may well hold a valid generation
+                // still, and have been requested to go stale as part of a migration.  If
+                // the caller set `flush`, then flush to remote storage.
+                if let LocationMode::Attached(AttachedLocationConfig {
+                    generation: _,
+                    attach_mode: AttachmentMode::Stale,
+                }) = &new_location_config.mode
+                {
+                    if let Some(flush_timeout) = flush {
+                        match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
+                            Ok(Err(e)) => {
+                                return Err(UpsertLocationError::Flush(e));
+                            }
+                            Ok(Ok(_)) => return Ok(Some(tenant)),
+                            Err(_) => {
+                                tracing::warn!(
                                timeout_ms = flush_timeout.as_millis(),
                                "Timed out waiting for flush to remote storage, proceeding anyway."
                            )
+                            }
                        }
                    }
                }
-            }

-            return Ok(());
-        }
+                return Ok(Some(tenant));
+            }
+            Some(FastPathModified::Secondary(_secondary_tenant)) => {
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await?;
+
+                return Ok(None);
+            }
+            None => {
+                // Proceed with the general case procedure, where we will shutdown & remove any existing
+                // slot contents and replace with a fresh one
+            }
+        };

        // General case for upserts to TenantsMap, excluding the case above: we will substitute an
        // InProgress value to the slot while we make whatever changes are required.  The state for
        // the tenant is inaccessible to the outside world while we are doing this, but that is sensible:
        // the state is ill-defined while we're in transition.  Transitions are async, but fast: we do
        // not do significant I/O, and shutdowns should be prompt via cancellation tokens.
-        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-
-        if let Some(TenantSlot::Attached(tenant)) = slot_guard.get_old_value() {
-            // The case where we keep a Tenant alive was covered above in the special case
-            // for Attached->Attached transitions in the same generation.  By this point,
-            // if we see an attached tenant we know it will be discarded and should be
-            // shut down.
-            let (_guard, progress) = utils::completion::channel();
-
-            match tenant.get_attach_mode() {
-                AttachmentMode::Single | AttachmentMode::Multi => {
-                    // Before we leave our state as the presumed holder of the latest generation,
-                    // flush any outstanding deletions to reduce the risk of leaking objects.
-                    self.resources.deletion_queue_client.flush_advisory()
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)
+            .map_err(|e| match e {
+                TenantSlotError::AlreadyExists(_, _) | TenantSlotError::NotFound(_) => {
+                    unreachable!("Called with mode Any")
                }
-                AttachmentMode::Stale => {
-                    // If we're stale there's not point trying to flush deletions
-                }
-            };
+                TenantSlotError::InProgress => UpsertLocationError::InProgress,
+                TenantSlotError::MapState(s) => UpsertLocationError::Unavailable(s),
+            })?;

-            info!("Shutting down attached tenant");
-            match tenant.shutdown(progress, false).await {
-                Ok(()) => {}
-                Err(barrier) => {
-                    info!("Shutdown already in progress, waiting for it to complete");
-                    barrier.wait().await;
+        match slot_guard.get_old_value() {
+            Some(TenantSlot::Attached(tenant)) => {
+                // The case where we keep a Tenant alive was covered above in the special case
+                // for Attached->Attached transitions in the same generation.  By this point,
+                // if we see an attached tenant we know it will be discarded and should be
+                // shut down.
+                let (_guard, progress) = utils::completion::channel();
+
+                match tenant.get_attach_mode() {
+                    AttachmentMode::Single | AttachmentMode::Multi => {
+                        // Before we leave our state as the presumed holder of the latest generation,
+                        // flush any outstanding deletions to reduce the risk of leaking objects.
+                        self.resources.deletion_queue_client.flush_advisory()
+                    }
+                    AttachmentMode::Stale => {
+                        // If we're stale there's no point trying to flush deletions
+                    }
+                };
+
+                info!("Shutting down attached tenant");
+                match tenant.shutdown(progress, false).await {
+                    Ok(()) => {}
+                    Err(barrier) => {
+                        info!("Shutdown already in progress, waiting for it to complete");
+                        barrier.wait().await;
+                    }
                }
+                slot_guard.drop_old_value().expect("We just shut it down");
+            }
+            Some(TenantSlot::Secondary(state)) => {
+                info!("Shutting down secondary tenant");
+                state.shutdown().await;
+            }
+            Some(TenantSlot::InProgress(_)) => {
+                // This should never happen: acquire_slot should error out
+                // if the contents of a slot were InProgress.
+                return Err(UpsertLocationError::Other(anyhow::anyhow!(
+                    "Acquired an InProgress slot, this is a bug."
+                )));
+            }
+            None => {
+                // Slot was vacant, nothing needs shutting down.
            }
-            slot_guard.drop_old_value().expect("We just shut it down");
        }

        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
@@ -973,24 +1051,44 @@ impl TenantManager {
        // Before activating either secondary or attached mode, persist the
        // configuration, so that on restart we will re-attach (or re-start
        // secondary) on the tenant.
-        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
-            .await
-            .map_err(SetNewTenantConfigError::Persist)?;
+        Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config).await?;

        let new_slot = match &new_location_config.mode {
-            LocationMode::Secondary(_) => TenantSlot::Secondary,
+            LocationMode::Secondary(secondary_config) => {
+                let shard_identity = new_location_config.shard;
+                TenantSlot::Secondary(SecondaryTenant::new(
+                    tenant_shard_id,
+                    shard_identity,
+                    new_location_config.tenant_conf,
+                    secondary_config,
+                ))
+            }
            LocationMode::Attached(_attach_config) => {
                let shard_identity = new_location_config.shard;
+
+                // Testing hack: if we are configured with no control plane, then drop the generation
+                // from upserts.  This enables creating generation-less tenants even though neon_local
+                // always uses generations when calling the location conf API.
+                let attached_conf = if cfg!(feature = "testing") {
+                    let mut conf = AttachedTenantConf::try_from(new_location_config)?;
+                    if self.conf.control_plane_api.is_none() {
+                        conf.location.generation = Generation::none();
+                    }
+                    conf
+                } else {
+                    AttachedTenantConf::try_from(new_location_config)?
+                };
+
                let tenant = tenant_spawn(
                    self.conf,
                    tenant_shard_id,
                    &tenant_path,
                    self.resources.clone(),
-                    AttachedTenantConf::try_from(new_location_config)?,
+                    attached_conf,
                    shard_identity,
                    None,
                    self.tenants,
-                    SpawnMode::Normal,
+                    spawn_mode,
                    ctx,
                )?;

@@ -998,9 +1096,20 @@ impl TenantManager {
            }
        };

-        slot_guard.upsert(new_slot)?;
+        let attached_tenant = if let TenantSlot::Attached(tenant) = &new_slot {
+            Some(tenant.clone())
+        } else {
+            None
+        };

-        Ok(())
+        slot_guard.upsert(new_slot).map_err(|e| match e {
+            TenantSlotUpsertError::InternalError(e) => {
+                UpsertLocationError::Other(anyhow::anyhow!(e))
+            }
+            TenantSlotUpsertError::MapState(e) => UpsertLocationError::Unavailable(e),
+        })?;
+
+        Ok(attached_tenant)
    }

    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
@@ -1091,6 +1200,41 @@ impl TenantManager {
                .collect(),
        }
    }
+    /// Do some synchronous work for all tenant slots in Secondary state.  The provided
+    /// callback should be small and fast, as it will be called inside the global
+    /// TenantsMap lock.
+    pub(crate) fn foreach_secondary_tenants<F>(&self, mut func: F)
+    where
+        // TODO: let the callback return a hint to drop out of the loop early
+        F: FnMut(&TenantShardId, &Arc<SecondaryTenant>),
+    {
+        let locked = self.tenants.read().unwrap();
+
+        let map = match &*locked {
+            TenantsMap::Initializing | TenantsMap::ShuttingDown(_) => return,
+            TenantsMap::Open(m) => m,
+        };
+
+        for (tenant_id, slot) in map {
+            if let TenantSlot::Secondary(state) = slot {
+                // Only expose secondary tenants that are not currently shutting down
+                if !state.cancel.is_cancelled() {
+                    func(tenant_id, state)
+                }
+            }
+        }
+    }
+
+    /// Total list of all tenant slots: this includes attached, secondary, and InProgress.
+    pub(crate) fn list(&self) -> Vec<(TenantShardId, TenantSlot)> {
+        let locked = self.tenants.read().unwrap();
+        match &*locked {
+            TenantsMap::Initializing => Vec::new(),
+            TenantsMap::Open(map) | TenantsMap::ShuttingDown(map) => {
+                map.iter().map(|(k, v)| (*k, v.clone())).collect()
+            }
+        }
+    }

    pub(crate) async fn delete_tenant(
        &self,
@@ -1160,10 +1304,13 @@ impl TenantManager {

 #[derive(Debug, thiserror::Error)]
 pub(crate) enum GetTenantError {
+    /// NotFound is a TenantId rather than TenantShardId, because this error type is used from
+    /// getters that use a TenantId and a ShardSelector, not just getters that target a specific shard.
    #[error("Tenant {0} not found")]
    NotFound(TenantId),
+
    #[error("Tenant {0} is not active")]
-    NotActive(TenantId),
+    NotActive(TenantShardId),
    /// Broken is logically a subset of NotActive, but a distinct error is useful as
    /// NotActive is usually a retryable state for API purposes, whereas Broken
    /// is a stuck error state
@@ -1196,16 +1343,14 @@ pub(crate) fn get_tenant(
            TenantState::Active => Ok(Arc::clone(tenant)),
            _ => {
                if active_only {
-                    Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
+                    Err(GetTenantError::NotActive(tenant_shard_id))
                } else {
                    Ok(Arc::clone(tenant))
                }
            }
        },
-        Some(TenantSlot::InProgress(_)) => {
-            Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
-        }
-        None | Some(TenantSlot::Secondary) => {
+        Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_shard_id)),
+        None | Some(TenantSlot::Secondary(_)) => {
            Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
        }
    }
@@ -1257,9 +1402,11 @@ pub(crate) async fn get_active_tenant_with_timeout(
        let locked = TENANTS.read().unwrap();

        // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
-            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
-        )?;
+        let tenant_shard_id = locked
+            .resolve_attached_shard(&tenant_id, shard_selector)
+            .ok_or(GetActiveTenantError::NotFound(GetTenantError::NotFound(
+                tenant_id,
+            )))?;

        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
@@ -1276,9 +1423,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    }
                }
            }
-            Some(TenantSlot::Secondary) => {
+            Some(TenantSlot::Secondary(_)) => {
                return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
-                    tenant_id,
+                    tenant_shard_id,
                )))
            }
            Some(TenantSlot::InProgress(barrier)) => {
@@ -1317,7 +1464,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    Some(TenantSlot::Attached(tenant)) => tenant.clone(),
                    _ => {
                        return Err(GetActiveTenantError::NotFound(GetTenantError::NotActive(
-                            tenant_id,
+                            tenant_shard_id,
                        )))
                    }
                }
@@ -1345,7 +1492,7 @@ pub(crate) enum DeleteTimelineError {
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantStateError {
    #[error("Tenant {0} is stopping")]
-    IsStopping(TenantId),
+    IsStopping(TenantShardId),
    #[error(transparent)]
    SlotError(#[from] TenantSlotError),
    #[error(transparent)]
@@ -1530,8 +1677,8 @@ pub(crate) enum TenantMapListError {
 ///
 /// Get list of tenants, for the mgmt API
 ///
-pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
-{
+pub(crate) async fn list_tenants(
+) -> Result<Vec<(TenantShardId, TenantState, Generation)>, TenantMapListError> {
    let tenants = TENANTS.read().unwrap();
    let m = match &*tenants {
        TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1539,62 +1686,15 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>,
    };
    Ok(m.iter()
        .filter_map(|(id, tenant)| match tenant {
-            TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
-            TenantSlot::Secondary => None,
+            TenantSlot::Attached(tenant) => {
+                Some((*id, tenant.current_state(), tenant.generation()))
+            }
+            TenantSlot::Secondary(_) => None,
            TenantSlot::InProgress(_) => None,
        })
        .collect())
 }

-/// Execute Attach mgmt API command.
-///
-/// Downloading all the tenant data is performed in the background, this merely
-/// spawns the background task and returns quickly.
-pub(crate) async fn attach_tenant(
-    conf: &'static PageServerConf,
-    tenant_id: TenantId,
-    generation: Generation,
-    tenant_conf: TenantConfOpt,
-    resources: TenantSharedResources,
-    ctx: &RequestContext,
-) -> Result<(), TenantMapInsertError> {
-    // This is a legacy API (replaced by `/location_conf`).  It does not support sharding
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
-    let slot_guard =
-        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
-    let location_conf = LocationConf::attached_single(tenant_conf, generation);
-    let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
-    // TODO: tenant directory remains on disk if we bail out from here on.
-    //       See https://github.com/neondatabase/neon/issues/4233
-
-    let shard_identity = location_conf.shard;
-    let attached_tenant = tenant_spawn(
-        conf,
-        tenant_shard_id,
-        &tenant_dir,
-        resources,
-        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
-        None,
-        &TENANTS,
-        SpawnMode::Normal,
-        ctx,
-    )?;
-    // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
-    //      See https://github.com/neondatabase/neon/issues/4233
-
-    let attached_tenant_id = attached_tenant.tenant_id();
-    if tenant_id != attached_tenant_id {
-        return Err(TenantMapInsertError::Other(anyhow::anyhow!(
-            "loaded created tenant has unexpected tenant id (expect {tenant_id} != actual {attached_tenant_id})",
-        )));
-    }
-
-    slot_guard.upsert(TenantSlot::Attached(attached_tenant))?;
-    Ok(())
-}
-
 #[derive(Debug, thiserror::Error)]
 pub(crate) enum TenantMapInsertError {
    #[error(transparent)]
@@ -1608,7 +1708,7 @@ pub(crate) enum TenantMapInsertError {
 /// Superset of TenantMapError: issues that can occur when acquiring a slot
 /// for a particular tenant ID.
 #[derive(Debug, thiserror::Error)]
-pub enum TenantSlotError {
+pub(crate) enum TenantSlotError {
    /// When acquiring a slot with the expectation that the tenant already exists.
    #[error("Tenant {0} not found")]
    NotFound(TenantShardId),
@@ -1617,9 +1717,6 @@ pub enum TenantSlotError {
    #[error("tenant {0} already exists, state: {1:?}")]
    AlreadyExists(TenantShardId, TenantState),

-    #[error("tenant {0} already exists in but is not attached")]
-    Conflict(TenantShardId),
-
    // Tried to read a slot that is currently being mutated by another administrative
    // operation.
    #[error("tenant has a state change in progress, try again later")]
@@ -1797,11 +1894,7 @@ impl SlotGuard {
    fn old_value_is_shutdown(&self) -> bool {
        match self.old_value.as_ref() {
            Some(TenantSlot::Attached(tenant)) => tenant.gate.close_complete(),
-            Some(TenantSlot::Secondary) => {
-                // TODO: when adding secondary mode tenants, this will check for shutdown
-                // in the same way that we do for `Tenant` above
-                true
-            }
+            Some(TenantSlot::Secondary(secondary_tenant)) => secondary_tenant.gate.close_complete(),
            Some(TenantSlot::InProgress(_)) => {
                // A SlotGuard cannot be constructed for a slot that was already InProgress
                unreachable!()
@@ -2011,39 +2104,40 @@ where
    let mut slot_guard =
        tenant_map_acquire_slot_impl(&tenant_shard_id, tenants, TenantSlotAcquireMode::MustExist)?;

-    // The SlotGuard allows us to manipulate the Tenant object without fear of some
-    // concurrent API request doing something else for the same tenant ID.
-    let attached_tenant = match slot_guard.get_old_value() {
-        Some(TenantSlot::Attached(t)) => Some(t),
-        _ => None,
-    };
-
    // allow pageserver shutdown to await for our completion
    let (_guard, progress) = completion::channel();

-    // If the tenant was attached, shut it down gracefully.  For secondary
-    // locations this part is not necessary
-    match &attached_tenant {
-        Some(attached_tenant) => {
+    // The SlotGuard allows us to manipulate the Tenant object without fear of some
+    // concurrent API request doing something else for the same tenant ID.
+    let attached_tenant = match slot_guard.get_old_value() {
+        Some(TenantSlot::Attached(tenant)) => {
            // whenever we remove a tenant from memory, we don't want to flush and wait for upload
            let freeze_and_flush = false;

            // shutdown is sure to transition tenant to stopping, and wait for all tasks to complete, so
            // that we can continue safely to cleanup.
-            match attached_tenant.shutdown(progress, freeze_and_flush).await {
+            match tenant.shutdown(progress, freeze_and_flush).await {
                Ok(()) => {}
                Err(_other) => {
                    // if pageserver shutdown or other detach/ignore is already ongoing, we don't want to
                    // wait for it but return an error right away because these are distinct requests.
                    slot_guard.revert();
-                    return Err(TenantStateError::IsStopping(tenant_shard_id.tenant_id));
+                    return Err(TenantStateError::IsStopping(tenant_shard_id));
                }
            }
+            Some(tenant)
        }
-        None => {
-            // Nothing to wait on when not attached, proceed.
+        Some(TenantSlot::Secondary(secondary_state)) => {
+            tracing::info!("Shutting down in secondary mode");
+            secondary_state.shutdown().await;
+            None
        }
-    }
+        Some(TenantSlot::InProgress(_)) => {
+            // Acquiring a slot guarantees its old value was not InProgress
+            unreachable!();
+        }
+        None => None,
+    };

    match tenant_cleanup
        .await
@@ -2157,7 +2251,6 @@ pub(crate) async fn immediate_gc(

 #[cfg(test)]
 mod tests {
-    use pageserver_api::shard::TenantShardId;
    use std::collections::BTreeMap;
    use std::sync::Arc;
    use tracing::{info_span, Instrument};
@@ -2178,7 +2271,7 @@ mod tests {

        // harness loads it to active, which is forced and nothing is running on the tenant

-        let id = TenantShardId::unsharded(t.tenant_id());
+        let id = t.tenant_shard_id();

        // tenant harness configures the logging and we cannot escape it
        let _e = info_span!("testing", tenant_id = %id).entered();
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -182,7 +182,7 @@

 pub(crate) mod download;
 pub mod index;
-mod upload;
+pub(crate) mod upload;

 use anyhow::Context;
 use camino::Utf8Path;
@@ -229,6 +229,7 @@ use crate::{
    tenant::upload_queue::{
        UploadOp, UploadQueue, UploadQueueInitialized, UploadQueueStopped, UploadTask,
    },
+    TENANT_HEATMAP_BASENAME,
 };

 use utils::id::{TenantId, TimelineId};
@@ -521,8 +522,6 @@ impl RemoteTimelineClient {
            cancel,
        )
        .measure_remote_op(
-            self.tenant_shard_id.tenant_id,
-            self.timeline_id,
            RemoteOpFileKind::Index,
            RemoteOpKind::Download,
            Arc::clone(&self.metrics),
@@ -565,8 +564,6 @@ impl RemoteTimelineClient {
                cancel,
            )
            .measure_remote_op(
-                self.tenant_shard_id.tenant_id,
-                self.timeline_id,
                RemoteOpFileKind::Layer,
                RemoteOpKind::Download,
                Arc::clone(&self.metrics),
@@ -690,7 +687,10 @@ impl RemoteTimelineClient {
            .insert(layer.layer_desc().filename(), metadata.clone());
        upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;

-        info!("scheduled layer file upload {layer}");
+        info!(
+            "scheduled layer file upload {layer} gen={:?} shard={:?}",
+            metadata.generation, metadata.shard
+        );
        let op = UploadOp::UploadLayer(layer, metadata);
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
@@ -818,8 +818,25 @@ impl RemoteTimelineClient {
    fn schedule_deletion_of_unlinked0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
+        mut with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
    ) {
+        // Filter out any layers which were not created by this tenant shard.  These are
+        // layers that originate from some ancestor shard after a split, and may still
+        // be referenced by other shards. We are free to delete them locally and remove
+        // them from our index (and would have already done so when we reach this point
+        // in the code), but we may not delete them remotely.
+        with_metadata.retain(|(name, meta)| {
+            let retain = meta.shard.shard_number == self.tenant_shard_id.shard_number
+                && meta.shard.shard_count == self.tenant_shard_id.shard_count;
+            if !retain {
+                tracing::debug!(
+                    "Skipping deletion of ancestor-shard layer {name}, from shard {}",
+                    meta.shard
+                );
+            }
+            retain
+        });
+
        for (name, meta) in &with_metadata {
            info!(
                "scheduling deletion of layer {}{} (shard {})",
@@ -1330,8 +1347,6 @@ impl RemoteTimelineClient {
                        &self.cancel,
                    )
                    .measure_remote_op(
-                        self.tenant_shard_id.tenant_id,
-                        self.timeline_id,
                        RemoteOpFileKind::Layer,
                        RemoteOpKind::Upload,
                        Arc::clone(&self.metrics),
@@ -1357,8 +1372,6 @@ impl RemoteTimelineClient {
                        &self.cancel,
                    )
                    .measure_remote_op(
-                        self.tenant_shard_id.tenant_id,
-                        self.timeline_id,
                        RemoteOpFileKind::Index,
                        RemoteOpKind::Upload,
                        Arc::clone(&self.metrics),
@@ -1724,11 +1737,11 @@ pub fn remote_index_path(
    .expect("Failed to construct path")
 }

-pub const HEATMAP_BASENAME: &str = "heatmap-v1.json";
-
 pub(crate) fn remote_heatmap_path(tenant_shard_id: &TenantShardId) -> RemotePath {
-    RemotePath::from_string(&format!("tenants/{tenant_shard_id}/{HEATMAP_BASENAME}"))
-        .expect("Failed to construct path")
+    RemotePath::from_string(&format!(
+        "tenants/{tenant_shard_id}/{TENANT_HEATMAP_BASENAME}"
+    ))
+    .expect("Failed to construct path")
 }

 /// Given the key of an index, parse out the generation part of the name
@@ -1885,7 +1898,7 @@ mod tests {
        fn span(&self) -> tracing::Span {
            tracing::info_span!(
                "test",
-                tenant_id = %self.harness.tenant_id,
+                tenant_id = %self.harness.tenant_shard_id.tenant_id,
                timeline_id = %TIMELINE_ID
            )
        }
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -1,24 +1,62 @@
+mod downloader;
 pub mod heatmap;
 mod heatmap_uploader;
+mod scheduler;

-use std::sync::Arc;
+use std::{sync::Arc, time::SystemTime};

-use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
+use crate::{
+    config::PageServerConf,
+    disk_usage_eviction_task::DiskUsageEvictionInfo,
+    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
+    virtual_file::MaybeFatalIo,
+};

-use self::heatmap_uploader::heatmap_uploader_task;
+use self::{
+    downloader::{downloader_task, SecondaryDetail},
+    heatmap_uploader::heatmap_uploader_task,
+};

-use super::mgr::TenantManager;
+use super::{
+    config::{SecondaryLocationConfig, TenantConfOpt},
+    mgr::TenantManager,
+    span::debug_assert_current_span_has_tenant_id,
+    storage_layer::LayerFileName,
+};

-use pageserver_api::shard::TenantShardId;
+use pageserver_api::{
+    models,
+    shard::{ShardIdentity, TenantShardId},
+};
 use remote_storage::GenericRemoteStorage;

 use tokio_util::sync::CancellationToken;
-use utils::completion::Barrier;
+use tracing::instrument;
+use utils::{completion::Barrier, fs_ext, id::TimelineId, sync::gate::Gate};

+enum DownloadCommand {
+    Download(TenantShardId), // sent by SecondaryController::download_tenant for this shard
+}
 enum UploadCommand {
    Upload(TenantShardId),
 }

+impl UploadCommand {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        match self {
+            Self::Upload(id) => id, // borrow the shard id carried by the command
+        }
+    }
+}
+
+impl DownloadCommand {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        match self {
+            Self::Download(id) => id, // borrow the shard id carried by the command
+        }
+    }
+}
+
 struct CommandRequest<T> {
    payload: T,
    response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
@@ -28,12 +66,166 @@ struct CommandResponse {
    result: anyhow::Result<()>,
 }

+/// Whereas `Tenant` represents an attached tenant, this type represents the work
+/// we do for secondary tenant locations: where we are not serving clients or
+/// ingesting WAL, but we are maintaining a warm cache of layer files.
+///
+/// This type is all about the _download_ path for secondary mode.  The upload path
+/// runs separately (see `heatmap_uploader`) while a regular attached `Tenant` exists.
+///
+/// This structure coordinates TenantManager and SecondaryDownloader,
+/// so that the downloader can indicate which tenants it is currently
+/// operating on, and the manager can indicate when a particular
+/// secondary tenant should cancel any work in flight.
+#[derive(Debug)]
+pub(crate) struct SecondaryTenant {
+    /// Carrying a tenant shard ID simplifies callers such as the downloader
+    /// which need to organize many of these objects by ID.
+    tenant_shard_id: TenantShardId,
+
+    /// Cancellation token indicates to SecondaryDownloader that it should stop doing
+    /// any work for this tenant at the next opportunity.
+    pub(crate) cancel: CancellationToken,
+    /// Closed by `shutdown`; `evict_layer` enters it and bails out if entering fails.
+    pub(crate) gate: Gate,
+
+    // Secondary mode does not need the full shard identity or the TenantConfOpt.  However,
+    // storing these enables us to report our full LocationConf, enabling convenient reconciliation
+    // by the control plane (see [`Self::get_location_conf`])
+    shard_identity: ShardIdentity,
+    tenant_conf: std::sync::Mutex<TenantConfOpt>,
+    /// Secondary location config plus per-timeline layer state (see `set_config`, `evict_layer`).
+    detail: std::sync::Mutex<SecondaryDetail>,
+}
+
+impl SecondaryTenant {
+    pub(crate) fn new(
+        tenant_shard_id: TenantShardId,
+        shard_identity: ShardIdentity,
+        tenant_conf: TenantConfOpt,
+        config: &SecondaryLocationConfig,
+    ) -> Arc<Self> {
+        Arc::new(Self {
+            tenant_shard_id,
+            // TODO: shall we make this a descendant of the
+            // main cancellation token, or is it sufficient that
+            // on shutdown we walk the tenants and fire their
+            // individual cancellations?
+            cancel: CancellationToken::new(),
+            gate: Gate::new(format!("SecondaryTenant {tenant_shard_id}")),
+
+            shard_identity,
+            tenant_conf: std::sync::Mutex::new(tenant_conf),
+
+            detail: std::sync::Mutex::new(SecondaryDetail::new(config.clone())),
+        })
+    }
+    /// Fire this tenant's cancellation token, then wait for the gate to close.
+    pub(crate) async fn shutdown(&self) {
+        self.cancel.cancel();
+
+        // Wait for any secondary downloader work to complete
+        self.gate.close().await;
+    }
+    /// Replace the secondary location config stored in `detail`.
+    pub(crate) fn set_config(&self, config: &SecondaryLocationConfig) {
+        self.detail.lock().unwrap().config = config.clone();
+    }
+    /// Replace the stored tenant config (reported back by `get_location_conf`).
+    pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) {
+        *(self.tenant_conf.lock().unwrap()) = *config;
+    }
+
+    /// For API access: generate a LocationConfig equivalent to the one that would be used to
+    /// create a Tenant in the same state.  Do not use this in hot paths: it's for relatively
+    /// rare external API calls, like a reconciliation at startup.
+    pub(crate) fn get_location_conf(&self) -> models::LocationConfig {
+        let conf = self.detail.lock().unwrap().config.clone();
+
+        let conf = models::LocationConfigSecondary { warm: conf.warm };
+
+        let tenant_conf = *self.tenant_conf.lock().unwrap();
+        models::LocationConfig {
+            mode: models::LocationConfigMode::Secondary,
+            generation: None,
+            secondary_conf: Some(conf),
+            shard_number: self.tenant_shard_id.shard_number.0,
+            shard_count: self.tenant_shard_id.shard_count.0,
+            shard_stripe_size: self.shard_identity.stripe_size.0,
+            tenant_conf: tenant_conf.into(),
+        }
+    }
+
+    pub(crate) fn get_tenant_shard_id(&self) -> &TenantShardId {
+        &self.tenant_shard_id
+    }
+    /// Delegates to `SecondaryDetail::get_layers_for_eviction` while holding the `detail` lock.
+    pub(crate) fn get_layers_for_eviction(self: &Arc<Self>) -> DiskUsageEvictionInfo {
+        self.detail.lock().unwrap().get_layers_for_eviction(self)
+    }
+    /// Remove a layer file from local disk and record the eviction; no-op if the gate is closed.
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug(), timeline_id=%timeline_id, name=%name))]
+    pub(crate) async fn evict_layer(
+        &self,
+        conf: &PageServerConf,
+        timeline_id: TimelineId,
+        name: LayerFileName,
+    ) {
+        debug_assert_current_span_has_tenant_id();
+
+        let _guard = match self.gate.enter() {
+            Ok(g) => g,
+            Err(_) => {
+                tracing::debug!("Dropping layer evictions, secondary tenant shutting down",);
+                return;
+            }
+        };
+
+        let now = SystemTime::now();
+
+        let path = conf
+            .timeline_path(&self.tenant_shard_id, &timeline_id)
+            .join(name.file_name());
+
+        // We tolerate ENOENT, because between planning eviction and executing
+        // it, the secondary downloader could have seen an updated heatmap that
+        // resulted in a layer being deleted.
+        // Other local I/O errors are process-fatal: these should never happen.
+        tokio::fs::remove_file(path)
+            .await
+            .or_else(fs_ext::ignore_not_found)
+            .fatal_err("Deleting layer during eviction");
+
+        // Update the timeline's state.  This does not have to be synchronized with
+        // the download process, because:
+        // - If downloader is racing with us to remove a file (e.g. because it is
+        //   removed from heatmap), then our mutual .remove() operations will both
+        //   succeed.
+        // - If downloader is racing with us to download the object (this would require
+        //   multiple eviction iterations to race with multiple download iterations), then
+        //   if we remove it from the state, the worst that happens is the downloader
+        //   downloads it again before re-inserting, or we delete the file but it remains
+        //   in the state map (in which case it will be downloaded if this secondary
+        //   tenant transitions to attached and tries to access it)
+        //
+        // The important assumption here is that the secondary timeline state does not
+        // have to 100% match what is on disk, because it's a best-effort warming
+        // of the cache.
+        let mut detail = self.detail.lock().unwrap();
+        if let Some(timeline_detail) = detail.timelines.get_mut(&timeline_id) {
+            timeline_detail.on_disk_layers.remove(&name);
+            timeline_detail.evicted_at.insert(name, now);
+        }
+    }
+}
+
 /// The SecondaryController is a pseudo-rpc client for administrative control of secondary mode downloads,
 /// and heatmap uploads.  This is not a hot data path: it's primarily a hook for tests,
 /// where we want to immediately upload/download for a particular tenant.  In normal operation
 /// uploads & downloads are autonomous and not driven by this interface.
 pub struct SecondaryController {
    upload_req_tx: tokio::sync::mpsc::Sender<CommandRequest<UploadCommand>>,
+    download_req_tx: tokio::sync::mpsc::Sender<CommandRequest<DownloadCommand>>,
 }

 impl SecondaryController {
@@ -63,6 +255,13 @@ impl SecondaryController {
        self.dispatch(&self.upload_req_tx, UploadCommand::Upload(tenant_shard_id))
            .await
    }
+    pub async fn download_tenant(&self, tenant_shard_id: TenantShardId) -> anyhow::Result<()> {
+        self.dispatch(
+            &self.download_req_tx, // receiving end is handed to downloader_task in spawn_tasks
+            DownloadCommand::Download(tenant_shard_id),
+        )
+        .await
+    }
 }

 pub fn spawn_tasks(
@@ -71,9 +270,37 @@ pub fn spawn_tasks(
    background_jobs_can_start: Barrier,
    cancel: CancellationToken,
 ) -> SecondaryController {
+    let mgr_clone = tenant_manager.clone();
+    let storage_clone = remote_storage.clone();
+    let cancel_clone = cancel.clone();
+    let bg_jobs_clone = background_jobs_can_start.clone();
+
+    let (download_req_tx, download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
    let (upload_req_tx, upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);

+    task_mgr::spawn(
+        BACKGROUND_RUNTIME.handle(),
+        TaskKind::SecondaryDownloads,
+        None,
+        None,
+        "secondary tenant downloads",
+        false,
+        async move {
+            downloader_task(
+                mgr_clone,
+                storage_clone,
+                download_req_rx,
+                bg_jobs_clone,
+                cancel_clone,
+            )
+            .await;
+
+            Ok(())
+        },
+    );
+
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::SecondaryUploads,
@@ -89,16 +316,26 @@ pub fn spawn_tasks(
                background_jobs_can_start,
                cancel,
            )
-            .await
+            .await;
+
+            Ok(())
        },
    );

-    SecondaryController { upload_req_tx }
+    SecondaryController {
+        download_req_tx,
+        upload_req_tx,
+    }
 }

 /// For running with remote storage disabled: a SecondaryController that is connected to nothing.
 pub fn null_controller() -> SecondaryController {
+    let (download_req_tx, _download_req_rx) =
+        tokio::sync::mpsc::channel::<CommandRequest<DownloadCommand>>(16);
    let (upload_req_tx, _upload_req_rx) =
        tokio::sync::mpsc::channel::<CommandRequest<UploadCommand>>(16);
-    SecondaryController { upload_req_tx }
+    SecondaryController {
+        upload_req_tx,
+        download_req_tx,
+    }
 }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -0,0 +1,843 @@
+use std::{
+    collections::{HashMap, HashSet},
+    pin::Pin,
+    str::FromStr,
+    sync::Arc,
+    time::{Duration, Instant, SystemTime},
+};
+
+use crate::{
+    config::PageServerConf,
+    disk_usage_eviction_task::{
+        finite_f32, DiskUsageEvictionInfo, EvictionCandidate, EvictionLayer, EvictionSecondaryLayer,
+    },
+    metrics::SECONDARY_MODE,
+    tenant::{
+        config::SecondaryLocationConfig,
+        debug_assert_current_span_has_tenant_and_timeline_id,
+        remote_timeline_client::{
+            index::LayerFileMetadata, FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES,
+        },
+        span::debug_assert_current_span_has_tenant_id,
+        storage_layer::LayerFileName,
+        tasks::{warn_when_period_overrun, BackgroundLoopKind},
+    },
+    virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
+    METADATA_FILE_NAME, TEMP_FILE_SUFFIX,
+};
+
+use super::{
+    heatmap::HeatMapLayer,
+    scheduler::{self, Completion, JobGenerator, SchedulingResult, TenantBackgroundJobs},
+    SecondaryTenant,
+};
+
+use crate::tenant::{
+    mgr::TenantManager,
+    remote_timeline_client::{download::download_layer_file, remote_heatmap_path},
+};
+
+use chrono::format::{DelayedFormat, StrftimeItems};
+use futures::Future;
+use pageserver_api::shard::TenantShardId;
+use rand::Rng;
+use remote_storage::{DownloadError, GenericRemoteStorage};
+
+use tokio_util::sync::CancellationToken;
+use tracing::{info_span, instrument, Instrument};
+use utils::{
+    backoff, completion::Barrier, crashsafe::path_with_suffix_extension, fs_ext, id::TimelineId,
+};
+
+use super::{
+    heatmap::{HeatMapTenant, HeatMapTimeline},
+    CommandRequest, DownloadCommand,
+};
+
+/// For each tenant, how long must have passed since the last download_tenant call before
+/// calling it again.  This is approximately the time by which local data is allowed
+/// to fall behind remote data.
+///
+/// TODO: this should just be a default, and the actual period should be controlled
+/// via the heatmap itself
+/// `<https://github.com/neondatabase/neon/issues/6200>`
+const DOWNLOAD_FRESHEN_INTERVAL: Duration = Duration::from_millis(60000);
+
+/// Body of the background task that keeps secondary tenant locations warm.
+///
+/// Wraps the tenant manager and remote storage handle in a [`SecondaryDownloader`],
+/// hands it to a [`Scheduler`] limited to `secondary_download_concurrency` jobs, and
+/// awaits the scheduler's `run` future inside a `secondary_downloads` span.
+pub(super) async fn downloader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<DownloadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) {
+    // Read the limit first: `tenant_manager` is moved into the generator below.
+    let max_concurrent_downloads = tenant_manager.get_conf().secondary_download_concurrency;
+
+    let mut download_jobs = Scheduler::new(
+        SecondaryDownloader {
+            tenant_manager,
+            remote_storage,
+        },
+        max_concurrent_downloads,
+    );
+
+    download_jobs
+        .run(command_queue, background_jobs_can_start, cancel)
+        .instrument(info_span!("secondary_downloads"))
+        .await
+}
+
+/// Job generator for the secondary download scheduler.
+struct SecondaryDownloader {
+    // Used to enumerate and look up secondary tenants, and to reach the pageserver config.
+    tenant_manager: Arc<TenantManager>,
+    // Cloned into every spawned download job.
+    remote_storage: GenericRemoteStorage,
+}
+
+/// What the secondary downloader remembers about one layer file it believes is on local disk.
+#[derive(Debug, Clone)]
+pub(super) struct OnDiskState {
+    // Metadata of the layer, converted from the heatmap entry that described it.
+    metadata: LayerFileMetadata,
+    // Access time reported by the heatmap; used as `last_activity_ts` for eviction.
+    access_time: SystemTime,
+}
+
+impl OnDiskState {
+    /// Build the state for one layer.
+    ///
+    /// Only `metadata` and `access_time` are stored. The leading parameters identify the
+    /// layer but are currently unused, hence the underscore prefixes.
+    fn new(
+        _conf: &'static PageServerConf,
+        _tenant_shard_id: &TenantShardId,
+        _timeline_id: &TimelineId,
+        _name: LayerFileName,
+        metadata: LayerFileMetadata,
+        access_time: SystemTime,
+    ) -> Self {
+        Self {
+            metadata,
+            access_time,
+        }
+    }
+}
+
+/// Per-timeline layer bookkeeping for a secondary tenant location.
+#[derive(Debug, Clone, Default)]
+pub(super) struct SecondaryDetailTimeline {
+    /// Layers recorded as present in the local timeline directory, keyed by layer file name.
+    pub(super) on_disk_layers: HashMap<LayerFileName, OnDiskState>,
+
+    /// We remember when layers were evicted, to prevent re-downloading them.
+    /// A layer is only downloaded again if the heatmap reports an access time
+    /// later than the eviction time stored here.
+    pub(super) evicted_at: HashMap<LayerFileName, SystemTime>,
+}
+
+/// This state is written by the secondary downloader, it is opaque
+/// to TenantManager
+#[derive(Debug)]
+pub(super) struct SecondaryDetail {
+    /// Location configuration; downloads are only scheduled while `config.warm` is true.
+    pub(super) config: SecondaryLocationConfig,
+
+    // Copied into scheduled jobs, which only report execution lag when this is `Some`.
+    last_download: Option<Instant>,
+    // When the next download becomes due. `None` until the scheduler initializes it
+    // (with jitter), and reset to `None` while downloads are disabled for the tenant.
+    next_download: Option<Instant>,
+    /// Layer state for each timeline the downloader has visited.
+    pub(super) timelines: HashMap<TimelineId, SecondaryDetailTimeline>,
+}
+
+/// Logging helper: renders a `SystemTime` in UTC as `dd/mm/YYYY HH:MM:SS`.
+///
+/// The returned value implements `Display`, so it can be passed directly to `tracing` macros.
+fn strftime(t: &'_ SystemTime) -> DelayedFormat<StrftimeItems<'_>> {
+    chrono::DateTime::<chrono::Utc>::from(*t).format("%d/%m/%Y %T")
+}
+
+impl SecondaryDetail {
+    /// Create the state for a secondary location that has no download history
+    /// and no known timelines yet.
+    pub(super) fn new(config: SecondaryLocationConfig) -> Self {
+        Self {
+            timelines: HashMap::new(),
+            next_download: None,
+            last_download: None,
+            config,
+        }
+    }
+
+    /// List every layer recorded as on disk, across all timelines, as an eviction candidate.
+    ///
+    /// Each candidate refers back to `parent` and uses the layer's recorded access time as
+    /// its last-activity timestamp. `max_layer_size` is the largest file size among the
+    /// candidates, or `None` when there are none.
+    pub(super) fn get_layers_for_eviction(
+        &self,
+        parent: &Arc<SecondaryTenant>,
+    ) -> DiskUsageEvictionInfo {
+        let mut resident_layers = Vec::new();
+        for (timeline_id, timeline_detail) in &self.timelines {
+            for (name, ods) in &timeline_detail.on_disk_layers {
+                let layer = EvictionLayer::Secondary(EvictionSecondaryLayer {
+                    secondary_tenant: parent.clone(),
+                    timeline_id: *timeline_id,
+                    name: name.clone(),
+                    metadata: ods.metadata.clone(),
+                });
+                resident_layers.push(EvictionCandidate {
+                    layer,
+                    last_activity_ts: ods.access_time,
+                    relative_last_activity: finite_f32::FiniteF32::ZERO,
+                });
+            }
+        }
+
+        let max_layer_size = resident_layers
+            .iter()
+            .map(|candidate| candidate.layer.get_file_size())
+            .max();
+
+        tracing::debug!(
+            "eviction: secondary tenant {} found {} timelines, {} layers",
+            parent.get_tenant_shard_id(),
+            self.timelines.len(),
+            resident_layers.len()
+        );
+
+        DiskUsageEvictionInfo {
+            max_layer_size,
+            resident_layers,
+        }
+    }
+}
+
+/// A download job that has been selected to run but not yet spawned.
+struct PendingDownload {
+    // The secondary tenant to download for.
+    secondary_state: Arc<SecondaryTenant>,
+    // Snapshot of the tenant's `last_download`; `None` for command-initiated jobs.
+    last_download: Option<Instant>,
+    // When the job was due; `None` for command-initiated jobs.
+    target_time: Option<Instant>,
+    // Expected interval between runs; `None` for command-initiated jobs.
+    period: Option<Duration>,
+}
+
+impl scheduler::PendingJob for PendingDownload {
+    /// The shard this job will download for.
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.secondary_state.get_tenant_shard_id()
+    }
+}
+
+/// Handle to a download job that has been spawned.
+struct RunningDownload {
+    // Paired with the completion handle that the spawned future keeps alive until it ends.
+    barrier: Barrier,
+}
+
+impl scheduler::RunningJob for RunningDownload {
+    /// Return a clone of the barrier associated with this running job.
+    fn get_barrier(&self) -> Barrier {
+        self.barrier.clone()
+    }
+}
+
+/// Result yielded by a download job's future once it has finished, successfully or not.
+struct CompleteDownload {
+    // The tenant the job ran for.
+    secondary_state: Arc<SecondaryTenant>,
+    // Instant captured at the end of the job's future.
+    completed_at: Instant,
+}
+
+impl scheduler::Completion for CompleteDownload {
+    /// The shard whose download just finished.
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.secondary_state.get_tenant_shard_id()
+    }
+}
+
+/// The generic background-job scheduler, instantiated with the download job types
+/// defined in this module. Constructed and run by `downloader_task`.
+type Scheduler = TenantBackgroundJobs<
+    SecondaryDownloader,
+    PendingDownload,
+    RunningDownload,
+    CompleteDownload,
+    DownloadCommand,
+>;
+
+impl JobGenerator<PendingDownload, RunningDownload, CompleteDownload, DownloadCommand>
+    for SecondaryDownloader
+{
+    /// Record that a download job finished and schedule the tenant's next one.
+    ///
+    /// Stores the job's completion time as the tenant's `last_download` and sets
+    /// `next_download` to one `DOWNLOAD_FRESHEN_INTERVAL` from now.
+    #[instrument(skip_all, fields(tenant_id=%completion.get_tenant_shard_id().tenant_id, shard_id=%completion.get_tenant_shard_id().shard_slug()))]
+    fn on_completion(&mut self, completion: CompleteDownload) {
+        let CompleteDownload {
+            secondary_state,
+            completed_at,
+        } = completion;
+
+        tracing::debug!("Secondary tenant download completed");
+
+        // Update the download times even if there was an error: we don't want errored
+        // tenants to implicitly take priority to run again.
+        let mut detail = secondary_state.detail.lock().unwrap();
+        // Without this, `last_download` stays `None` forever and jobs never report
+        // execution lag against their period.
+        detail.last_download = Some(completed_at);
+        detail.next_download = Some(Instant::now() + DOWNLOAD_FRESHEN_INTERVAL);
+    }
+
+    /// Select the secondary tenants whose next download is due.
+    ///
+    /// Tenants with `config.warm == false` are skipped and their `next_download` is cleared.
+    /// Tenants seen for the first time get a `next_download` at a random offset within
+    /// `DOWNLOAD_FRESHEN_INTERVAL`, so that initial downloads are spread out. The returned
+    /// jobs are ordered by target time, most overdue first.
+    async fn schedule(&mut self) -> SchedulingResult<PendingDownload> {
+        let mut result = SchedulingResult {
+            jobs: Vec::new(),
+            want_interval: None,
+        };
+
+        // Step 1: identify some tenants that we may work on
+        let mut tenants: Vec<Arc<SecondaryTenant>> = Vec::new();
+        self.tenant_manager
+            .foreach_secondary_tenants(|_id, secondary_state| {
+                tenants.push(secondary_state.clone());
+            });
+
+        // Step 2: filter out tenants which are not yet eligible to run
+        let now = Instant::now();
+        result.jobs = tenants
+            .into_iter()
+            .filter_map(|secondary_tenant| {
+                let (last_download, next_download) = {
+                    let mut detail = secondary_tenant.detail.lock().unwrap();
+
+                    if !detail.config.warm {
+                        // Downloads are disabled for this tenant
+                        detail.next_download = None;
+                        return None;
+                    }
+
+                    if detail.next_download.is_none() {
+                        // Initialize with a jitter: this spreads initial downloads on startup
+                        // or mass-attach across our freshen interval.
+                        let jittered_period =
+                            rand::thread_rng().gen_range(Duration::ZERO..DOWNLOAD_FRESHEN_INTERVAL);
+                        detail.next_download = Some(now.checked_add(jittered_period).expect(
+                        "Using our constant, which is known to be small compared with clock range",
+                    ));
+                    }
+                    (detail.last_download, detail.next_download.unwrap())
+                };
+
+                // A tenant is eligible once its target time has been reached. Tenants whose
+                // target time is still in the future are picked up by a later pass.
+                if now >= next_download {
+                    Some(PendingDownload {
+                        secondary_state: secondary_tenant,
+                        last_download,
+                        target_time: Some(next_download),
+                        period: Some(DOWNLOAD_FRESHEN_INTERVAL),
+                    })
+                } else {
+                    None
+                }
+            })
+            .collect();
+
+        // Step 3: sort by target execution time to run most urgent first.
+        result.jobs.sort_by_key(|j| j.target_time);
+
+        result
+    }
+
+    /// Turn an explicit download command into a job that has no target time or period.
+    ///
+    /// Fails if the tenant manager has no secondary tenant for the shard named in the command.
+    fn on_command(&mut self, command: DownloadCommand) -> anyhow::Result<PendingDownload> {
+        let secondary_state = self
+            .tenant_manager
+            .get_secondary_tenant_shard(*command.get_tenant_shard_id())
+            .ok_or_else(|| anyhow::anyhow!("Not found or not in Secondary mode"))?;
+
+        Ok(PendingDownload {
+            secondary_state,
+            last_download: None,
+            target_time: None,
+            period: None,
+        })
+    }
+
+    /// Start a download job.
+    ///
+    /// Returns a handle carrying the job's barrier, plus a boxed future that runs one
+    /// `TenantDownloader::download` pass, logs the outcome, and yields a `CompleteDownload`.
+    /// The future runs in its own root span (`parent: None`) tagged with the tenant and shard.
+    fn spawn(
+        &mut self,
+        job: PendingDownload,
+    ) -> (
+        RunningDownload,
+        Pin<Box<dyn Future<Output = CompleteDownload> + Send>>,
+    ) {
+        let PendingDownload {
+            secondary_state,
+            last_download,
+            target_time,
+            period,
+        } = job;
+
+        let (completion, barrier) = utils::completion::channel();
+        let remote_storage = self.remote_storage.clone();
+        let conf = self.tenant_manager.get_conf();
+        let tenant_shard_id = *secondary_state.get_tenant_shard_id();
+        (RunningDownload { barrier }, Box::pin(async move {
+            // Keep the completion handle alive for the whole job; it is dropped when the future ends.
+            let _completion = completion;
+
+            match TenantDownloader::new(conf, &remote_storage, &secondary_state)
+                .download()
+                .await
+            {
+                Err(UpdateError::NoData) => {
+                    tracing::info!("No heatmap found for tenant.  This is fine if it is new.");
+                },
+                Err(UpdateError::NoSpace) => {
+                    tracing::warn!("Insufficient space while downloading.  Will retry later.");
+                }
+                Err(UpdateError::Cancelled) => {
+                    tracing::debug!("Shut down while downloading");
+                },
+                Err(UpdateError::Deserialize(e)) => {
+                    tracing::error!("Corrupt content while downloading tenant: {e}");
+                },
+                Err(e @ (UpdateError::DownloadError(_) | UpdateError::Other(_))) => {
+                    tracing::error!("Error while downloading tenant: {e}");
+                },
+                Ok(()) => {}
+            };
+
+            // Irrespective of the result, we will reschedule ourselves to run after our usual period.
+
+            // If the job had a target execution time, we may check our final execution
+            // time against that for observability purposes.
+            if let (Some(target_time), Some(period)) = (target_time, period) {
+                // Only track execution lag if this isn't our first download: otherwise, it is expected
+                // that execution will have taken longer than our configured interval, for example
+                // when a freshly started pageserver has to download a tenant's layers from scratch.
+                if last_download.is_some() {
+                    // Elapsed time includes any scheduling lag as well as the execution of the job
+                    let elapsed = Instant::now().duration_since(target_time);
+
+                    warn_when_period_overrun(
+                        elapsed,
+                        period,
+                        BackgroundLoopKind::SecondaryDownload,
+                    );
+                }
+            }
+
+            CompleteDownload {
+                    secondary_state,
+                    completed_at: Instant::now(),
+                }
+        }.instrument(info_span!(parent: None, "secondary_download", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
+    }
+}
+
+/// This type is a convenience to group together the various functions involved in
+/// freshening a secondary tenant.
+struct TenantDownloader<'a> {
+    // Pageserver configuration; used to derive local heatmap and timeline paths.
+    conf: &'static PageServerConf,
+    // Source of the heatmap and of layer files.
+    remote_storage: &'a GenericRemoteStorage,
+    // The tenant being freshened: supplies the gate, cancellation token and layer state.
+    secondary_state: &'a SecondaryTenant,
+}
+
+/// Errors that may be encountered while updating a tenant
+#[derive(thiserror::Error, Debug)]
+enum UpdateError {
+    /// The remote object was not found (converted from `DownloadError::NotFound`).
+    #[error("No remote data found")]
+    NoData,
+    /// A local I/O operation failed with `ENOSPC`.
+    #[error("Insufficient local storage space")]
+    NoSpace,
+    /// Any remote download failure other than "not found" or "cancelled".
+    // NOTE(review): the message does not include the wrapped error, so logging this
+    // variant with `{e}` prints only "Failed to download" — consider adding `{0}`.
+    #[error("Failed to download")]
+    DownloadError(DownloadError),
+    /// JSON deserialization failed, e.g. while parsing the downloaded heatmap.
+    #[error(transparent)]
+    Deserialize(#[from] serde_json::Error),
+    /// The operation was cancelled (converted from `DownloadError::Cancelled`, or
+    /// produced when a retry loop is cancelled).
+    #[error("Cancelled")]
+    Cancelled,
+    /// Everything else, including I/O errors that are not `ENOSPC`.
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<DownloadError> for UpdateError {
+    /// Map cancellation and not-found onto their dedicated variants; wrap every other
+    /// download error unchanged.
+    fn from(value: DownloadError) -> Self {
+        match value {
+            DownloadError::Cancelled => Self::Cancelled,
+            DownloadError::NotFound => Self::NoData,
+            other => Self::DownloadError(other),
+        }
+    }
+}
+
+impl From<std::io::Error> for UpdateError {
+    /// Classify an I/O error: `ENOSPC` becomes `NoSpace`, anything else becomes `Other`.
+    fn from(value: std::io::Error) -> Self {
+        match value.raw_os_error().map(nix::errno::from_i32) {
+            Some(nix::errno::Errno::ENOSPC) => UpdateError::NoSpace,
+            // An I/O error from e.g. tokio::io::copy is most likely a remote storage issue
+            _ => UpdateError::Other(anyhow::anyhow!(value)),
+        }
+    }
+}
+
+impl<'a> TenantDownloader<'a> {
+    /// Bundle the references needed to freshen one secondary tenant.
+    /// Performs no I/O; the work happens in `download`.
+    fn new(
+        conf: &'static PageServerConf,
+        remote_storage: &'a GenericRemoteStorage,
+        secondary_state: &'a SecondaryTenant,
+    ) -> Self {
+        Self {
+            conf,
+            remote_storage,
+            secondary_state,
+        }
+    }
+
+    /// Run one freshening pass for the tenant.
+    ///
+    /// Downloads and parses the heatmap, writes a copy of it to local disk, then processes
+    /// each timeline listed in it. Returns `Ok(())` without finishing if the tenant's gate
+    /// is closed or its cancellation token fires.
+    async fn download(&self) -> Result<(), UpdateError> {
+        debug_assert_current_span_has_tenant_id();
+
+        // For the duration of a download, we must hold the SecondaryTenant::gate, so that
+        // our access to local storage is covered by it.
+        let Ok(_guard) = self.secondary_state.gate.enter() else {
+            // Shutting down
+            return Ok(());
+        };
+
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        // Download the tenant's heatmap
+        let heatmap_bytes = tokio::select!(
+            bytes = self.download_heatmap() => {bytes?},
+            _ = self.secondary_state.cancel.cancelled() => return Ok(())
+        );
+
+        let heatmap = serde_json::from_slice::<HeatMapTenant>(&heatmap_bytes)?;
+
+        // Save the heatmap: this will be useful on restart, allowing us to reconstruct
+        // layer metadata without having to re-download it.
+        let heatmap_path = self.conf.tenant_heatmap_path(tenant_shard_id);
+
+        let temp_path = path_with_suffix_extension(&heatmap_path, TEMP_FILE_SUFFIX);
+        let context_msg = format!("write tenant {tenant_shard_id} heatmap to {heatmap_path}");
+        let heatmap_path_bg = heatmap_path.clone();
+        // The overwrite runs on a blocking thread, driven to completion with `block_on`.
+        tokio::task::spawn_blocking(move || {
+            tokio::runtime::Handle::current().block_on(async move {
+                VirtualFile::crashsafe_overwrite(&heatmap_path_bg, &temp_path, &heatmap_bytes).await
+            })
+        })
+        .await
+        .expect("Blocking task is never aborted")
+        .maybe_fatal_err(&context_msg)?;
+
+        tracing::debug!("Wrote local heatmap to {}", heatmap_path);
+
+        // Download the layers in the heatmap
+        for timeline in heatmap.timelines {
+            // Check for cancellation between timelines so shutdown is not held up.
+            if self.secondary_state.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            let timeline_id = timeline.timeline_id;
+            self.download_timeline(timeline)
+                .instrument(tracing::info_span!(
+                    "secondary_download_timeline",
+                    tenant_id=%tenant_shard_id.tenant_id,
+                    shard_id=%tenant_shard_id.shard_slug(),
+                    %timeline_id
+                ))
+                .await?;
+        }
+
+        Ok(())
+    }
+
+    /// Fetch the tenant's heatmap object from remote storage and return its raw bytes.
+    ///
+    /// The download is wrapped in `backoff::retry` and can be interrupted through the
+    /// tenant's cancellation token, in which case `UpdateError::Cancelled` is returned.
+    /// Increments the `download_heatmap` metric on success.
+    async fn download_heatmap(&self) -> Result<Vec<u8>, UpdateError> {
+        debug_assert_current_span_has_tenant_id();
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        // TODO: make download conditional on ETag having changed since last download
+        // (https://github.com/neondatabase/neon/issues/6199)
+        tracing::debug!("Downloading heatmap for secondary tenant",);
+
+        let heatmap_path = remote_heatmap_path(tenant_shard_id);
+
+        let heatmap_bytes = backoff::retry(
+            || async {
+                let download = self
+                    .remote_storage
+                    .download(&heatmap_path)
+                    .await
+                    .map_err(UpdateError::from)?;
+                // Read the whole body into memory.
+                let mut heatmap_bytes = Vec::new();
+                let mut body = tokio_util::io::StreamReader::new(download.download_stream);
+                let _size = tokio::io::copy(&mut body, &mut heatmap_bytes).await?;
+                Ok(heatmap_bytes)
+            },
+            // NOTE(review): presumably the "is permanent" predicate, i.e. these two errors
+            // are not retried — confirm against `backoff::retry`.
+            |e| matches!(e, UpdateError::NoData | UpdateError::Cancelled),
+            FAILED_DOWNLOAD_WARN_THRESHOLD,
+            FAILED_REMOTE_OP_RETRIES,
+            "download heatmap",
+            backoff::Cancel::new(self.secondary_state.cancel.clone(), || {
+                UpdateError::Cancelled
+            }),
+        )
+        .await?;
+
+        SECONDARY_MODE.download_heatmap.inc();
+
+        Ok(heatmap_bytes)
+    }
+
+    /// Bring the local copy of one timeline in line with its heatmap entry.
+    ///
+    /// Steps:
+    /// 1. Load the timeline's recorded layer state, scanning the local directory if there
+    ///    is none yet.
+    /// 2. Delete local layers that are no longer listed in the heatmap.
+    /// 3. For each heatmap layer: refresh its access time if already on disk, skip it if
+    ///    it was evicted after its last access, otherwise download it.
+    /// 4. Write the resulting changes back into the tenant's shared state.
+    ///
+    /// Returns `Ok(())` early if the tenant is cancelled. In that case, and on error,
+    /// changes accumulated so far are not written back to the shared state.
+    async fn download_timeline(&self, timeline: HeatMapTimeline) -> Result<(), UpdateError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+        let tenant_shard_id = self.secondary_state.get_tenant_shard_id();
+        let timeline_path = self
+            .conf
+            .timeline_path(tenant_shard_id, &timeline.timeline_id);
+
+        // Accumulate updates to the state
+        let mut touched = Vec::new();
+        let mut removed: Vec<LayerFileName> = Vec::new();
+
+        // Clone a view of what layers already exist on disk
+        let timeline_state = self
+            .secondary_state
+            .detail
+            .lock()
+            .unwrap()
+            .timelines
+            .get(&timeline.timeline_id)
+            .cloned();
+
+        let timeline_state = match timeline_state {
+            Some(t) => t,
+            None => {
+                // We have no existing state: need to scan local disk for layers first.
+                let timeline_state =
+                    init_timeline_state(self.conf, tenant_shard_id, &timeline).await;
+
+                // Re-acquire detail lock now that we're done with async load from local FS
+                self.secondary_state
+                    .detail
+                    .lock()
+                    .unwrap()
+                    .timelines
+                    .insert(timeline.timeline_id, timeline_state.clone());
+                timeline_state
+            }
+        };
+
+        let layers_in_heatmap = timeline
+            .layers
+            .iter()
+            .map(|l| &l.name)
+            .collect::<HashSet<_>>();
+        let layers_on_disk = timeline_state
+            .on_disk_layers
+            .iter()
+            .map(|l| l.0)
+            .collect::<HashSet<_>>();
+
+        // Remove on-disk layers that are no longer present in heatmap
+        for layer in layers_on_disk.difference(&layers_in_heatmap) {
+            let local_path = timeline_path.join(layer.to_string());
+            tracing::info!("Removing secondary local layer {layer} because it's absent in heatmap",);
+            tokio::fs::remove_file(&local_path)
+                .await
+                .or_else(fs_ext::ignore_not_found)
+                .maybe_fatal_err("Removing secondary layer")?;
+            // Remember the deletion so the shared state stops listing this layer as on disk.
+            removed.push((*layer).clone());
+        }
+
+        // Download heatmap layers that are not present on local disk, or update their
+        // access time if they are already present.
+        for layer in timeline.layers {
+            if self.secondary_state.cancel.is_cancelled() {
+                return Ok(());
+            }
+
+            // Existing on-disk layers: just update their access time.
+            if let Some(on_disk) = timeline_state.on_disk_layers.get(&layer.name) {
+                tracing::debug!("Layer {} is already on disk", layer.name);
+                if on_disk.metadata != LayerFileMetadata::from(&layer.metadata)
+                    || on_disk.access_time != layer.access_time
+                {
+                    // We already have this layer on disk.  Update its access time.
+                    tracing::debug!(
+                        "Access time updated for layer {}: {} -> {}",
+                        layer.name,
+                        strftime(&on_disk.access_time),
+                        strftime(&layer.access_time)
+                    );
+                    touched.push(layer);
+                }
+                continue;
+            } else {
+                tracing::debug!("Layer {} not present on disk yet", layer.name);
+            }
+
+            // Eviction: if we evicted a layer, then do not re-download it unless it was accessed more
+            // recently than it was evicted.
+            if let Some(evicted_at) = timeline_state.evicted_at.get(&layer.name) {
+                if &layer.access_time > evicted_at {
+                    tracing::info!(
+                        "Re-downloading evicted layer {}, accessed at {}, evicted at {}",
+                        layer.name,
+                        strftime(&layer.access_time),
+                        strftime(evicted_at)
+                    );
+                } else {
+                    tracing::trace!(
+                        "Not re-downloading evicted layer {}, accessed at {}, evicted at {}",
+                        layer.name,
+                        strftime(&layer.access_time),
+                        strftime(evicted_at)
+                    );
+                    continue;
+                }
+            }
+
+            // Note: no backoff::retry wrapper here because download_layer_file does its own retries internally
+            let downloaded_bytes = match download_layer_file(
+                self.conf,
+                self.remote_storage,
+                *tenant_shard_id,
+                timeline.timeline_id,
+                &layer.name,
+                &LayerFileMetadata::from(&layer.metadata),
+                &self.secondary_state.cancel,
+            )
+            .await
+            {
+                Ok(bytes) => bytes,
+                Err(e) => {
+                    if let DownloadError::NotFound = e {
+                        // A heatmap might be out of date and refer to a layer that doesn't exist any more.
+                        // This is harmless: continue to download the next layer. It is expected during
+                        // compaction and GC.
+                        tracing::debug!(
+                            "Skipped downloading missing layer {}, raced with compaction/gc?",
+                            layer.name
+                        );
+                        continue;
+                    } else {
+                        return Err(e.into());
+                    }
+                }
+            };
+
+            if downloaded_bytes != layer.metadata.file_size {
+                let local_path = timeline_path.join(layer.name.to_string());
+
+                tracing::warn!(
+                    "Downloaded layer {} with unexpected size {} != {}.  Removing download.",
+                    layer.name,
+                    downloaded_bytes,
+                    layer.metadata.file_size
+                );
+
+                tokio::fs::remove_file(&local_path)
+                    .await
+                    .or_else(fs_ext::ignore_not_found)?;
+
+                // The file is gone again: do not count it as downloaded and do not record
+                // it as on disk, so that a later pass retries it.
+                continue;
+            }
+
+            SECONDARY_MODE.download_layer.inc();
+            touched.push(layer)
+        }
+
+        // Write updates to state to record layers we just downloaded, touched or removed.
+        {
+            let mut detail = self.secondary_state.detail.lock().unwrap();
+            let timeline_detail = detail.timelines.entry(timeline.timeline_id).or_default();
+
+            for name in &removed {
+                timeline_detail.on_disk_layers.remove(name);
+            }
+
+            tracing::info!("Wrote timeline_detail for {} touched layers", touched.len());
+
+            for t in touched {
+                use std::collections::hash_map::Entry;
+                match timeline_detail.on_disk_layers.entry(t.name.clone()) {
+                    Entry::Occupied(mut v) => {
+                        v.get_mut().access_time = t.access_time;
+                    }
+                    Entry::Vacant(e) => {
+                        e.insert(OnDiskState::new(
+                            self.conf,
+                            tenant_shard_id,
+                            &timeline.timeline_id,
+                            t.name,
+                            LayerFileMetadata::from(&t.metadata),
+                            t.access_time,
+                        ));
+                    }
+                }
+            }
+        }
+
+        Ok(())
+    }
+}
+
+/// Scan a timeline's local directory and build its initial layer state from the files
+/// found there, using the heatmap as the source of layer metadata.
+///
+/// - If the directory does not exist it is created and an empty state is returned.
+/// - Layer files listed in the heatmap with a matching size are recorded as on disk.
+/// - Layer files listed in the heatmap with a different size are deleted and not recorded.
+/// - Layer files not listed in the heatmap are deleted.
+/// - The local metadata file is skipped; other unrecognized files are logged and left alone.
+///
+/// I/O failures go through the `fatal_err` / `on_fatal_io_error` helpers rather than
+/// being returned to the caller.
+async fn init_timeline_state(
+    conf: &'static PageServerConf,
+    tenant_shard_id: &TenantShardId,
+    heatmap: &HeatMapTimeline,
+) -> SecondaryDetailTimeline {
+    let timeline_path = conf.timeline_path(tenant_shard_id, &heatmap.timeline_id);
+    let mut detail = SecondaryDetailTimeline::default();
+
+    let mut dir = match tokio::fs::read_dir(&timeline_path).await {
+        Ok(d) => d,
+        Err(e) => {
+            if e.kind() == std::io::ErrorKind::NotFound {
+                let context = format!("Creating timeline directory {timeline_path}");
+                tracing::info!("{}", context);
+                tokio::fs::create_dir_all(&timeline_path)
+                    .await
+                    .fatal_err(&context);
+
+                // No entries to report: drop out.
+                return detail;
+            } else {
+                on_fatal_io_error(&e, &format!("Reading timeline dir {timeline_path}"));
+            }
+        }
+    };
+
+    // As we iterate through layers found on disk, we will look up their metadata from this map.
+    // Layers not present in metadata will be discarded.
+    let heatmap_metadata: HashMap<&LayerFileName, &HeatMapLayer> =
+        heatmap.layers.iter().map(|l| (&l.name, l)).collect();
+
+    while let Some(dentry) = dir
+        .next_entry()
+        .await
+        .fatal_err(&format!("Listing {timeline_path}"))
+    {
+        let dentry_file_name = dentry.file_name();
+        let file_name = dentry_file_name.to_string_lossy();
+        let local_meta = dentry.metadata().await.fatal_err(&format!(
+            "Read metadata on {}",
+            dentry.path().to_string_lossy()
+        ));
+
+        // Secondary mode doesn't use local metadata files, but they might have been left behind by an attached tenant.
+        if file_name == METADATA_FILE_NAME {
+            continue;
+        }
+
+        match LayerFileName::from_str(&file_name) {
+            Ok(name) => {
+                let remote_meta = heatmap_metadata.get(&name);
+                match remote_meta {
+                    Some(remote_meta) => {
+                        // TODO: checksums for layers (https://github.com/neondatabase/neon/issues/2784)
+                        if local_meta.len() != remote_meta.metadata.file_size {
+                            // This should not happen, because we do crashsafe write-then-rename when downloading
+                            // layers, and layers in remote storage are immutable.  Remove the local file because
+                            // we cannot trust it.
+                            tracing::warn!(
+                                "Removing local layer {name} with unexpected local size {} != {}",
+                                local_meta.len(),
+                                remote_meta.metadata.file_size
+                            );
+                            // Actually delete the file, as the message above says. The layer is
+                            // not recorded as on disk, so it remains eligible for download.
+                            tokio::fs::remove_file(&dentry.path())
+                                .await
+                                .or_else(fs_ext::ignore_not_found)
+                                .fatal_err(&format!(
+                                    "Removing layer {}",
+                                    dentry.path().to_string_lossy()
+                                ));
+                        } else {
+                            // We expect the access time to be initialized immediately afterwards, when
+                            // the latest heatmap is applied to the state.
+                            detail.on_disk_layers.insert(
+                                name.clone(),
+                                OnDiskState::new(
+                                    conf,
+                                    tenant_shard_id,
+                                    &heatmap.timeline_id,
+                                    name,
+                                    LayerFileMetadata::from(&remote_meta.metadata),
+                                    remote_meta.access_time,
+                                ),
+                            );
+                        }
+                    }
+                    None => {
+                        // FIXME: consider some optimization when transitioning from attached to secondary: maybe
+                        // wait until we have seen a heatmap that is more recent than the most recent on-disk state?  Otherwise
+                        // we will end up deleting any layers which were created+uploaded more recently than the heatmap.
+                        tracing::info!(
+                            "Removing secondary local layer {} because it's absent in heatmap",
+                            name
+                        );
+                        tokio::fs::remove_file(&dentry.path())
+                            .await
+                            .or_else(fs_ext::ignore_not_found)
+                            .fatal_err(&format!(
+                                "Removing layer {}",
+                                dentry.path().to_string_lossy()
+                            ));
+                    }
+                }
+            }
+            Err(_) => {
+                // Ignore it.
+                tracing::warn!("Unexpected file in timeline directory: {file_name}");
+            }
+        }
+    }
+
+    detail
+}
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -1,5 +1,6 @@
 use std::{
    collections::HashMap,
+    pin::Pin,
    sync::{Arc, Weak},
    time::{Duration, Instant},
 };
@@ -7,35 +8,86 @@ use std::{
 use crate::{
    metrics::SECONDARY_MODE,
    tenant::{
-        config::AttachmentMode, mgr::TenantManager, remote_timeline_client::remote_heatmap_path,
-        secondary::CommandResponse, span::debug_assert_current_span_has_tenant_id, Tenant,
+        config::AttachmentMode,
+        mgr::TenantManager,
+        remote_timeline_client::remote_heatmap_path,
+        span::debug_assert_current_span_has_tenant_id,
+        tasks::{warn_when_period_overrun, BackgroundLoopKind},
+        Tenant,
    },
 };

+use futures::Future;
 use md5;
 use pageserver_api::shard::TenantShardId;
+use rand::Rng;
 use remote_storage::GenericRemoteStorage;

-use tokio::task::JoinSet;
+use super::{
+    scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
+    CommandRequest,
+};
 use tokio_util::sync::CancellationToken;
-use tracing::instrument;
-use utils::{backoff, completion::Barrier};
+use tracing::{info_span, instrument, Instrument};
+use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};

-use super::{heatmap::HeatMapTenant, CommandRequest, UploadCommand};
+use super::{heatmap::HeatMapTenant, UploadCommand};

-/// Period between heatmap uploader walking Tenants to look for work to do.
-/// If any tenants have a heatmap upload period lower than this, it will be adjusted
-/// downward to match.
-const DEFAULT_SCHEDULING_INTERVAL: Duration = Duration::from_millis(60000);
-const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_millis(1000);
+pub(super) async fn heatmap_uploader_task(
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
+    background_jobs_can_start: Barrier,
+    cancel: CancellationToken,
+) {
+    let concurrency = tenant_manager.get_conf().heatmap_upload_concurrency;
+
+    let generator = HeatmapUploader {
+        tenant_manager,
+        remote_storage,
+        cancel: cancel.clone(),
+        tenants: HashMap::new(),
+    };
+    let mut scheduler = Scheduler::new(generator, concurrency);
+
+    scheduler
+        .run(command_queue, background_jobs_can_start, cancel)
+        .instrument(info_span!("heatmap_uploader"))
+        .await
+}
+
+/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
+/// handling loop and mutates it as needed: there are no locks here, because that event loop
+/// can hold &mut references to this type throughout.
+struct HeatmapUploader {
+    tenant_manager: Arc<TenantManager>,
+    remote_storage: GenericRemoteStorage,
+    cancel: CancellationToken,
+
+    tenants: HashMap<TenantShardId, UploaderTenantState>,
+}

 struct WriteInProgress {
    barrier: Barrier,
 }

+impl RunningJob for WriteInProgress {
+    fn get_barrier(&self) -> Barrier {
+        self.barrier.clone()
+    }
+}
+
 struct UploadPending {
    tenant: Arc<Tenant>,
    last_digest: Option<md5::Digest>,
+    target_time: Option<Instant>,
+    period: Option<Duration>,
+}
+
+impl scheduler::PendingJob for UploadPending {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        self.tenant.get_tenant_shard_id()
+    }
 }

 struct WriteComplete {
@@ -45,6 +97,12 @@ struct WriteComplete {
    next_upload: Option<Instant>,
 }

+impl scheduler::Completion for WriteComplete {
+    fn get_tenant_shard_id(&self) -> &TenantShardId {
+        &self.tenant_shard_id
+    }
+}
+
 /// The heatmap uploader keeps a little bit of per-tenant state, mainly to remember
 /// when we last did a write.  We only populate this after doing at least one
 /// write for a tenant -- this avoids holding state for tenants that have
@@ -68,267 +126,110 @@ struct UploaderTenantState {
    next_upload: Option<Instant>,
 }

-/// This type is owned by a single task ([`heatmap_uploader_task`]) which runs an event
-/// handling loop and mutates it as needed: there are no locks here, because that event loop
-/// can hold &mut references to this type throughout.
-struct HeatmapUploader {
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    cancel: CancellationToken,
+type Scheduler = TenantBackgroundJobs<
+    HeatmapUploader,
+    UploadPending,
+    WriteInProgress,
+    WriteComplete,
+    UploadCommand,
+>;

-    tenants: HashMap<TenantShardId, UploaderTenantState>,
-
-    /// Tenants with work to do, for which tasks should be spawned as soon as concurrency
-    /// limits permit it.
-    tenants_pending: std::collections::VecDeque<UploadPending>,
-
-    /// Tenants for which a task in `tasks` has been spawned.
-    tenants_uploading: HashMap<TenantShardId, WriteInProgress>,
-
-    tasks: JoinSet<()>,
-
-    /// Channel for our child tasks to send results to: we use a channel for results rather than
-    /// just getting task results via JoinSet because we need the channel's recv() "sleep until something
-    /// is available" semantic, rather than JoinSet::join_next()'s "sleep until next thing is available _or_ I'm empty"
-    /// behavior.
-    task_result_tx: tokio::sync::mpsc::UnboundedSender<WriteComplete>,
-    task_result_rx: tokio::sync::mpsc::UnboundedReceiver<WriteComplete>,
-
-    concurrent_uploads: usize,
-
-    scheduling_interval: Duration,
-}
-
-/// The uploader task runs a loop that periodically wakes up and schedules tasks for
-/// tenants that require an upload, or handles any commands that have been sent into
-/// `command_queue`.  No I/O is done in this loop: that all happens in the tasks we
-/// spawn.
-///
-/// Scheduling iterations are somewhat infrequent.  However, each one will enqueue
-/// all tenants that require an upload, and in between scheduling iterations we will
-/// continue to spawn new tasks for pending tenants, as our concurrency limit permits.
-///
-/// While we take a CancellationToken here, it is subordinate to the CancellationTokens
-/// of tenants: i.e. we expect all Tenants to have been shut down before we are shut down, otherwise
-/// we might block waiting on a Tenant.
-pub(super) async fn heatmap_uploader_task(
-    tenant_manager: Arc<TenantManager>,
-    remote_storage: GenericRemoteStorage,
-    mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<UploadCommand>>,
-    background_jobs_can_start: Barrier,
-    cancel: CancellationToken,
-) -> anyhow::Result<()> {
-    let concurrent_uploads = tenant_manager.get_conf().heatmap_upload_concurrency;
-
-    let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();
-
-    let mut uploader = HeatmapUploader {
-        tenant_manager,
-        remote_storage,
-        cancel: cancel.clone(),
-        tasks: JoinSet::new(),
-        tenants: HashMap::new(),
-        tenants_pending: std::collections::VecDeque::new(),
-        tenants_uploading: HashMap::new(),
-        task_result_tx: result_tx,
-        task_result_rx: result_rx,
-        concurrent_uploads,
-        scheduling_interval: DEFAULT_SCHEDULING_INTERVAL,
-    };
-
-    tracing::info!("Waiting for background_jobs_can start...");
-    background_jobs_can_start.wait().await;
-    tracing::info!("background_jobs_can is ready, proceeding.");
-
-    while !cancel.is_cancelled() {
-        // Look for new work: this is relatively expensive because we have to go acquire the lock on
-        // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
-        // require an upload.
-        uploader.schedule_iteration().await?;
-
-        // Between scheduling iterations, we will:
-        //  - Drain any complete tasks and spawn pending tasks
-        //  - Handle incoming administrative commands
-        //  - Check our cancellation token
-        let next_scheduling_iteration = Instant::now()
-            .checked_add(uploader.scheduling_interval)
-            .unwrap_or_else(|| {
-                tracing::warn!(
-                    "Scheduling interval invalid ({}s), running immediately!",
-                    uploader.scheduling_interval.as_secs_f64()
-                );
-                Instant::now()
-            });
-        loop {
-            tokio::select! {
-                _ = cancel.cancelled() => {
-                    // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
-                    tracing::info!("Heatmap uploader joining tasks");
-                    while let Some(_r) = uploader.tasks.join_next().await {};
-                    tracing::info!("Heatmap uploader terminating");
-
-                    break;
-                },
-                _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
-                    tracing::debug!("heatmap_uploader_task: woke for scheduling interval");
-                    break;},
-                cmd = command_queue.recv() => {
-                    tracing::debug!("heatmap_uploader_task: woke for command queue");
-                    let cmd = match cmd {
-                        Some(c) =>c,
-                        None => {
-                            // SecondaryController was destroyed, and this has raced with
-                            // our CancellationToken
-                            tracing::info!("Heatmap uploader terminating");
-                            cancel.cancel();
-                            break;
-                        }
-                    };
-
-                    let CommandRequest{
-                        response_tx,
-                        payload
-                    } = cmd;
-                    uploader.handle_command(payload, response_tx);
-                },
-                _ = uploader.process_next_completion() => {
-                    if !cancel.is_cancelled() {
-                        uploader.spawn_pending();
-                    }
-                }
-            }
-        }
-    }
-
-    Ok(())
-}
-
-impl HeatmapUploader {
-    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
-    async fn schedule_iteration(&mut self) -> anyhow::Result<()> {
+impl JobGenerator<UploadPending, WriteInProgress, WriteComplete, UploadCommand>
+    for HeatmapUploader
+{
+    async fn schedule(&mut self) -> SchedulingResult<UploadPending> {
        // Cull any entries in self.tenants whose Arc<Tenant> is gone
        self.tenants
            .retain(|_k, v| v.tenant.upgrade().is_some() && v.next_upload.is_some());

-        // The priority order of previously scheduled work may be invalidated by current state: drop
-        // all pending work (it will be re-scheduled if still needed)
-        self.tenants_pending.clear();
-
-        // Used a fixed 'now' through the following loop, for efficiency and fairness.
        let now = Instant::now();

-        // While iterating over the potentially-long list of tenants, we will periodically yield
-        // to avoid blocking executor.
-        const YIELD_ITERATIONS: usize = 1000;
+        let mut result = SchedulingResult {
+            jobs: Vec::new(),
+            want_interval: None,
+        };

-        // Iterate over tenants looking for work to do.
        let tenants = self.tenant_manager.get_attached_active_tenant_shards();
-        for (i, tenant) in tenants.into_iter().enumerate() {
-            // Process is shutting down, drop out
-            if self.cancel.is_cancelled() {
-                return Ok(());
-            }

-            // Skip tenants that already have a write in flight
-            if self
-                .tenants_uploading
-                .contains_key(tenant.get_tenant_shard_id())
-            {
-                continue;
-            }
+        yielding_loop(1000, &self.cancel, tenants.into_iter(), |tenant| {
+            let period = match tenant.get_heatmap_period() {
+                None => {
+                    // Heatmaps are disabled for this tenant
+                    return;
+                }
+                Some(period) => {
+                    // If any tenant has asked for uploads more frequent than our scheduling interval,
+                    // reduce it to match so that we can keep up.  This is mainly useful in testing, where
+                    // we may set rather short intervals.
+                    result.want_interval = match result.want_interval {
+                        None => Some(period),
+                        Some(existing) => Some(std::cmp::min(period, existing)),
+                    };

-            self.maybe_schedule_upload(&now, tenant);
+                    period
+                }
+            };

-            if i + 1 % YIELD_ITERATIONS == 0 {
-                tokio::task::yield_now().await;
-            }
-        }
-
-        // Spawn tasks for as many of our pending tenants as we can.
-        self.spawn_pending();
-
-        Ok(())
-    }
-
-    ///
-    /// Cancellation: this method is cancel-safe.
-    async fn process_next_completion(&mut self) {
-        match self.task_result_rx.recv().await {
-            Some(r) => {
-                self.on_completion(r);
-            }
-            None => {
-                unreachable!("Result sender is stored on Self");
-            }
-        }
-    }
-
-    /// The 'maybe' refers to the tenant's state: whether it is configured
-    /// for heatmap uploads at all, and whether sufficient time has passed
-    /// since the last upload.
-    fn maybe_schedule_upload(&mut self, now: &Instant, tenant: Arc<Tenant>) {
-        match tenant.get_heatmap_period() {
-            None => {
-                // Heatmaps are disabled for this tenant
+            // Stale attachments do not upload anything: if we are in this state, there is probably some
+            // other attachment in mode Single or Multi running on another pageserver, and we don't
+            // want to thrash and overwrite their heatmap uploads.
+            if tenant.get_attach_mode() == AttachmentMode::Stale {
                return;
            }
-            Some(period) => {
-                // If any tenant has asked for uploads more frequent than our scheduling interval,
-                // reduce it to match so that we can keep up.  This is mainly useful in testing, where
-                // we may set rather short intervals.
-                if period < self.scheduling_interval {
-                    self.scheduling_interval = std::cmp::max(period, MIN_SCHEDULING_INTERVAL);
-                }
+
+            // Create an entry in self.tenants if one doesn't already exist: this will later be updated
+            // with the completion time in on_completion.
+            let state = self
+                .tenants
+                .entry(*tenant.get_tenant_shard_id())
+                .or_insert_with(|| {
+                    let jittered_period = rand::thread_rng().gen_range(Duration::ZERO..period);
+
+                    UploaderTenantState {
+                        tenant: Arc::downgrade(&tenant),
+                        last_upload: None,
+                        next_upload: Some(now.checked_add(jittered_period).unwrap_or(now)),
+                        last_digest: None,
+                    }
+                });
+
+            // Decline to do the upload if insufficient time has passed
+            if state.next_upload.map(|nu| nu > now).unwrap_or(false) {
+                return;
            }
-        }

-        // Stale attachments do not upload anything: if we are in this state, there is probably some
-        // other attachment in mode Single or Multi running on another pageserver, and we don't
-        // want to thrash and overwrite their heatmap uploads.
-        if tenant.get_attach_mode() == AttachmentMode::Stale {
-            return;
-        }
-
-        // Create an entry in self.tenants if one doesn't already exist: this will later be updated
-        // with the completion time in on_completion.
-        let state = self
-            .tenants
-            .entry(*tenant.get_tenant_shard_id())
-            .or_insert_with(|| UploaderTenantState {
-                tenant: Arc::downgrade(&tenant),
-                last_upload: None,
-                next_upload: Some(Instant::now()),
-                last_digest: None,
+            let last_digest = state.last_digest;
+            result.jobs.push(UploadPending {
+                tenant,
+                last_digest,
+                target_time: state.next_upload,
+                period: Some(period),
            });
+        })
+        .await
+        .ok();

-        // Decline to do the upload if insufficient time has passed
-        if state.next_upload.map(|nu| &nu > now).unwrap_or(false) {
-            return;
-        }
+        result
+    }

-        let last_digest = state.last_digest;
-        self.tenants_pending.push_back(UploadPending {
+    fn spawn(
+        &mut self,
+        job: UploadPending,
+    ) -> (
+        WriteInProgress,
+        Pin<Box<dyn Future<Output = WriteComplete> + Send>>,
+    ) {
+        let UploadPending {
            tenant,
            last_digest,
-        })
-    }
+            target_time,
+            period,
+        } = job;

-    fn spawn_pending(&mut self) {
-        while !self.tenants_pending.is_empty()
-            && self.tenants_uploading.len() < self.concurrent_uploads
-        {
-            // unwrap: loop condition includes !is_empty()
-            let pending = self.tenants_pending.pop_front().unwrap();
-            self.spawn_upload(pending.tenant, pending.last_digest);
-        }
-    }
-
-    fn spawn_upload(&mut self, tenant: Arc<Tenant>, last_digest: Option<md5::Digest>) {
        let remote_storage = self.remote_storage.clone();
-        let tenant_shard_id = *tenant.get_tenant_shard_id();
        let (completion, barrier) = utils::completion::channel();
-        let result_tx = self.task_result_tx.clone();
-        self.tasks.spawn(async move {
+        let tenant_shard_id = *tenant.get_tenant_shard_id();
+        (WriteInProgress { barrier }, Box::pin(async move {
            // Guard for the barrier in [`WriteInProgress`]
            let _completion = completion;

@@ -362,22 +263,47 @@ impl HeatmapUploader {
            };

            let now = Instant::now();
+
+            // If the job had a target execution time, we may check our final execution
+            // time against that for observability purposes.
+            if let (Some(target_time), Some(period)) = (target_time, period) {
+                // Elapsed time includes any scheduling lag as well as the execution of the job
+                let elapsed = now.duration_since(target_time);
+
+                warn_when_period_overrun(elapsed, period, BackgroundLoopKind::HeatmapUpload);
+            }
+
            let next_upload = tenant
                .get_heatmap_period()
                .and_then(|period| now.checked_add(period));

-            result_tx
-                .send(WriteComplete {
+            WriteComplete {
                    tenant_shard_id: *tenant.get_tenant_shard_id(),
                    completed_at: now,
                    digest,
                    next_upload,
-                })
-                .ok();
-        });
+                }
+        }.instrument(info_span!(parent: None, "heatmap_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))))
+    }

-        self.tenants_uploading
-            .insert(tenant_shard_id, WriteInProgress { barrier });
+    fn on_command(&mut self, command: UploadCommand) -> anyhow::Result<UploadPending> {
+        let tenant_shard_id = command.get_tenant_shard_id();
+
+        tracing::info!(
+            tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
+            "Starting heatmap write on command");
+        let tenant = self
+            .tenant_manager
+            .get_attached_tenant_shard(*tenant_shard_id, true)
+            .map_err(|e| anyhow::anyhow!(e))?;
+
+        Ok(UploadPending {
+            // Ignore our state for last digest: this forces an upload even if nothing has changed
+            last_digest: None,
+            tenant,
+            target_time: None,
+            period: None,
+        })
    }

    #[instrument(skip_all, fields(tenant_id=%completion.tenant_shard_id.tenant_id, shard_id=%completion.tenant_shard_id.shard_slug()))]
@@ -389,7 +315,6 @@ impl HeatmapUploader {
            digest,
            next_upload,
        } = completion;
-        self.tenants_uploading.remove(&tenant_shard_id);
        use std::collections::hash_map::Entry;
        match self.tenants.entry(tenant_shard_id) {
            Entry::Vacant(_) => {
@@ -402,69 +327,6 @@ impl HeatmapUploader {
            }
        }
    }
-
-    fn handle_command(
-        &mut self,
-        command: UploadCommand,
-        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
-    ) {
-        match command {
-            UploadCommand::Upload(tenant_shard_id) => {
-                // If an upload was ongoing for this tenant, let it finish first.
-                let barrier = if let Some(writing_state) =
-                    self.tenants_uploading.get(&tenant_shard_id)
-                {
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap write to complete");
-                    writing_state.barrier.clone()
-                } else {
-                    // Spawn the upload then immediately wait for it.  This will block processing of other commands and
-                    // starting of other background work.
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Starting heatmap write on command");
-                    let tenant = match self
-                        .tenant_manager
-                        .get_attached_tenant_shard(tenant_shard_id, true)
-                    {
-                        Ok(t) => t,
-                        Err(e) => {
-                            // Drop result of send: we don't care if caller dropped their receiver
-                            drop(response_tx.send(CommandResponse {
-                                result: Err(e.into()),
-                            }));
-                            return;
-                        }
-                    };
-                    self.spawn_upload(tenant, None);
-                    let writing_state = self
-                        .tenants_uploading
-                        .get(&tenant_shard_id)
-                        .expect("We just inserted this");
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Waiting for heatmap upload to complete");
-
-                    writing_state.barrier.clone()
-                };
-
-                // This task does no I/O: it only listens for a barrier's completion and then
-                // sends to the command response channel.  It is therefore safe to spawn this without
-                // any gates/task_mgr hooks.
-                tokio::task::spawn(async move {
-                    barrier.wait().await;
-
-                    tracing::info!(
-                        tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
-                        "Heatmap upload complete");
-
-                    // Drop result of send: we don't care if caller dropped their receiver
-                    drop(response_tx.send(CommandResponse { result: Ok(()) }))
-                });
-            }
-        }
-    }
 }

 enum UploadHeatmapOutcome {
@@ -487,7 +349,6 @@ enum UploadHeatmapError {

 /// The inner upload operation.  This will skip if `last_digest` is Some and matches the digest
 /// of the object we would have uploaded.
-#[instrument(skip_all, fields(tenant_id = %tenant.get_tenant_shard_id().tenant_id, shard_id = %tenant.get_tenant_shard_id().shard_slug()))]
 async fn upload_tenant_heatmap(
    remote_storage: GenericRemoteStorage,
    tenant: &Arc<Tenant>,
--- a/pageserver/src/tenant/secondary/scheduler.rs
+++ b/pageserver/src/tenant/secondary/scheduler.rs
@@ -0,0 +1,359 @@
+use futures::Future;
+use std::{
+    collections::HashMap,
+    marker::PhantomData,
+    pin::Pin,
+    time::{Duration, Instant},
+};
+
+use pageserver_api::shard::TenantShardId;
+use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
+use utils::{completion::Barrier, yielding_loop::yielding_loop};
+
+use super::{CommandRequest, CommandResponse};
+
+/// Scheduling interval is the time between calls to JobGenerator::schedule.
+/// When we schedule jobs, the job generator may provide a hint of its preferred
+/// interval, which we will respect within these bounds.
+const MAX_SCHEDULING_INTERVAL: Duration = Duration::from_secs(10);
+const MIN_SCHEDULING_INTERVAL: Duration = Duration::from_secs(1);
+
+/// Scheduling helper for background work across many tenants.
+///
+/// Systems that need to run background work across many tenants may use this type
+/// to schedule jobs within a concurrency limit, along with their own [`JobGenerator`]
+/// implementation to provide the work to execute.  This is a simple scheduler that just
+/// polls the generator for outstanding work, replacing its queue of pending work with
+/// what the generator yields on each call: the job generator can change its mind about
+/// the order of jobs between calls.  The job generator is notified when jobs complete,
+/// and additionally may expose a command hook to generate jobs on-demand (e.g. to implement
+/// admin APIs).
+///
+/// For an example see [`crate::tenant::secondary::heatmap_uploader`]
+///
+/// G: A JobGenerator that this scheduler will poll to find pending jobs
+/// PJ: 'Pending Job': type for job descriptors that are ready to run
+/// RJ: 'Running Job': type for jobs that have been spawned
+/// C : 'Completion' type that spawned jobs will send when they finish
+/// CMD: 'Command' type that the job generator will accept to create jobs on-demand
+pub(super) struct TenantBackgroundJobs<G, PJ, RJ, C, CMD>
+where
+    G: JobGenerator<PJ, RJ, C, CMD>,
+    C: Completion,
+    PJ: PendingJob,
+    RJ: RunningJob,
+{
+    generator: G,
+
+    /// Ready to run.  Will progress to `running` once concurrent limit is satisfied, or
+    /// be removed on next scheduling pass.
+    pending: std::collections::VecDeque<PJ>,
+
+    /// Tasks currently running in Self::tasks for these tenants.  Check this map
+    /// before pushing more work into pending for the same tenant.
+    running: HashMap<TenantShardId, RJ>,
+
+    tasks: JoinSet<C>,
+
+    concurrency: usize,
+
+    /// How often we would like schedule_iteration to be called.
+    pub(super) scheduling_interval: Duration,
+
+    _phantom: PhantomData<(PJ, RJ, C, CMD)>,
+}
+
+pub(crate) trait JobGenerator<PJ, RJ, C, CMD>
+where
+    C: Completion,
+    PJ: PendingJob,
+    RJ: RunningJob,
+{
+    /// Called at each scheduling interval.  Return a list of jobs to run, most urgent first.
+    ///
+    /// This function may be expensive (e.g. walk all tenants), but should not do any I/O.
+    /// Implementations should take care to yield the executor periodically if running
+    /// very long loops.
+    ///
+    /// Yielding a job here does _not_ guarantee that it will run: if the queue of pending
+    /// jobs is not drained by the next scheduling interval, pending jobs will be cleared
+    /// and re-generated.
+    async fn schedule(&mut self) -> SchedulingResult<PJ>;
+
+    /// Called when a pending job is ready to be run.
+    ///
+    /// The job generator provides a future, and an RJ (Running Job) descriptor that tracks it.
+    fn spawn(&mut self, pending_job: PJ) -> (RJ, Pin<Box<dyn Future<Output = C> + Send>>);
+
+    /// Called when a job previously spawned with spawn() transmits its completion
+    fn on_completion(&mut self, completion: C);
+
+    /// Called when a command is received.  A job will be spawned immediately if the return
+    /// value is Ok, ignoring concurrency limits and the pending queue.
+    fn on_command(&mut self, cmd: CMD) -> anyhow::Result<PJ>;
+}
+
+/// [`JobGenerator`] returns this to provide pending jobs, and hints about scheduling
+pub(super) struct SchedulingResult<PJ> {
+    pub(super) jobs: Vec<PJ>,
+    /// The job generator would like to be called again this soon
+    pub(super) want_interval: Option<Duration>,
+}
+
+/// See [`TenantBackgroundJobs`].
+pub(super) trait PendingJob {
+    fn get_tenant_shard_id(&self) -> &TenantShardId;
+}
+
+/// See [`TenantBackgroundJobs`].
+pub(super) trait Completion: Send + 'static {
+    fn get_tenant_shard_id(&self) -> &TenantShardId;
+}
+
+/// See [`TenantBackgroundJobs`].
+pub(super) trait RunningJob {
+    fn get_barrier(&self) -> Barrier;
+}
+
+impl<G, PJ, RJ, C, CMD> TenantBackgroundJobs<G, PJ, RJ, C, CMD>
+where
+    C: Completion,
+    PJ: PendingJob,
+    RJ: RunningJob,
+    G: JobGenerator<PJ, RJ, C, CMD>,
+{
+    pub(super) fn new(generator: G, concurrency: usize) -> Self {
+        Self {
+            generator,
+            pending: std::collections::VecDeque::new(),
+            running: HashMap::new(),
+            tasks: JoinSet::new(),
+            concurrency,
+            scheduling_interval: MAX_SCHEDULING_INTERVAL,
+            _phantom: PhantomData,
+        }
+    }
+
+    pub(super) async fn run(
+        &mut self,
+        mut command_queue: tokio::sync::mpsc::Receiver<CommandRequest<CMD>>,
+        background_jobs_can_start: Barrier,
+        cancel: CancellationToken,
+    ) {
+        tracing::info!("Waiting for background_jobs_can start...");
+        background_jobs_can_start.wait().await;
+        tracing::info!("background_jobs_can is ready, proceeding.");
+
+        while !cancel.is_cancelled() {
+            // Look for new work: this is relatively expensive because we have to go acquire the lock on
+            // the tenant manager to retrieve tenants, and then iterate over them to figure out which ones
+            // require an upload.
+            self.schedule_iteration(&cancel).await;
+
+            if cancel.is_cancelled() {
+                return;
+            }
+
+            // Schedule some work, if concurrency limit permits it
+            self.spawn_pending();
+
+            // Between scheduling iterations, we will:
+            //  - Drain any complete tasks and spawn pending tasks
+            //  - Handle incoming administrative commands
+            //  - Check our cancellation token
+            let next_scheduling_iteration = Instant::now()
+                .checked_add(self.scheduling_interval)
+                .unwrap_or_else(|| {
+                    tracing::warn!(
+                        "Scheduling interval invalid ({}s)",
+                        self.scheduling_interval.as_secs_f64()
+                    );
+                    // unwrap(): this constant is small, cannot fail to add to time unless
+                    // we are close to the end of the universe.
+                    Instant::now().checked_add(MIN_SCHEDULING_INTERVAL).unwrap()
+                });
+            loop {
+                tokio::select! {
+                    _ = cancel.cancelled() => {
+                        tracing::info!("joining tasks");
+                        // We do not simply drop the JoinSet, in order to have an orderly shutdown without cancellation.
+                        // It is the caller's responsibility to make sure that the tasks they scheduled
+                        // respect an appropriate cancellation token, to shut down promptly.  It is only
+                        // safe to wait on joining these tasks because we can see the cancellation token
+                        // has been set.
+                        while let Some(_r) = self.tasks.join_next().await {}
+                        tracing::info!("terminating on cancellation token.");
+
+                        break;
+                    },
+                    _ = tokio::time::sleep(next_scheduling_iteration.duration_since(Instant::now())) => {
+                        tracing::debug!("woke for scheduling interval");
+                        break;},
+                    cmd = command_queue.recv() => {
+                        tracing::debug!("woke for command queue");
+                        let cmd = match cmd {
+                            Some(c) =>c,
+                            None => {
+                                // SecondaryController was destroyed, and this has raced with
+                                // our CancellationToken
+                                tracing::info!("terminating on command queue destruction");
+                                cancel.cancel();
+                                break;
+                            }
+                        };
+
+                        let CommandRequest{
+                            response_tx,
+                            payload
+                        } = cmd;
+                        self.handle_command(payload, response_tx);
+                    },
+                    _ = async {
+                        let completion = self.process_next_completion().await;
+                        match completion {
+                            Some(c) => {
+                                self.generator.on_completion(c);
+                                if !cancel.is_cancelled() {
+                                    self.spawn_pending();
+                                }
+                            },
+                            None => {
+                                // Nothing is running, so just wait: expect that this future
+                                // will be dropped when something in the outer select! fires.
+                                cancel.cancelled().await;
+                            }
+                        }
+
+                     } => {}
+                }
+            }
+        }
+    }
+
+    fn do_spawn(&mut self, job: PJ) {
+        let tenant_shard_id = *job.get_tenant_shard_id();
+        let (in_progress, fut) = self.generator.spawn(job);
+
+        self.tasks.spawn(fut);
+
+        self.running.insert(tenant_shard_id, in_progress);
+    }
+
+    /// For all pending tenants that are eligible for execution, spawn their task.
+    ///
+    /// Caller provides the spawn operation, we track the resulting execution.
+    fn spawn_pending(&mut self) {
+        while !self.pending.is_empty() && self.running.len() < self.concurrency {
+            // unwrap: loop condition includes !is_empty()
+            let pending = self.pending.pop_front().unwrap();
+            self.do_spawn(pending);
+        }
+    }
+
+    /// For administrative commands: skip the pending queue, ignore concurrency limits
+    fn spawn_now(&mut self, job: PJ) -> &RJ {
+        let tenant_shard_id = *job.get_tenant_shard_id();
+        self.do_spawn(job);
+        self.running
+            .get(&tenant_shard_id)
+            .expect("We just inserted this")
+    }
+
+    /// Wait until the next task completes, and handle its completion
+    ///
+    /// Cancellation: this method is cancel-safe.
+    async fn process_next_completion(&mut self) -> Option<C> {
+        match self.tasks.join_next().await {
+            Some(r) => {
+                // We use a channel to drive completions, but also
+                // need to drain the JoinSet to avoid completed tasks
+                // accumulating.  These calls are 1:1 because every task
+                // we spawn into this joinset submits its result to the channel.
+                let completion = r.expect("Panic in background task");
+
+                self.running.remove(completion.get_tenant_shard_id());
+                Some(completion)
+            }
+            None => {
+                // Nothing is running, so we have nothing to wait for.  We may drop out: the
+                // main event loop will call us again after the next time it has run something.
+                None
+            }
+        }
+    }
+
+    /// Convert the command into a pending job, spawn it, and when the spawned
+    /// job completes, send the result down `response_tx`.
+    fn handle_command(
+        &mut self,
+        cmd: CMD,
+        response_tx: tokio::sync::oneshot::Sender<CommandResponse>,
+    ) {
+        let job = match self.generator.on_command(cmd) {
+            Ok(j) => j,
+            Err(e) => {
+                response_tx.send(CommandResponse { result: Err(e) }).ok();
+                return;
+            }
+        };
+
+        let tenant_shard_id = job.get_tenant_shard_id();
+        let barrier = if let Some(barrier) = self.get_running(tenant_shard_id) {
+            barrier
+        } else {
+            let running = self.spawn_now(job);
+            running.get_barrier().clone()
+        };
+
+        // This task does no I/O: it only listens for a barrier's completion and then
+        // sends to the command response channel.  It is therefore safe to spawn this without
+        // any gates/task_mgr hooks.
+        tokio::task::spawn(async move {
+            barrier.wait().await;
+
+            response_tx.send(CommandResponse { result: Ok(()) }).ok();
+        });
+    }
+
+    fn get_running(&self, tenant_shard_id: &TenantShardId) -> Option<Barrier> {
+        self.running.get(tenant_shard_id).map(|r| r.get_barrier())
+    }
+
+    /// Periodic execution phase: inspect all attached tenants and schedule any work they require.
+    ///
+    /// The type in `tenants` should be a tenant-like structure, e.g. [`crate::tenant::Tenant`] or [`crate::tenant::secondary::SecondaryTenant`]
+    ///
+    /// This function resets the pending list: it is assumed that the caller may change their mind about
+    /// which tenants need work between calls to schedule_iteration.
+    async fn schedule_iteration(&mut self, cancel: &CancellationToken) {
+        let SchedulingResult {
+            jobs,
+            want_interval,
+        } = self.generator.schedule().await;
+
+        // Adjust interval based on feedback from the job generator
+        if let Some(want_interval) = want_interval {
+            // Calculation uses second granularity: this scheduler is not intended for high frequency tasks
+            self.scheduling_interval = Duration::from_secs(std::cmp::min(
+                std::cmp::max(MIN_SCHEDULING_INTERVAL.as_secs(), want_interval.as_secs()),
+                MAX_SCHEDULING_INTERVAL.as_secs(),
+            ));
+        }
+
+        // The priority order of previously scheduled work may be invalidated by current state: drop
+        // all pending work (it will be re-scheduled if still needed)
+        self.pending.clear();
+
+        // While iterating over the potentially-long list of tenants, we will periodically yield
+        // to avoid blocking executor.
+        yielding_loop(1000, cancel, jobs.into_iter(), |job| {
+            // Skip tenants that already have a write in flight
+            if !self.running.contains_key(job.get_tenant_shard_id()) {
+                self.pending.push_back(job);
+            }
+        })
+        .await
+        .ok();
+    }
+}
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -320,8 +320,8 @@ impl DeltaLayer {
            .metadata()
            .context("get file metadata to determine size")?;

-        // TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
-        // we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
+        // This function is never used for constructing layers in a running pageserver,
+        // so it does not need an accurate TenantShardId.
        let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);

        Ok(DeltaLayer {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -278,8 +278,8 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;

-        // TODO(sharding): we should get TenantShardId from path.
-        // OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
+        // This function is never used for constructing layers in a running pageserver,
+        // so it does not need an accurate TenantShardId.
        let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);

        Ok(ImageLayer {
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -23,7 +23,7 @@ use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
 use std::ops::Range;
-use tokio::sync::RwLock;
+use tokio::sync::{RwLock, RwLockWriteGuard};

 use super::{DeltaLayerWriter, ResidentLayer};

@@ -246,16 +246,43 @@ impl InMemoryLayer {

    /// Common subroutine of the public put_wal_record() and put_page_image() functions.
    /// Adds the page version to the in-memory tree
-    pub async fn put_value(
+    pub(crate) async fn put_value(
        &self,
        key: Key,
        lsn: Lsn,
        val: &Value,
        ctx: &RequestContext,
    ) -> Result<()> {
-        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
-        let inner: &mut _ = &mut *self.inner.write().await;
+        let mut inner = self.inner.write().await;
        self.assert_writable();
+        self.put_value_locked(&mut inner, key, lsn, val, ctx).await
+    }
+
+    pub(crate) async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        let mut inner = self.inner.write().await;
+        self.assert_writable();
+        for (key, vals) in values {
+            for (lsn, val) in vals {
+                self.put_value_locked(&mut inner, *key, *lsn, val, ctx)
+                    .await?;
+            }
+        }
+        Ok(())
+    }
+
+    async fn put_value_locked(
+        &self,
+        locked_inner: &mut RwLockWriteGuard<'_, InMemoryLayerInner>,
+        key: Key,
+        lsn: Lsn,
+        val: &Value,
+        ctx: &RequestContext,
+    ) -> Result<()> {
+        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);

        let off = {
            // Avoid doing allocations for "small" values.
@@ -264,7 +291,7 @@ impl InMemoryLayer {
            let mut buf = smallvec::SmallVec::<[u8; 256]>::new();
            buf.clear();
            val.ser_into(&mut buf)?;
-            inner
+            locked_inner
                .file
                .write_blob(
                    &buf,
@@ -275,7 +302,7 @@ impl InMemoryLayer {
                .await?
        };

-        let vec_map = inner.index.entry(key).or_default();
+        let vec_map = locked_inner.index.entry(key).or_default();
        let old = vec_map.append_or_update_last(lsn, off).unwrap().0;
        if old.is_some() {
            // We already had an entry for this LSN. That's odd..
@@ -285,13 +312,11 @@ impl InMemoryLayer {
        Ok(())
    }

-    pub async fn put_tombstone(&self, _key_range: Range<Key>, _lsn: Lsn) -> Result<()> {
+    pub(crate) async fn put_tombstones(&self, _key_ranges: &[(Range<Key>, Lsn)]) -> Result<()> {
        // TODO: Currently, we just leak the storage for any deleted keys
-
        Ok(())
    }

-    /// Make the layer non-writeable. Only call once.
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub async fn freeze(&self, end_lsn: Lsn) {
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -15,7 +15,7 @@ use utils::sync::heavier_once_cell;
 use crate::config::PageServerConf;
 use crate::context::RequestContext;
 use crate::repository::Key;
-use crate::tenant::{remote_timeline_client::LayerFileMetadata, RemoteTimelineClient, Timeline};
+use crate::tenant::{remote_timeline_client::LayerFileMetadata, Timeline};

 use super::delta_layer::{self, DeltaEntry};
 use super::image_layer;
@@ -204,17 +204,14 @@ impl Layer {
    ///
    /// Technically cancellation safe, but cancelling might shift the viewpoint of what generation
    /// of download-evict cycle on retry.
-    pub(crate) async fn evict_and_wait(
-        &self,
-        rtc: &RemoteTimelineClient,
-    ) -> Result<(), EvictionError> {
-        self.0.evict_and_wait(rtc).await
+    pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
+        self.0.evict_and_wait().await
    }

    /// Delete the layer file when the `self` gets dropped, also try to schedule a remote index upload
    /// then.
    ///
-    /// On drop, this will cause a call to [`RemoteTimelineClient::schedule_deletion_of_unlinked`].
+    /// On drop, this will cause a call to [`crate::tenant::remote_timeline_client::RemoteTimelineClient::schedule_deletion_of_unlinked`].
    /// This means that the unlinking by [gc] or [compaction] must have happened strictly before
    /// the value this is called on gets dropped.
    ///
@@ -606,10 +603,7 @@ impl LayerInner {

    /// Cancellation safe, however dropping the future and calling this method again might result
    /// in a new attempt to evict OR join the previously started attempt.
-    pub(crate) async fn evict_and_wait(
-        &self,
-        _: &RemoteTimelineClient,
-    ) -> Result<(), EvictionError> {
+    pub(crate) async fn evict_and_wait(&self) -> Result<(), EvictionError> {
        use tokio::sync::broadcast::error::RecvError;

        assert!(self.have_remote_client);
@@ -945,8 +939,18 @@ impl LayerInner {
            Ok((Err(e), _permit)) => {
                // sleep already happened in the spawned task, if it was not cancelled
                let consecutive_failures = self.consecutive_failures.load(Ordering::Relaxed);
-                tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
-                Err(DownloadError::DownloadFailed)
+
+                match e.downcast_ref::<remote_storage::DownloadError>() {
+                    // If the download failed due to its cancellation token,
+                    // propagate the cancellation error upstream.
+                    Some(remote_storage::DownloadError::Cancelled) => {
+                        Err(DownloadError::DownloadCancelled)
+                    }
+                    _ => {
+                        tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
+                        Err(DownloadError::DownloadFailed)
+                    }
+                }
            }
            Err(_gone) => Err(DownloadError::DownloadCancelled),
        }
@@ -1118,6 +1122,7 @@ impl LayerInner {
                        tracing::info!("evicted layer after unknown residence period");
                    }
                }
+                timeline.metrics.evictions.inc();
                timeline
                    .metrics
                    .resident_physical_size_sub(self.desc.file_size);
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -45,6 +45,8 @@ pub(crate) enum BackgroundLoopKind {
    ConsumptionMetricsCollectMetrics,
    ConsumptionMetricsSyntheticSizeWorker,
    InitialLogicalSizeCalculation,
+    HeatmapUpload,
+    SecondaryDownload,
 }

 impl BackgroundLoopKind {
@@ -63,6 +65,11 @@ pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
        .with_label_values(&[loop_kind.as_static_str()])
        .guard();

+    pausable_failpoint!(
+        "initial-size-calculation-permit-pause",
+        loop_kind == BackgroundLoopKind::InitialLogicalSizeCalculation
+    );
+
    match CONCURRENT_BACKGROUND_TASKS.acquire().await {
        Ok(permit) => permit,
        Err(_closed) => unreachable!("we never close the semaphore"),
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -15,9 +15,10 @@ use fail::fail_point;
 use itertools::Itertools;
 use pageserver_api::{
    models::{
-        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
-        TimelineState,
+        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, EvictionPolicy,
+        LayerMapInfo, TimelineState,
    },
+    reltag::BlockNumber,
    shard::{ShardIdentity, TenantShardId},
 };
 use rand::Rng;
@@ -42,33 +43,38 @@ use std::{
    ops::ControlFlow,
 };

-use crate::context::{
-    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
-};
-use crate::tenant::storage_layer::delta_layer::DeltaEntry;
-use crate::tenant::storage_layer::{
-    AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
-    LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
-    ValueReconstructState,
-};
-use crate::tenant::tasks::BackgroundLoopKind;
 use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    layer_map::{LayerMap, SearchResult},
    metadata::{save_metadata, TimelineMetadata},
    par_fsync,
 };
+use crate::{
+    context::{AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder},
+    disk_usage_eviction_task::DiskUsageEvictionInfo,
+};
 use crate::{deletion_queue::DeletionQueueClient, tenant::remote_timeline_client::StopError};
+use crate::{
+    disk_usage_eviction_task::finite_f32,
+    tenant::storage_layer::{
+        AsLayerDesc, DeltaLayerWriter, EvictionError, ImageLayerWriter, InMemoryLayer, Layer,
+        LayerAccessStatsReset, LayerFileName, ResidentLayer, ValueReconstructResult,
+        ValueReconstructState,
+    },
+};
+use crate::{
+    disk_usage_eviction_task::EvictionCandidate, tenant::storage_layer::delta_layer::DeltaEntry,
+};
+use crate::{pgdatadir_mapping::LsnForTimestamp, tenant::tasks::BackgroundLoopKind};

 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace, KeySpaceRandomAccum};
 use crate::metrics::{
    TimelineMetrics, MATERIALIZED_PAGE_CACHE_HIT, MATERIALIZED_PAGE_CACHE_HIT_DIRECT,
 };
-use crate::pgdatadir_mapping::LsnForTimestamp;
+use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 use crate::pgdatadir_mapping::{is_inherited_key, is_rel_fsm_block_key, is_rel_vm_block_key};
-use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
-use crate::tenant::config::{EvictionPolicy, TenantConfOpt};
+use crate::tenant::config::TenantConfOpt;
 use pageserver_api::reltag::RelTag;
 use pageserver_api::shard::ShardIndex;

@@ -246,6 +252,10 @@ pub struct Timeline {

    pub(super) metrics: TimelineMetrics,

+    // `Timeline` doesn't write these metrics itself, but it manages the lifetime.  Code
+    // in `crate::page_service` writes these metrics.
+    pub(crate) query_metrics: crate::metrics::SmgrQueryTimePerTimeline,
+
    /// Ensures layers aren't frozen by checkpointer between
    /// [`Timeline::get_layer_for_write`] and layer reads.
    /// Locked automatically by [`TimelineWriter`] and checkpointer.
@@ -373,15 +383,20 @@ pub struct GcInfo {
 }

 /// An error happened in a get() operation.
-#[derive(thiserror::Error)]
-pub enum PageReconstructError {
+#[derive(thiserror::Error, Debug)]
+pub(crate) enum PageReconstructError {
    #[error(transparent)]
    Other(#[from] anyhow::Error),

+    #[error("Ancestor LSN wait error: {0}")]
+    AncestorLsnTimeout(#[from] WaitLsnError),
+
    /// The operation was cancelled
+    #[error("Cancelled")]
    Cancelled,

    /// The ancestor of this is being stopped
+    #[error("ancestor timeline {0} is being stopped")]
    AncestorStopping(TimelineId),

    /// An error happened replaying WAL records
@@ -402,32 +417,6 @@ enum FlushLayerError {
    Other(#[from] anyhow::Error),
 }

-impl std::fmt::Debug for PageReconstructError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
-        match self {
-            Self::Other(err) => err.fmt(f),
-            Self::Cancelled => write!(f, "cancelled"),
-            Self::AncestorStopping(timeline_id) => {
-                write!(f, "ancestor timeline {timeline_id} is being stopped")
-            }
-            Self::WalRedo(err) => err.fmt(f),
-        }
-    }
-}
-
-impl std::fmt::Display for PageReconstructError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> Result<(), std::fmt::Error> {
-        match self {
-            Self::Other(err) => err.fmt(f),
-            Self::Cancelled => write!(f, "cancelled"),
-            Self::AncestorStopping(timeline_id) => {
-                write!(f, "ancestor timeline {timeline_id} is being stopped")
-            }
-            Self::WalRedo(err) => err.fmt(f),
-        }
-    }
-}
-
 #[derive(Clone, Copy)]
 pub enum LogicalSizeCalculationCause {
    Initial,
@@ -452,6 +441,21 @@ impl std::fmt::Debug for Timeline {
    }
 }

+#[derive(thiserror::Error, Debug)]
+pub(crate) enum WaitLsnError {
+    // Called on a timeline which is shutting down
+    #[error("Shutdown")]
+    Shutdown,
+
+    // Called on a timeline not in active state or shutting down
+    #[error("Bad state (not active)")]
+    BadState,
+
+    // Timeout expired while waiting for LSN to catch up with goal.
+    #[error("{0}")]
+    Timeout(String),
+}
+
 /// Public interface functions
 impl Timeline {
    /// Get the LSN where this branch was created
@@ -486,7 +490,7 @@ impl Timeline {
    /// # Cancel-Safety
    ///
    /// This method is cancellation-safe.
-    pub async fn get(
+    pub(crate) async fn get(
        &self,
        key: Key,
        lsn: Lsn,
@@ -496,6 +500,11 @@ impl Timeline {
            return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
        }

+        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
+        // already checked the key against the shard_identity when looking up the Timeline from
+        // page_service.
+        debug_assert!(!self.shard_identity.is_key_disposable(&key));
+
        // XXX: structured stats collection for layer eviction here.
        trace!(
            "get page request for {}@{} from task kind {:?}",
@@ -629,24 +638,28 @@ impl Timeline {
    /// You should call this before any of the other get_* or list_* functions. Calling
    /// those functions with an LSN that has been processed yet is an error.
    ///
-    pub async fn wait_lsn(
+    pub(crate) async fn wait_lsn(
        &self,
        lsn: Lsn,
        _ctx: &RequestContext, /* Prepare for use by cancellation */
-    ) -> anyhow::Result<()> {
-        anyhow::ensure!(self.is_active(), "Cannot wait for Lsn on inactive timeline");
+    ) -> Result<(), WaitLsnError> {
+        if self.cancel.is_cancelled() {
+            return Err(WaitLsnError::Shutdown);
+        } else if !self.is_active() {
+            return Err(WaitLsnError::BadState);
+        }

        // This should never be called from the WAL receiver, because that could lead
        // to a deadlock.
-        anyhow::ensure!(
+        debug_assert!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverManager),
            "wait_lsn cannot be called in WAL receiver"
        );
-        anyhow::ensure!(
+        debug_assert!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionHandler),
            "wait_lsn cannot be called in WAL receiver"
        );
-        anyhow::ensure!(
+        debug_assert!(
            task_mgr::current_task_kind() != Some(TaskKind::WalReceiverConnectionPoller),
            "wait_lsn cannot be called in WAL receiver"
        );
@@ -660,18 +673,22 @@ impl Timeline {
        {
            Ok(()) => Ok(()),
            Err(e) => {
-                // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
-                drop(_timer);
-                let walreceiver_status = self.walreceiver_status();
-                Err(anyhow::Error::new(e).context({
-                    format!(
+                use utils::seqwait::SeqWaitError::*;
+                match e {
+                    Shutdown => Err(WaitLsnError::Shutdown),
+                    Timeout => {
+                        // don't count the time spent waiting for lock below, and also in walreceiver.status(), towards the wait_lsn_time_histo
+                        drop(_timer);
+                        let walreceiver_status = self.walreceiver_status();
+                        Err(WaitLsnError::Timeout(format!(
                        "Timed out while waiting for WAL record at LSN {} to arrive, last_record_lsn {} disk consistent LSN={}, WalReceiver status: {}",
                        lsn,
                        self.get_last_record_lsn(),
                        self.get_disk_consistent_lsn(),
                        walreceiver_status,
-                    )
-                }))
+                    )))
+                    }
+                }
            }
        }
    }
@@ -1127,12 +1144,7 @@ impl Timeline {
            return Ok(None);
        };

-        let rtc = self
-            .remote_client
-            .as_ref()
-            .ok_or_else(|| anyhow::anyhow!("remote storage not configured; cannot evict"))?;
-
-        match local_layer.evict_and_wait(rtc).await {
+        match local_layer.evict_and_wait().await {
            Ok(()) => Ok(Some(true)),
            Err(EvictionError::NotFound) => Ok(Some(false)),
            Err(EvictionError::Downloaded) => Ok(Some(false)),
@@ -1307,6 +1319,11 @@ impl Timeline {
                    ),
                ),

+                query_metrics: crate::metrics::SmgrQueryTimePerTimeline::new(
+                    &tenant_shard_id,
+                    &timeline_id,
+                ),
+
                flush_loop_state: Mutex::new(FlushLoopState::NotStarted),

                layer_flush_start_tx,
@@ -1459,6 +1476,7 @@ impl Timeline {
                max_lsn_wal_lag,
                auth_token: crate::config::SAFEKEEPER_AUTH_TOKEN.get().cloned(),
                availability_zone: self.conf.availability_zone.clone(),
+                ingest_batch_size: self.conf.ingest_batch_size,
            },
            broker_client,
            ctx,
@@ -2095,7 +2113,7 @@ impl Timeline {
        let layer_file_names = eviction_info
            .resident_layers
            .iter()
-            .map(|l| l.layer.layer_desc().filename())
+            .map(|l| l.layer.get_name())
            .collect::<Vec<_>>();

        let decorated = match remote_client.get_layers_metadata(layer_file_names) {
@@ -2113,7 +2131,7 @@ impl Timeline {
        .filter_map(|(layer, remote_info)| {
            remote_info.map(|remote_info| {
                HeatMapLayer::new(
-                    layer.layer.layer_desc().filename(),
+                    layer.layer.get_name(),
                    IndexLayerMetadata::from(remote_info),
                    layer.last_activity_ts,
                )
@@ -2223,13 +2241,13 @@ impl Timeline {
                    return Err(layer_traversal_error(
                        if cfg!(test) {
                            format!(
-                                "could not find data for key {} at LSN {}, for request at LSN {}\n{}",
-                                key, cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
+                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}\n{}",
+                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn, std::backtrace::Backtrace::force_capture(),
                            )
                        } else {
                            format!(
-                                "could not find data for key {} at LSN {}, for request at LSN {}",
-                                key, cont_lsn, request_lsn
+                                "could not find data for key {} (shard {:?}) at LSN {}, for request at LSN {}",
+                                key, self.shard_identity.get_shard_number(&key), cont_lsn, request_lsn
                            )
                        },
                        traversal_path,
@@ -2289,11 +2307,12 @@ impl Timeline {
                ancestor
                    .wait_lsn(timeline.ancestor_lsn, ctx)
                    .await
-                    .with_context(|| {
-                        format!(
-                            "wait for lsn {} on ancestor timeline_id={}",
-                            timeline.ancestor_lsn, ancestor.timeline_id
-                        )
+                    .map_err(|e| match e {
+                        e @ WaitLsnError::Timeout(_) => PageReconstructError::AncestorLsnTimeout(e),
+                        WaitLsnError::Shutdown => PageReconstructError::Cancelled,
+                        e @ WaitLsnError::BadState => {
+                            PageReconstructError::Other(anyhow::anyhow!(e))
+                        }
                    })?;

                timeline_owned = ancestor;
@@ -2471,9 +2490,27 @@ impl Timeline {
        Ok(())
    }

-    async fn put_tombstone(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
-        let layer = self.get_layer_for_write(lsn).await?;
-        layer.put_tombstone(key_range, lsn).await?;
+    async fn put_values(
+        &self,
+        values: &HashMap<Key, Vec<(Lsn, Value)>>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        // Pick the first LSN in the batch to get the layer to write to.
+        for lsns in values.values() {
+            if let Some((lsn, _)) = lsns.first() {
+                let layer = self.get_layer_for_write(*lsn).await?;
+                layer.put_values(values, ctx).await?;
+                break;
+            }
+        }
+        Ok(())
+    }
+
+    async fn put_tombstones(&self, tombstones: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        if let Some((_, lsn)) = tombstones.first() {
+            let layer = self.get_layer_for_write(*lsn).await?;
+            layer.put_tombstones(tombstones).await?;
+        }
        Ok(())
    }

@@ -3035,6 +3072,15 @@ impl Timeline {
                for range in &partition.ranges {
                    let mut key = range.start;
                    while key < range.end {
+                        if self.shard_identity.is_key_disposable(&key) {
+                            debug!(
+                                "Dropping key {} during compaction (it belongs on shard {:?})",
+                                key,
+                                self.shard_identity.get_shard_number(&key)
+                            );
+                            key = key.next();
+                            continue;
+                        }
                        let img = match self.get(key, lsn, ctx).await {
                            Ok(img) => img,
                            Err(err) => {
@@ -3061,6 +3107,7 @@ impl Timeline {
                                }
                            }
                        };
+
                        image_layer_writer.put_image(key, &img).await?;
                        key = key.next();
                    }
@@ -3094,11 +3141,13 @@ impl Timeline {
            .await
            .context("fsync of newly created layer files")?;

-        par_fsync::par_fsync_async(&[self
-            .conf
-            .timeline_path(&self.tenant_shard_id, &self.timeline_id)])
-        .await
-        .context("fsync of timeline dir")?;
+        if !all_paths.is_empty() {
+            par_fsync::par_fsync_async(&[self
+                .conf
+                .timeline_path(&self.tenant_shard_id, &self.timeline_id)])
+            .await
+            .context("fsync of timeline dir")?;
+        }

        let mut guard = self.layers.write().await;

@@ -3631,7 +3680,15 @@ impl Timeline {
                )))
            });

-            writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            if !self.shard_identity.is_key_disposable(&key) {
+                writer.as_mut().unwrap().put_value(key, lsn, value).await?;
+            } else {
+                debug!(
+                    "Dropping key {} during compaction (it belongs on shard {:?})",
+                    key,
+                    self.shard_identity.get_shard_number(&key)
+                );
+            }

            if !new_layers.is_empty() {
                fail_point!("after-timeline-compacted-first-L1");
@@ -4186,7 +4243,7 @@ impl Timeline {
                    .context("Failed to reconstruct a page image:")
                {
                    Ok(img) => img,
-                    Err(e) => return Err(PageReconstructError::from(e)),
+                    Err(e) => return Err(PageReconstructError::WalRedo(e)),
                };

                if img.len() == page_cache::PAGE_SZ {
@@ -4377,43 +4434,6 @@ impl Timeline {
    }
 }

-pub(crate) struct DiskUsageEvictionInfo {
-    /// Timeline's largest layer (remote or resident)
-    pub max_layer_size: Option<u64>,
-    /// Timeline's resident layers
-    pub resident_layers: Vec<LocalLayerInfoForDiskUsageEviction>,
-}
-
-pub(crate) struct LocalLayerInfoForDiskUsageEviction {
-    pub layer: Layer,
-    pub last_activity_ts: SystemTime,
-}
-
-impl std::fmt::Debug for LocalLayerInfoForDiskUsageEviction {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // format the tv_sec, tv_nsec into rfc3339 in case someone is looking at it
-        // having to allocate a string to this is bad, but it will rarely be formatted
-        let ts = chrono::DateTime::<chrono::Utc>::from(self.last_activity_ts);
-        let ts = ts.to_rfc3339_opts(chrono::SecondsFormat::Nanos, true);
-        struct DisplayIsDebug<'a, T>(&'a T);
-        impl<'a, T: std::fmt::Display> std::fmt::Debug for DisplayIsDebug<'a, T> {
-            fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-                write!(f, "{}", self.0)
-            }
-        }
-        f.debug_struct("LocalLayerInfoForDiskUsageEviction")
-            .field("layer", &DisplayIsDebug(&self.layer))
-            .field("last_activity", &ts)
-            .finish()
-    }
-}
-
-impl LocalLayerInfoForDiskUsageEviction {
-    pub fn file_size(&self) -> u64 {
-        self.layer.layer_desc().file_size
-    }
-}
-
 impl Timeline {
    /// Returns non-remote layers for eviction.
    pub(crate) async fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo {
@@ -4447,9 +4467,10 @@ impl Timeline {
                SystemTime::now()
            });

-            resident_layers.push(LocalLayerInfoForDiskUsageEviction {
-                layer: l.drop_eviction_guard(),
+            resident_layers.push(EvictionCandidate {
+                layer: l.drop_eviction_guard().into(),
                last_activity_ts,
+                relative_last_activity: finite_f32::FiniteF32::ZERO,
            });
        }

@@ -4529,8 +4550,16 @@ impl<'a> TimelineWriter<'a> {
        self.tl.put_value(key, lsn, value, ctx).await
    }

-    pub async fn delete(&self, key_range: Range<Key>, lsn: Lsn) -> anyhow::Result<()> {
-        self.tl.put_tombstone(key_range, lsn).await
+    pub(crate) async fn put_batch(
+        &self,
+        batch: &HashMap<Key, Vec<(Lsn, Value)>>, // per key: its (Lsn, Value) entries
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
+        self.tl.put_values(batch, ctx).await // thin wrapper: forwards unchanged to `self.tl`
+    }
+
+    pub(crate) async fn delete_batch(&self, batch: &[(Range<Key>, Lsn)]) -> anyhow::Result<()> {
+        self.tl.put_tombstones(batch).await // deletes are forwarded to `self.tl` as tombstones
    }

    /// Track the end of the latest digested WAL record.
@@ -4541,11 +4570,11 @@ impl<'a> TimelineWriter<'a> {
    /// 'lsn' must be aligned. This wakes up any wait_lsn() callers waiting for
    /// the 'lsn' or anything older. The previous last record LSN is stored alongside
    /// the latest and can be read.
-    pub fn finish_write(&self, new_lsn: Lsn) {
+    pub(crate) fn finish_write(&self, new_lsn: Lsn) { // forwards `new_lsn` to `self.tl`
        self.tl.finish_write(new_lsn);
    }

-    pub fn update_current_logical_size(&self, delta: i64) {
+    pub(crate) fn update_current_logical_size(&self, delta: i64) { // signed delta, passed as-is
        self.tl.update_current_logical_size(delta)
    }
 }
@@ -4598,11 +4627,6 @@ mod tests {
            .await
            .unwrap();

-        let rtc = timeline
-            .remote_client
-            .clone()
-            .expect("just configured this");
-
        let layer = find_some_layer(&timeline).await;
        let layer = layer
            .keep_resident()
@@ -4611,8 +4635,8 @@ mod tests {
            .expect("should had been resident")
            .drop_eviction_guard();

-        let first = async { layer.evict_and_wait(&rtc).await };
-        let second = async { layer.evict_and_wait(&rtc).await };
+        let first = async { layer.evict_and_wait().await };
+        let second = async { layer.evict_and_wait().await };

        let (first, second) = tokio::join!(first, second);

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -20,6 +20,7 @@ use std::{
    time::{Duration, SystemTime},
 };

+use pageserver_api::models::{EvictionPolicy, EvictionPolicyLayerAccessThreshold};
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
@@ -29,10 +30,7 @@ use crate::{
    pgdatadir_mapping::CollectKeySpaceError,
    task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
    tenant::{
-        config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        tasks::BackgroundLoopKind,
-        timeline::EvictionError,
-        LogicalSizeCalculationCause, Tenant,
+        tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
    },
 };

@@ -215,13 +213,10 @@ impl Timeline {

        // So, we just need to deal with this.

-        let remote_client = match self.remote_client.as_ref() {
-            Some(c) => c,
-            None => {
-                error!("no remote storage configured, cannot evict layers");
-                return ControlFlow::Continue(());
-            }
-        };
+        if self.remote_client.is_none() {
+            error!("no remote storage configured, cannot evict layers");
+            return ControlFlow::Continue(());
+        }

        let mut js = tokio::task::JoinSet::new();
        {
@@ -274,9 +269,8 @@ impl Timeline {
                };
                let layer = guard.drop_eviction_guard();
                if no_activity_for > p.threshold {
-                    let remote_client = remote_client.clone();
                    // this could cause a lot of allocations in some cases
-                    js.spawn(async move { layer.evict_and_wait(&remote_client).await });
+                    js.spawn(async move { layer.evict_and_wait().await });
                    stats.candidates += 1;
                }
            }
--- a/Show More
+++ b/Show More