Add /terminate API (#6745 ) (#6853 )

this is to speed up suspends, see https://github.com/neondatabase/cloud/issues/10284 Cherry-pick to release branch to build new compute images
Merge pull request #6803 from neondatabase/releases/2024-02-19
2026-01-22 12:52:55 +00:00 · 2024-02-22 11:51:19 +02:00 · 2024-02-19 16:38:35 +04:00 · 2024-02-18 15:55:19 +00:00 · 2024-02-18 12:16:07 +00:00 · 2024-02-18 08:51:12 +00:00
110 changed files with 3965 additions and 2172 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -253,7 +253,7 @@ jobs:
          done

          if [ "${FAILED}" = "true" ]; then
-            echo >&2 "Please update vendors/revisions.json if these changes are intentional"
+            echo >&2 "Please update vendor/revisions.json if these changes are intentional"
            exit 1
          fi

--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1813,6 +1813,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "e875f1719c16de097dee81ed675e2d9bb63096823ed3f0ca827b7dea3028bbbb"
 dependencies = [
 "enumset_derive",
+ "serde",
 ]

 [[package]]
@@ -2757,6 +2758,17 @@ version = "1.3.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"

+[[package]]
+name = "leaky-bucket"
+version = "1.0.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "8eb491abd89e9794d50f93c8db610a29509123e3fbbc9c8c67a528e9391cd853"
+dependencies = [
+ "parking_lot 0.12.1",
+ "tokio",
+ "tracing",
+]
+
 [[package]]
 name = "libc"
 version = "0.2.150"
@@ -3448,6 +3460,7 @@ name = "pageserver"
 version = "0.1.0"
 dependencies = [
 "anyhow",
+ "arc-swap",
 "async-compression",
 "async-stream",
 "async-trait",
@@ -3475,6 +3488,7 @@ dependencies = [
 "humantime-serde",
 "hyper",
 "itertools",
+ "leaky-bucket",
 "md5",
 "metrics",
 "nix 0.27.1",
@@ -4436,6 +4450,7 @@ dependencies = [
 "futures",
 "futures-util",
 "http-types",
+ "humantime",
 "hyper",
 "itertools",
 "metrics",
@@ -4447,6 +4462,7 @@ dependencies = [
 "serde_json",
 "test-context",
 "tokio",
+ "tokio-stream",
 "tokio-util",
 "toml_edit",
 "tracing",
@@ -6345,6 +6361,7 @@ dependencies = [
 "hex-literal",
 "hyper",
 "jsonwebtoken",
+ "leaky-bucket",
 "metrics",
 "nix 0.27.1",
 "once_cell",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -97,6 +97,7 @@ ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "9"
 lasso = "0.7"
+leaky-bucket = "1.0.1"
 libc = "0.2"
 md5 = "0.7.0"
 memoffset = "0.8"
--- a/2
+++ b/2
@@ -47,7 +47,7 @@ COPY --chown=nonroot . .
 # Show build caching stats to check if it was used in the end.
 # Has to be the part of the same RUN since cachepot daemon is killed in the end of this RUN, losing the compilation stats.
 RUN set -e \
-    && mold -run cargo build  \
+    && RUSTFLAGS="-Clinker=clang -Clink-arg=-fuse-ld=mold -Clink-arg=-Wl,--no-rosegment" cargo build  \
      --bin pg_sni_router  \
      --bin pageserver  \
      --bin pagectl  \
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -769,6 +769,24 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
    make -j $(getconf _NPROCESSORS_ONLN) && \
    make -j $(getconf _NPROCESSORS_ONLN) install

+#########################################################################################
+#
+# Layer "pg_ivm"
+# compile pg_ivm extension
+#
+#########################################################################################
+FROM build-deps AS pg-ivm-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/sraoss/pg_ivm/archive/refs/tags/v1.7.tar.gz -O pg_ivm.tar.gz && \
+    echo "ebfde04f99203c7be4b0e873f91104090e2e83e5429c32ac242d00f334224d5e pg_ivm.tar.gz" | sha256sum --check && \
+    mkdir pg_ivm-src && cd pg_ivm-src && tar xvzf ../pg_ivm.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pg_ivm.control
+
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -810,6 +828,7 @@ COPY --from=pg-semver-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY --from=pg-anon-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pg-ivm-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -820,6 +839,10 @@ RUN make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_utils \
        -s install && \
+    make -j $(getconf _NPROCESSORS_ONLN) \
+        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
+        -C pgxn/neon_test_utils \
+        -s install && \
    make -j $(getconf _NPROCESSORS_ONLN) \
        PG_CONFIG=/usr/local/pgsql/bin/pg_config \
        -C pgxn/neon_rmgr \
--- a/16
+++ b/16
@@ -159,8 +159,8 @@ neon-pg-ext-%: postgres-%
 		-C $(POSTGRES_INSTALL_DIR)/build/neon-utils-$* \
 		-f $(ROOT_PROJECT_DIR)/pgxn/neon_utils/Makefile install

-.PHONY: neon-pg-ext-clean-%
-neon-pg-ext-clean-%:
+.PHONY: neon-pg-clean-ext-%
+neon-pg-clean-ext-%:
 	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
 	-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
 	-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
@@ -216,11 +216,11 @@ neon-pg-ext: \
 	neon-pg-ext-v15 \
 	neon-pg-ext-v16

-.PHONY: neon-pg-ext-clean
-neon-pg-ext-clean: \
-	neon-pg-ext-clean-v14 \
-	neon-pg-ext-clean-v15 \
-	neon-pg-ext-clean-v16
+.PHONY: neon-pg-clean-ext
+neon-pg-clean-ext: \
+	neon-pg-clean-ext-v14 \
+	neon-pg-clean-ext-v15 \
+	neon-pg-clean-ext-v16

 # shorthand to build all Postgres versions
 .PHONY: postgres
@@ -249,7 +249,7 @@ postgres-check: \

 # This doesn't remove the effects of 'configure'.
 .PHONY: clean
-clean: postgres-clean neon-pg-ext-clean
+clean: postgres-clean neon-pg-clean-ext
 	$(CARGO_CMD_PREFIX) cargo clean

 # This removes everything
--- a/README.md
+++ b/README.md
@@ -249,6 +249,16 @@ testing locally, it is convenient to run just one set of permutations, like this
 DEFAULT_PG_VERSION=15 BUILD_TYPE=release ./scripts/pytest
 ```

+## Flamegraphs
+
+You may find yourself in need of flamegraphs for software in this repository.
+You can use [`flamegraph-rs`](https://github.com/flamegraph-rs/flamegraph) or the original [`flamegraph.pl`](https://github.com/brendangregg/FlameGraph). Your choice!
+
+>[!IMPORTANT]
+> If you're using `lld` or `mold`, you need the `--no-rosegment` linker argument.
+> It's a [general thing with Rust / lld / mold](https://crbug.com/919499#c16), not specific to this repository.
+> See [this PR for further instructions](https://github.com/neondatabase/neon/pull/6764).
+
 ## Documentation

 [docs](/docs) Contains a top-level overview of all available markdown documentation.
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -45,7 +45,6 @@ use std::{thread, time::Duration};
 use anyhow::{Context, Result};
 use chrono::Utc;
 use clap::Arg;
-use nix::sys::signal::{kill, Signal};
 use signal_hook::consts::{SIGQUIT, SIGTERM};
 use signal_hook::{consts::SIGINT, iterator::Signals};
 use tracing::{error, info};
@@ -53,7 +52,9 @@ use url::Url;

 use compute_api::responses::ComputeStatus;

-use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec, PG_PID, SYNC_SAFEKEEPERS_PID};
+use compute_tools::compute::{
+    forward_termination_signal, ComputeNode, ComputeState, ParsedSpec, PG_PID,
+};
 use compute_tools::configurator::launch_configurator;
 use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
@@ -394,6 +395,15 @@ fn main() -> Result<()> {
        info!("synced safekeepers at lsn {lsn}");
    }

+    let mut state = compute.state.lock().unwrap();
+    if state.status == ComputeStatus::TerminationPending {
+        state.status = ComputeStatus::Terminated;
+        compute.state_changed.notify_all();
+        // we were asked to terminate gracefully, don't exit to avoid restart
+        delay_exit = true
+    }
+    drop(state);
+
    if let Err(err) = compute.check_for_core_dumps() {
        error!("error while checking for core dumps: {err:?}");
    }
@@ -523,16 +533,7 @@ fn cli() -> clap::Command {
 /// wait for termination which would be easy then.
 fn handle_exit_signal(sig: i32) {
    info!("received {sig} termination signal");
-    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
-    if ss_pid != 0 {
-        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
-        kill(ss_pid, Signal::SIGTERM).ok();
-    }
-    let pg_pid = PG_PID.load(Ordering::SeqCst);
-    if pg_pid != 0 {
-        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
-        kill(pg_pid, Signal::SIGTERM).ok();
-    }
+    forward_termination_signal();
    exit(1);
 }

--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -2,7 +2,7 @@ use std::collections::HashMap;
 use std::env;
 use std::fs;
 use std::io::BufRead;
-use std::os::unix::fs::PermissionsExt;
+use std::os::unix::fs::{symlink, PermissionsExt};
 use std::path::Path;
 use std::process::{Command, Stdio};
 use std::str::FromStr;
@@ -28,6 +28,8 @@ use compute_api::responses::{ComputeMetrics, ComputeStatus};
 use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

+use nix::sys::signal::{kill, Signal};
+
 use remote_storage::{DownloadError, RemotePath};

 use crate::checker::create_availability_check_data;
@@ -324,7 +326,8 @@ impl ComputeNode {
        let spec = compute_state.pspec.as_ref().expect("spec must be set");
        let start_time = Instant::now();

-        let mut config = postgres::Config::from_str(&spec.pageserver_connstr)?;
+        let shard0_connstr = spec.pageserver_connstr.split(',').next().unwrap();
+        let mut config = postgres::Config::from_str(shard0_connstr)?;

        // Use the storage auth token from the config file, if given.
        // Note: this overrides any password set in the connection string.
@@ -634,6 +637,48 @@ impl ComputeNode {
        // Update pg_hba.conf received with basebackup.
        update_pg_hba(pgdata_path)?;

+        // Place pg_dynshmem under /dev/shm. This allows us to use
+        // 'dynamic_shared_memory_type = mmap' so that the files are placed in
+        // /dev/shm, similar to how 'dynamic_shared_memory_type = posix' works.
+        //
+        // Why on earth don't we just stick to the 'posix' default, you might
+        // ask.  It turns out that making large allocations with 'posix' doesn't
+        // work very well with autoscaling. The behavior we want is that:
+        //
+        // 1. You can make large DSM allocations, larger than the current RAM
+        //    size of the VM, without errors
+        //
+        // 2. If the allocated memory is really used, the VM is scaled up
+        //    automatically to accommodate that
+        //
+        // We try to make that possible by having swap in the VM. But with the
+        // default 'posix' DSM implementation, we fail step 1, even when there's
+        // plenty of swap available. PostgreSQL uses posix_fallocate() to create
+        // the shmem segment, which is really just a file in /dev/shm in Linux,
+        // but posix_fallocate() on tmpfs returns ENOMEM if the size is larger
+        // than available RAM.
+        //
+        // Using 'dynamic_shared_memory_type = mmap' works around that, because
+        // the Postgres 'mmap' DSM implementation doesn't use
+        // posix_fallocate(). Instead, it uses repeated calls to write(2) to
+        // fill the file with zeros. It's weird that that differs between
+        // 'posix' and 'mmap', but we take advantage of it. When the file is
+        // filled slowly with write(2), the kernel allows it to grow larger, as
+        // long as there's swap available.
+        //
+        // In short, using 'dynamic_shared_memory_type = mmap' allows us one DSM
+        // segment to be larger than currently available RAM. But because we
+        // don't want to store it on a real file, which the kernel would try to
+        // flush to disk, so symlink pg_dynshm to /dev/shm.
+        //
+        // We don't set 'dynamic_shared_memory_type = mmap' here, we let the
+        // control plane control that option. If 'mmap' is not used, this
+        // symlink doesn't affect anything.
+        //
+        // See https://github.com/neondatabase/autoscaling/issues/800
+        std::fs::remove_dir(pgdata_path.join("pg_dynshmem"))?;
+        symlink("/dev/shm/", pgdata_path.join("pg_dynshmem"))?;
+
        match spec.mode {
            ComputeMode::Primary => {}
            ComputeMode::Replica | ComputeMode::Static(..) => {
@@ -1279,3 +1324,17 @@ LIMIT 100",
        Ok(remote_ext_metrics)
    }
 }
+
+pub fn forward_termination_signal() {
+    let ss_pid = SYNC_SAFEKEEPERS_PID.load(Ordering::SeqCst);
+    if ss_pid != 0 {
+        let ss_pid = nix::unistd::Pid::from_raw(ss_pid as i32);
+        kill(ss_pid, Signal::SIGTERM).ok();
+    }
+    let pg_pid = PG_PID.load(Ordering::SeqCst);
+    if pg_pid != 0 {
+        let pg_pid = nix::unistd::Pid::from_raw(pg_pid as i32);
+        // use 'immediate' shutdown (SIGQUIT): https://www.postgresql.org/docs/current/server-shutdown.html
+        kill(pg_pid, Signal::SIGQUIT).ok();
+    }
+}
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -51,6 +51,9 @@ pub fn write_postgres_conf(
    if let Some(s) = &spec.pageserver_connstring {
        writeln!(file, "neon.pageserver_connstring={}", escape_conf_value(s))?;
    }
+    if let Some(stripe_size) = spec.shard_stripe_size {
+        writeln!(file, "neon.stripe_size={stripe_size}")?;
+    }
    if !spec.safekeeper_connstrings.is_empty() {
        writeln!(
            file,
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -5,6 +5,7 @@ use std::net::SocketAddr;
 use std::sync::Arc;
 use std::thread;

+use crate::compute::forward_termination_signal;
 use crate::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_api::requests::ConfigurationRequest;
 use compute_api::responses::{ComputeStatus, ComputeStatusResponse, GenericAPIError};
@@ -123,6 +124,17 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
            }
        }

+        (&Method::POST, "/terminate") => {
+            info!("serving /terminate POST request");
+            match handle_terminate_request(compute).await {
+                Ok(()) => Response::new(Body::empty()),
+                Err((msg, code)) => {
+                    error!("error handling /terminate request: {msg}");
+                    render_json_error(&msg, code)
+                }
+            }
+        }
+
        // download extension files from remote extension storage on demand
        (&Method::POST, route) if route.starts_with("/extension_server/") => {
            info!("serving {:?} POST request", route);
@@ -297,6 +309,49 @@ fn render_json_error(e: &str, status: StatusCode) -> Response<Body> {
        .unwrap()
 }

+async fn handle_terminate_request(compute: &Arc<ComputeNode>) -> Result<(), (String, StatusCode)> {
+    {
+        let mut state = compute.state.lock().unwrap();
+        if state.status == ComputeStatus::Terminated {
+            return Ok(());
+        }
+        if state.status != ComputeStatus::Empty && state.status != ComputeStatus::Running {
+            let msg = format!(
+                "invalid compute status for termination request: {:?}",
+                state.status.clone()
+            );
+            return Err((msg, StatusCode::PRECONDITION_FAILED));
+        }
+        state.status = ComputeStatus::TerminationPending;
+        compute.state_changed.notify_all();
+        drop(state);
+    }
+    forward_termination_signal();
+    info!("sent signal and notified waiters");
+
+    // Spawn a blocking thread to wait for compute to become Terminated.
+    // This is needed to do not block the main pool of workers and
+    // be able to serve other requests while some particular request
+    // is waiting for compute to finish configuration.
+    let c = compute.clone();
+    task::spawn_blocking(move || {
+        let mut state = c.state.lock().unwrap();
+        while state.status != ComputeStatus::Terminated {
+            state = c.state_changed.wait(state).unwrap();
+            info!(
+                "waiting for compute to become Terminated, current status: {:?}",
+                state.status
+            );
+        }
+
+        Ok(())
+    })
+    .await
+    .unwrap()?;
+    info!("terminated Postgres");
+    Ok(())
+}
+
 // Main Hyper HTTP server function that runs it and blocks waiting on it forever.
 #[tokio::main]
 async fn serve(port: u16, state: Arc<ComputeNode>) {
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -168,6 +168,29 @@ paths:
              schema:
                $ref: "#/components/schemas/GenericError"

+  /terminate:
+    post:
+      tags:
+      - Terminate
+      summary: Terminate Postgres and wait for it to exit
+      description: ""
+      operationId: terminate
+      responses:
+        200:
+          description: Result
+        412:
+          description: "wrong state"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+        500:
+          description: "Unexpected error"
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
+
 components:
  securitySchemes:
    JWT:
--- a/control_plane/attachment_service/Cargo.toml
+++ b/control_plane/attachment_service/Cargo.toml
@@ -4,6 +4,11 @@ version = "0.1.0"
 edition.workspace = true
 license.workspace = true

+[features]
+default = []
+# Enables test-only APIs and behaviors
+testing = []
+
 [dependencies]
 anyhow.workspace = true
 aws-config.workspace = true
--- a/control_plane/attachment_service/src/compute_hook.rs
+++ b/control_plane/attachment_service/src/compute_hook.rs
@@ -3,7 +3,7 @@ use std::{collections::HashMap, time::Duration};
 use control_plane::endpoint::{ComputeControlPlane, EndpointStatus};
 use control_plane::local_env::LocalEnv;
 use hyper::{Method, StatusCode};
-use pageserver_api::shard::{ShardCount, ShardIndex, ShardNumber, TenantShardId};
+use pageserver_api::shard::{ShardIndex, ShardNumber, TenantShardId};
 use postgres_connection::parse_host_port;
 use serde::{Deserialize, Serialize};
 use tokio_util::sync::CancellationToken;
@@ -77,7 +77,7 @@ impl ComputeHookTenant {
        self.shards
            .sort_by_key(|(shard, _node_id)| shard.shard_number);

-        if self.shards.len() == shard_count.0 as usize || shard_count == ShardCount(0) {
+        if self.shards.len() == shard_count.count() as usize || shard_count.is_unsharded() {
            // We have pageservers for all the shards: emit a configuration update
            return Some(ComputeHookNotifyRequest {
                tenant_id,
@@ -94,7 +94,7 @@ impl ComputeHookTenant {
            tracing::info!(
                "ComputeHookTenant::maybe_reconfigure: not enough shards ({}/{})",
                self.shards.len(),
-                shard_count.0
+                shard_count.count()
            );
        }

@@ -155,7 +155,7 @@ impl ComputeHook {

        for (endpoint_name, endpoint) in &cplane.endpoints {
            if endpoint.tenant_id == tenant_id && endpoint.status() == EndpointStatus::Running {
-                tracing::info!("🔁 Reconfiguring endpoint {}", endpoint_name,);
+                tracing::info!("Reconfiguring endpoint {}", endpoint_name,);
                endpoint.reconfigure(compute_pageservers.clone()).await?;
            }
        }
@@ -177,7 +177,7 @@ impl ComputeHook {
            req
        };

-        tracing::debug!(
+        tracing::info!(
            "Sending notify request to {} ({:?})",
            url,
            reconfigure_request
@@ -266,7 +266,7 @@ impl ComputeHook {
    /// periods, but we don't retry forever.  The **caller** is responsible for handling failures and
    /// ensuring that they eventually call again to ensure that the compute is eventually notified of
    /// the proper pageserver nodes for a tenant.
-    #[tracing::instrument(skip_all, fields(tenant_shard_id, node_id))]
+    #[tracing::instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), node_id))]
    pub(super) async fn notify(
        &self,
        tenant_shard_id: TenantShardId,
@@ -298,7 +298,7 @@ impl ComputeHook {
        let Some(reconfigure_request) = reconfigure_request else {
            // The tenant doesn't yet have pageservers for all its shards: we won't notify anything
            // until it does.
-            tracing::debug!("Tenant isn't yet ready to emit a notification",);
+            tracing::info!("Tenant isn't yet ready to emit a notification");
            return Ok(());
        };

--- a/control_plane/attachment_service/src/lib.rs
+++ b/control_plane/attachment_service/src/lib.rs
@@ -37,6 +37,12 @@ impl std::fmt::Display for Sequence {
    }
 }

+impl std::fmt::Debug for Sequence {
+    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(f, "{}", self.0)
+    }
+}
+
 impl MonotonicCounter<Sequence> for Sequence {
    fn cnt_advance(&mut self, v: Sequence) {
        assert!(*self <= v);
--- a/control_plane/attachment_service/src/main.rs
+++ b/control_plane/attachment_service/src/main.rs
@@ -15,6 +15,7 @@ use diesel::Connection;
 use metrics::launch_timestamp::LaunchTimestamp;
 use std::sync::Arc;
 use tokio::signal::unix::SignalKind;
+use tokio_util::sync::CancellationToken;
 use utils::auth::{JwtAuth, SwappableJwtAuth};
 use utils::logging::{self, LogFormat};

@@ -237,15 +238,23 @@ async fn async_main() -> anyhow::Result<()> {
    let auth = secrets
        .public_key
        .map(|jwt_auth| Arc::new(SwappableJwtAuth::new(jwt_auth)));
-    let router = make_router(service, auth)
+    let router = make_router(service.clone(), auth)
        .build()
        .map_err(|err| anyhow!(err))?;
    let router_service = utils::http::RouterService::new(router).unwrap();
-    let server = hyper::Server::from_tcp(http_listener)?.serve(router_service);

+    // Start HTTP server
+    let server_shutdown = CancellationToken::new();
+    let server = hyper::Server::from_tcp(http_listener)?
+        .serve(router_service)
+        .with_graceful_shutdown({
+            let server_shutdown = server_shutdown.clone();
+            async move {
+                server_shutdown.cancelled().await;
+            }
+        });
    tracing::info!("Serving on {0}", args.listen);
-
-    tokio::task::spawn(server);
+    let server_task = tokio::task::spawn(server);

    // Wait until we receive a signal
    let mut sigint = tokio::signal::unix::signal(SignalKind::interrupt())?;
@@ -266,5 +275,16 @@ async fn async_main() -> anyhow::Result<()> {
        }
    }

+    // Stop HTTP server first, so that we don't have to service requests
+    // while shutting down Service
+    server_shutdown.cancel();
+    if let Err(e) = server_task.await {
+        tracing::error!("Error joining HTTP server task: {e}")
+    }
+    tracing::info!("Joined HTTP server task");
+
+    service.shutdown().await;
+    tracing::info!("Service shutdown complete");
+
    std::process::exit(0);
 }
--- a/control_plane/attachment_service/src/persistence.rs
+++ b/control_plane/attachment_service/src/persistence.rs
@@ -222,7 +222,7 @@ impl Persistence {
            let tenant_shard_id = TenantShardId {
                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
+                shard_count: ShardCount::new(tsp.shard_count as u8),
            };

            tenants_map.insert(tenant_shard_id, tsp);
@@ -318,7 +318,7 @@ impl Persistence {
                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())
                    .map_err(|e| DatabaseError::Logical(format!("Malformed tenant id: {e}")))?,
                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
+                shard_count: ShardCount::new(tsp.shard_count as u8),
            };
            result.insert(tenant_shard_id, Generation::new(tsp.generation as u32));
        }
@@ -340,7 +340,7 @@ impl Persistence {
                let updated = diesel::update(tenant_shards)
                    .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                    .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                    .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                    .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
                    .set((
                        generation.eq(generation + 1),
                        generation_pageserver.eq(node_id.0 as i64),
@@ -362,7 +362,7 @@ impl Persistence {
            let updated = diesel::update(tenant_shards)
                .filter(tenant_id.eq(tenant_shard_id.tenant_id.to_string()))
                .filter(shard_number.eq(tenant_shard_id.shard_number.0 as i32))
-                .filter(shard_count.eq(tenant_shard_id.shard_count.0 as i32))
+                .filter(shard_count.eq(tenant_shard_id.shard_count.literal() as i32))
                .set((
                    generation_pageserver.eq(i64::MAX),
                    placement_policy.eq(serde_json::to_string(&PlacementPolicy::Detached).unwrap()),
@@ -381,7 +381,6 @@ impl Persistence {
    //
    // We create the child shards here, so that they will be available for increment_generation calls
    // if some pageserver holding a child shard needs to restart before the overall tenant split is complete.
-    #[allow(dead_code)]
    pub(crate) async fn begin_shard_split(
        &self,
        old_shard_count: ShardCount,
@@ -393,21 +392,19 @@ impl Persistence {
            conn.transaction(|conn| -> DatabaseResult<()> {
                // Mark parent shards as splitting

-                let expect_parent_records = std::cmp::max(1, old_shard_count.0);
-
                let updated = diesel::update(tenant_shards)
                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.0 as i32))
+                    .filter(shard_count.eq(old_shard_count.literal() as i32))
                    .set((splitting.eq(1),))
                    .execute(conn)?;
                if u8::try_from(updated)
                    .map_err(|_| DatabaseError::Logical(
                        format!("Overflow existing shard count {} while splitting", updated))
-                    )? != expect_parent_records {
+                    )? != old_shard_count.count() {
                    // Perhaps a deletion or another split raced with this attempt to split, mutating
                    // the parent shards that we intend to split. In this case the split request should fail.
                    return Err(DatabaseError::Logical(
-                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {expect_parent_records})")
+                        format!("Unexpected existing shard count {updated} when preparing tenant for split (expected {})", old_shard_count.count())
                    ));
                }

@@ -419,7 +416,7 @@ impl Persistence {
                    let mut parent = crate::schema::tenant_shards::table
                        .filter(tenant_id.eq(parent_shard_id.tenant_id.to_string()))
                        .filter(shard_number.eq(parent_shard_id.shard_number.0 as i32))
-                        .filter(shard_count.eq(parent_shard_id.shard_count.0 as i32))
+                        .filter(shard_count.eq(parent_shard_id.shard_count.literal() as i32))
                        .load::<TenantShardPersistence>(conn)?;
                    let parent = if parent.len() != 1 {
                        return Err(DatabaseError::Logical(format!(
@@ -449,7 +446,6 @@ impl Persistence {

    // When we finish shard splitting, we must atomically clean up the old shards
    // and insert the new shards, and clear the splitting marker.
-    #[allow(dead_code)]
    pub(crate) async fn complete_shard_split(
        &self,
        split_tenant_id: TenantId,
@@ -461,7 +457,7 @@ impl Persistence {
                // Drop parent shards
                diesel::delete(tenant_shards)
                    .filter(tenant_id.eq(split_tenant_id.to_string()))
-                    .filter(shard_count.eq(old_shard_count.0 as i32))
+                    .filter(shard_count.eq(old_shard_count.literal() as i32))
                    .execute(conn)?;

                // Clear sharding flag
--- a/control_plane/attachment_service/src/reconciler.rs
+++ b/control_plane/attachment_service/src/reconciler.rs
@@ -13,6 +13,7 @@ use tokio_util::sync::CancellationToken;
 use utils::generation::Generation;
 use utils::id::{NodeId, TimelineId};
 use utils::lsn::Lsn;
+use utils::sync::gate::GateGuard;

 use crate::compute_hook::{ComputeHook, NotifyError};
 use crate::node::Node;
@@ -53,6 +54,10 @@ pub(super) struct Reconciler {
    /// the tenant is changed.
    pub(crate) cancel: CancellationToken,

+    /// Reconcilers are registered with a Gate so that during a graceful shutdown we
+    /// can wait for all the reconcilers to respond to their cancellation tokens.
+    pub(crate) _gate_guard: GateGuard,
+
    /// Access to persistent storage for updating generation numbers
    pub(crate) persistence: Arc<Persistence>,
 }
@@ -263,7 +268,7 @@ impl Reconciler {
                secondary_conf,
                tenant_conf: config.clone(),
                shard_number: shard.number.0,
-                shard_count: shard.count.0,
+                shard_count: shard.count.literal(),
                shard_stripe_size: shard.stripe_size.0,
            }
        }
@@ -458,7 +463,7 @@ impl Reconciler {
                    generation: None,
                    secondary_conf: None,
                    shard_number: self.shard.number.0,
-                    shard_count: self.shard.count.0,
+                    shard_count: self.shard.count.literal(),
                    shard_stripe_size: self.shard.stripe_size.0,
                    tenant_conf: self.config.clone(),
                },
@@ -506,7 +511,7 @@ pub(crate) fn attached_location_conf(
        generation: generation.into(),
        secondary_conf: None,
        shard_number: shard.number.0,
-        shard_count: shard.count.0,
+        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
        tenant_conf: config.clone(),
    }
@@ -521,7 +526,7 @@ pub(crate) fn secondary_location_conf(
        generation: None,
        secondary_conf: Some(LocationConfigSecondary { warm: true }),
        shard_number: shard.number.0,
-        shard_count: shard.count.0,
+        shard_count: shard.count.literal(),
        shard_stripe_size: shard.stripe_size.0,
        tenant_conf: config.clone(),
    }
--- a/control_plane/attachment_service/src/scheduler.rs
+++ b/control_plane/attachment_service/src/scheduler.rs
@@ -77,12 +77,11 @@ impl Scheduler {
            return Err(ScheduleError::ImpossibleConstraint);
        }

-        for (node_id, count) in &tenant_counts {
-            tracing::info!("tenant_counts[{node_id}]={count}");
-        }
-
        let node_id = tenant_counts.first().unwrap().0;
-        tracing::info!("scheduler selected node {node_id}");
+        tracing::info!(
+            "scheduler selected node {node_id} (elegible nodes {:?}, exclude: {hard_exclude:?})",
+            tenant_counts.iter().map(|i| i.0 .0).collect::<Vec<_>>()
+        );
        *self.tenant_counts.get_mut(&node_id).unwrap() += 1;
        Ok(node_id)
    }
--- a/control_plane/attachment_service/src/service.rs
+++ b/control_plane/attachment_service/src/service.rs
@@ -30,6 +30,7 @@ use pageserver_api::{
 };
 use pageserver_client::mgmt_api;
 use tokio_util::sync::CancellationToken;
+use tracing::instrument;
 use utils::{
    backoff,
    completion::Barrier,
@@ -37,6 +38,7 @@ use utils::{
    http::error::ApiError,
    id::{NodeId, TenantId, TimelineId},
    seqwait::SeqWait,
+    sync::gate::Gate,
 };

 use crate::{
@@ -124,6 +126,12 @@ pub struct Service {
    config: Config,
    persistence: Arc<Persistence>,

+    // Process shutdown will fire this token
+    cancel: CancellationToken,
+
+    // Background tasks will hold this gate
+    gate: Gate,
+
    /// This waits for initial reconciliation with pageservers to complete.  Until this barrier
    /// passes, it isn't safe to do any actions that mutate tenants.
    pub(crate) startup_complete: Barrier,
@@ -144,8 +152,9 @@ impl Service {
        &self.config
    }

-    /// TODO: don't allow other API calls until this is done, don't start doing any background housekeeping
-    /// until this is done.
+    /// Called once on startup, this function attempts to contact all pageservers to build an up-to-date
+    /// view of the world, and determine which pageservers are responsive.
+    #[instrument(skip_all)]
    async fn startup_reconcile(&self) {
        // For all tenant shards, a vector of observed states on nodes (where None means
        // indeterminate, same as in [`ObservedStateLocation`])
@@ -153,9 +162,6 @@ impl Service {

        let mut nodes_online = HashSet::new();

-        // TODO: give Service a cancellation token for clean shutdown
-        let cancel = CancellationToken::new();
-
        // TODO: issue these requests concurrently
        {
            let nodes = {
@@ -190,7 +196,7 @@ impl Service {
                    1,
                    5,
                    "Location config listing",
-                    &cancel,
+                    &self.cancel,
                )
                .await;
                let Some(list_response) = list_response else {
@@ -292,7 +298,7 @@ impl Service {
                        generation: None,
                        secondary_conf: None,
                        shard_number: tenant_shard_id.shard_number.0,
-                        shard_count: tenant_shard_id.shard_count.0,
+                        shard_count: tenant_shard_id.shard_count.literal(),
                        shard_stripe_size: 0,
                        tenant_conf: models::TenantConfig::default(),
                    },
@@ -331,7 +337,7 @@ impl Service {
        let stream = futures::stream::iter(compute_notifications.into_iter())
            .map(|(tenant_shard_id, node_id)| {
                let compute_hook = compute_hook.clone();
-                let cancel = cancel.clone();
+                let cancel = self.cancel.clone();
                async move {
                    if let Err(e) = compute_hook.notify(tenant_shard_id, node_id, &cancel).await {
                        tracing::error!(
@@ -368,8 +374,98 @@ impl Service {
        tracing::info!("Startup complete, spawned {reconcile_tasks} reconciliation tasks ({shard_count} shards total)");
    }

+    /// Long running background task that periodically wakes up and looks for shards that need
+    /// reconciliation.  Reconciliation is fallible, so any reconciliation tasks that fail during
+    /// e.g. a tenant create/attach/migrate must eventually be retried: this task is responsible
+    /// for those retries.
+    #[instrument(skip_all)]
+    async fn background_reconcile(&self) {
+        self.startup_complete.clone().wait().await;
+
+        const BACKGROUND_RECONCILE_PERIOD: Duration = Duration::from_secs(20);
+
+        let mut interval = tokio::time::interval(BACKGROUND_RECONCILE_PERIOD);
+        while !self.cancel.is_cancelled() {
+            tokio::select! {
+              _ = interval.tick() => { self.reconcile_all(); }
+              _ = self.cancel.cancelled() => return
+            }
+        }
+    }
+
+    #[instrument(skip_all)]
+    async fn process_results(
+        &self,
+        mut result_rx: tokio::sync::mpsc::UnboundedReceiver<ReconcileResult>,
+    ) {
+        loop {
+            // Wait for the next result, or for cancellation
+            let result = tokio::select! {
+                r = result_rx.recv() => {
+                    match r {
+                        Some(result) => {result},
+                        None => {break;}
+                    }
+                }
+                _ = self.cancel.cancelled() => {
+                    break;
+                }
+            };
+
+            tracing::info!(
+                "Reconcile result for sequence {}, ok={}",
+                result.sequence,
+                result.result.is_ok()
+            );
+            let mut locked = self.inner.write().unwrap();
+            let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
+                // A reconciliation result might race with removing a tenant: drop results for
+                // tenants that aren't in our map.
+                continue;
+            };
+
+            // Usually generation should only be updated via this path, so the max() isn't
+            // needed, but it is used to handle out-of-band updates via. e.g. test hook.
+            tenant.generation = std::cmp::max(tenant.generation, result.generation);
+
+            // If the reconciler signals that it failed to notify compute, set this state on
+            // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
+            tenant.pending_compute_notification = result.pending_compute_notification;
+
+            match result.result {
+                Ok(()) => {
+                    for (node_id, loc) in &result.observed.locations {
+                        if let Some(conf) = &loc.conf {
+                            tracing::info!("Updating observed location {}: {:?}", node_id, conf);
+                        } else {
+                            tracing::info!("Setting observed location {} to None", node_id,)
+                        }
+                    }
+                    tenant.observed = result.observed;
+                    tenant.waiter.advance(result.sequence);
+                }
+                Err(e) => {
+                    tracing::warn!(
+                        "Reconcile error on tenant {}: {}",
+                        tenant.tenant_shard_id,
+                        e
+                    );
+
+                    // Ordering: populate last_error before advancing error_seq,
+                    // so that waiters will see the correct error after waiting.
+                    *(tenant.last_error.lock().unwrap()) = format!("{e}");
+                    tenant.error_waiter.advance(result.sequence);
+
+                    for (node_id, o) in result.observed.locations {
+                        tenant.observed.locations.insert(node_id, o);
+                    }
+                }
+            }
+        }
+    }
+
    pub async fn spawn(config: Config, persistence: Arc<Persistence>) -> anyhow::Result<Arc<Self>> {
-        let (result_tx, mut result_rx) = tokio::sync::mpsc::unbounded_channel();
+        let (result_tx, result_rx) = tokio::sync::mpsc::unbounded_channel();

        tracing::info!("Loading nodes from database...");
        let nodes = persistence.list_nodes().await?;
@@ -389,14 +485,14 @@ impl Service {
            let tenant_shard_id = TenantShardId {
                tenant_id: TenantId::from_str(tsp.tenant_id.as_str())?,
                shard_number: ShardNumber(tsp.shard_number as u8),
-                shard_count: ShardCount(tsp.shard_count as u8),
+                shard_count: ShardCount::new(tsp.shard_count as u8),
            };
            let shard_identity = if tsp.shard_count == 0 {
                ShardIdentity::unsharded()
            } else {
                ShardIdentity::new(
                    ShardNumber(tsp.shard_number as u8),
-                    ShardCount(tsp.shard_count as u8),
+                    ShardCount::new(tsp.shard_count as u8),
                    ShardStripeSize(tsp.shard_stripe_size as u32),
                )?
            };
@@ -418,6 +514,7 @@ impl Service {
                observed: ObservedState::new(),
                config: serde_json::from_str(&tsp.config).unwrap(),
                reconciler: None,
+                splitting: tsp.splitting,
                waiter: Arc::new(SeqWait::new(Sequence::initial())),
                error_waiter: Arc::new(SeqWait::new(Sequence::initial())),
                last_error: Arc::default(),
@@ -439,73 +536,35 @@ impl Service {
            config,
            persistence,
            startup_complete: startup_complete.clone(),
+            cancel: CancellationToken::new(),
+            gate: Gate::default(),
        });

        let result_task_this = this.clone();
        tokio::task::spawn(async move {
-            while let Some(result) = result_rx.recv().await {
-                tracing::info!(
-                    "Reconcile result for sequence {}, ok={}",
-                    result.sequence,
-                    result.result.is_ok()
-                );
-                let mut locked = result_task_this.inner.write().unwrap();
-                let Some(tenant) = locked.tenants.get_mut(&result.tenant_shard_id) else {
-                    // A reconciliation result might race with removing a tenant: drop results for
-                    // tenants that aren't in our map.
-                    continue;
-                };
-
-                // Usually generation should only be updated via this path, so the max() isn't
-                // needed, but it is used to handle out-of-band updates via. e.g. test hook.
-                tenant.generation = std::cmp::max(tenant.generation, result.generation);
-
-                // If the reconciler signals that it failed to notify compute, set this state on
-                // the shard so that a future [`TenantState::maybe_reconcile`] will try again.
-                tenant.pending_compute_notification = result.pending_compute_notification;
-
-                match result.result {
-                    Ok(()) => {
-                        for (node_id, loc) in &result.observed.locations {
-                            if let Some(conf) = &loc.conf {
-                                tracing::info!(
-                                    "Updating observed location {}: {:?}",
-                                    node_id,
-                                    conf
-                                );
-                            } else {
-                                tracing::info!("Setting observed location {} to None", node_id,)
-                            }
-                        }
-                        tenant.observed = result.observed;
-                        tenant.waiter.advance(result.sequence);
-                    }
-                    Err(e) => {
-                        tracing::warn!(
-                            "Reconcile error on tenant {}: {}",
-                            tenant.tenant_shard_id,
-                            e
-                        );
-
-                        // Ordering: populate last_error before advancing error_seq,
-                        // so that waiters will see the correct error after waiting.
-                        *(tenant.last_error.lock().unwrap()) = format!("{e}");
-                        tenant.error_waiter.advance(result.sequence);
-
-                        for (node_id, o) in result.observed.locations {
-                            tenant.observed.locations.insert(node_id, o);
-                        }
-                    }
-                }
+            // Block shutdown until we're done (we must respect self.cancel)
+            if let Ok(_gate) = result_task_this.gate.enter() {
+                result_task_this.process_results(result_rx).await
            }
        });

-        let startup_reconcile_this = this.clone();
-        tokio::task::spawn(async move {
-            // Block the [`Service::startup_complete`] barrier until we're done
-            let _completion = startup_completion;
+        tokio::task::spawn({
+            let this = this.clone();
+            // We will block the [`Service::startup_complete`] barrier until [`Self::startup_reconcile`]
+            // is done.
+            let startup_completion = startup_completion.clone();
+            async move {
+                // Block shutdown until we're done (we must respect self.cancel)
+                let Ok(_gate) = this.gate.enter() else {
+                    return;
+                };

-            startup_reconcile_this.startup_reconcile().await
+                this.startup_reconcile().await;
+
+                drop(startup_completion);
+
+                this.background_reconcile().await;
+            }
        });

        Ok(this)
@@ -526,7 +585,7 @@ impl Service {
            let tsp = TenantShardPersistence {
                tenant_id: attach_req.tenant_shard_id.tenant_id.to_string(),
                shard_number: attach_req.tenant_shard_id.shard_number.0 as i32,
-                shard_count: attach_req.tenant_shard_id.shard_count.0 as i32,
+                shard_count: attach_req.tenant_shard_id.shard_count.literal() as i32,
                shard_stripe_size: 0,
                generation: 0,
                generation_pageserver: i64::MAX,
@@ -620,6 +679,28 @@ impl Service {
            attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
        );

+        // Trick the reconciler into not doing anything for this tenant: this helps
+        // tests that manually configure a tenant on the pagesrever, and then call this
+        // attach hook: they don't want background reconciliation to modify what they
+        // did to the pageserver.
+        #[cfg(feature = "testing")]
+        {
+            if let Some(node_id) = attach_req.node_id {
+                tenant_state.observed.locations = HashMap::from([(
+                    node_id,
+                    ObservedStateLocation {
+                        conf: Some(attached_location_conf(
+                            tenant_state.generation,
+                            &tenant_state.shard,
+                            &tenant_state.config,
+                        )),
+                    },
+                )]);
+            } else {
+                tenant_state.observed.locations.clear();
+            }
+        }
+
        Ok(AttachHookResponse {
            gen: attach_req
                .node_id
@@ -726,16 +807,9 @@ impl Service {
        &self,
        create_req: TenantCreateRequest,
    ) -> Result<TenantCreateResponse, ApiError> {
-        // Shard count 0 is valid: it means create a single shard (ShardCount(0) means "unsharded")
-        let literal_shard_count = if create_req.shard_parameters.is_unsharded() {
-            1
-        } else {
-            create_req.shard_parameters.count.0
-        };
-
        // This service expects to handle sharding itself: it is an error to try and directly create
        // a particular shard here.
-        let tenant_id = if create_req.new_tenant_id.shard_count > ShardCount(1) {
+        let tenant_id = if !create_req.new_tenant_id.is_unsharded() {
            return Err(ApiError::BadRequest(anyhow::anyhow!(
                "Attempted to create a specific shard, this API is for creating the whole tenant"
            )));
@@ -749,7 +823,7 @@ impl Service {
            create_req.shard_parameters.count,
        );

-        let create_ids = (0..literal_shard_count)
+        let create_ids = (0..create_req.shard_parameters.count.count())
            .map(|i| TenantShardId {
                tenant_id,
                shard_number: ShardNumber(i),
@@ -769,7 +843,7 @@ impl Service {
            .map(|tenant_shard_id| TenantShardPersistence {
                tenant_id: tenant_shard_id.tenant_id.to_string(),
                shard_number: tenant_shard_id.shard_number.0 as i32,
-                shard_count: tenant_shard_id.shard_count.0 as i32,
+                shard_count: tenant_shard_id.shard_count.literal() as i32,
                shard_stripe_size: create_req.shard_parameters.stripe_size.0 as i32,
                generation: create_req.generation.map(|g| g as i32).unwrap_or(0),
                generation_pageserver: i64::MAX,
@@ -875,6 +949,8 @@ impl Service {
                        &compute_hook,
                        &self.config,
                        &self.persistence,
+                        &self.gate,
+                        &self.cancel,
                    )
                })
                .collect::<Vec<_>>();
@@ -914,7 +990,7 @@ impl Service {
        tenant_id: TenantId,
        req: TenantLocationConfigRequest,
    ) -> Result<TenantLocationConfigResponse, ApiError> {
-        if req.tenant_id.shard_count.0 > 1 {
+        if !req.tenant_id.is_unsharded() {
            return Err(ApiError::BadRequest(anyhow::anyhow!(
                "This API is for importing single-sharded or unsharded tenants"
            )));
@@ -977,6 +1053,8 @@ impl Service {
                    &compute_hook,
                    &self.config,
                    &self.persistence,
+                    &self.gate,
+                    &self.cancel,
                );
                if let Some(waiter) = maybe_waiter {
                    waiters.push(waiter);
@@ -1066,6 +1144,8 @@ impl Service {
    }

    pub(crate) async fn tenant_delete(&self, tenant_id: TenantId) -> Result<StatusCode, ApiError> {
+        self.ensure_attached_wait(tenant_id).await?;
+
        // TODO: refactor into helper
        let targets = {
            let locked = self.inner.read().unwrap();
@@ -1087,8 +1167,6 @@ impl Service {
            targets
        };

-        // TODO: error out if the tenant is not attached anywhere.
-
        // Phase 1: delete on the pageservers
        let mut any_pending = false;
        for (tenant_shard_id, node) in targets {
@@ -1424,9 +1502,6 @@ impl Service {
        let mut policy = None;
        let mut shard_ident = None;

-        // TODO: put a cancellation token on Service for clean shutdown
-        let cancel = CancellationToken::new();
-
        // A parent shard which will be split
        struct SplitTarget {
            parent_id: TenantShardId,
@@ -1449,7 +1524,7 @@ impl Service {
            for (tenant_shard_id, shard) in
                locked.tenants.range(TenantShardId::tenant_range(tenant_id))
            {
-                match shard.shard.count.0.cmp(&split_req.new_shard_count) {
+                match shard.shard.count.count().cmp(&split_req.new_shard_count) {
                    Ordering::Equal => {
                        //  Already split this
                        children_found.push(*tenant_shard_id);
@@ -1459,7 +1534,7 @@ impl Service {
                        return Err(ApiError::BadRequest(anyhow::anyhow!(
                            "Requested count {} but already have shards at count {}",
                            split_req.new_shard_count,
-                            shard.shard.count.0
+                            shard.shard.count.count()
                        )));
                    }
                    Ordering::Less => {
@@ -1489,7 +1564,7 @@ impl Service {
                    shard_ident = Some(shard.shard);
                }

-                if tenant_shard_id.shard_count == ShardCount(split_req.new_shard_count) {
+                if tenant_shard_id.shard_count.count() == split_req.new_shard_count {
                    tracing::info!(
                        "Tenant shard {} already has shard count {}",
                        tenant_shard_id,
@@ -1515,7 +1590,7 @@ impl Service {
                targets.push(SplitTarget {
                    parent_id: *tenant_shard_id,
                    node: node.clone(),
-                    child_ids: tenant_shard_id.split(ShardCount(split_req.new_shard_count)),
+                    child_ids: tenant_shard_id.split(ShardCount::new(split_req.new_shard_count)),
                });
            }

@@ -1562,7 +1637,7 @@ impl Service {
                this_child_tsps.push(TenantShardPersistence {
                    tenant_id: child.tenant_id.to_string(),
                    shard_number: child.shard_number.0 as i32,
-                    shard_count: child.shard_count.0 as i32,
+                    shard_count: child.shard_count.literal() as i32,
                    shard_stripe_size: shard_ident.stripe_size.0 as i32,
                    // Note: this generation is a placeholder, [`Persistence::begin_shard_split`] will
                    // populate the correct generation as part of its transaction, to protect us
@@ -1598,6 +1673,18 @@ impl Service {
            }
        }

+        // Now that I have persisted the splitting state, apply it in-memory.  This is infallible, so
+        // callers may assume that if splitting is set in memory, then it was persisted, and if splitting
+        // is not set in memory, then it was not persisted.
+        {
+            let mut locked = self.inner.write().unwrap();
+            for target in &targets {
+                if let Some(parent_shard) = locked.tenants.get_mut(&target.parent_id) {
+                    parent_shard.splitting = SplitState::Splitting;
+                }
+            }
+        }
+
        // FIXME: we have now committed the shard split state to the database, so any subsequent
        // failure needs to roll it back.  We will later wrap this function in logic to roll back
        // the split if it fails.
@@ -1657,7 +1744,7 @@ impl Service {
            .complete_shard_split(tenant_id, old_shard_count)
            .await?;

-        // Replace all the shards we just split with their children
+        // Replace all the shards we just split with their children: this phase is infallible.
        let mut response = TenantShardSplitResponse {
            new_shards: Vec::new(),
        };
@@ -1705,6 +1792,10 @@ impl Service {
                    child_state.generation = generation;
                    child_state.config = config.clone();

+                    // The child's TenantState::splitting is intentionally left at the default value of Idle,
+                    // as at this point in the split process we have succeeded and this part is infallible:
+                    // we will never need to do any special recovery from this state.
+
                    child_locations.push((child, pageserver));

                    locked.tenants.insert(child, child_state);
@@ -1716,7 +1807,7 @@ impl Service {
        // Send compute notifications for all the new shards
        let mut failed_notifications = Vec::new();
        for (child_id, child_ps) in child_locations {
-            if let Err(e) = compute_hook.notify(child_id, child_ps, &cancel).await {
+            if let Err(e) = compute_hook.notify(child_id, child_ps, &self.cancel).await {
                tracing::warn!("Failed to update compute of {}->{} during split, proceeding anyway to complete split ({e})",
                        child_id, child_ps);
                failed_notifications.push(child_id);
@@ -1792,6 +1883,8 @@ impl Service {
                &compute_hook,
                &self.config,
                &self.persistence,
+                &self.gate,
+                &self.cancel,
            )
        };

@@ -1993,6 +2086,8 @@ impl Service {
                                &compute_hook,
                                &self.config,
                                &self.persistence,
+                                &self.gate,
+                                &self.cancel,
                            );
                        }
                    }
@@ -2014,6 +2109,8 @@ impl Service {
                            &compute_hook,
                            &self.config,
                            &self.persistence,
+                            &self.gate,
+                            &self.cancel,
                        );
                    }
                }
@@ -2053,6 +2150,8 @@ impl Service {
                &compute_hook,
                &self.config,
                &self.persistence,
+                &self.gate,
+                &self.cancel,
            ) {
                waiters.push(waiter);
            }
@@ -2064,6 +2163,17 @@ impl Service {
        let ensure_waiters = {
            let locked = self.inner.write().unwrap();

+            // Check if the tenant is splitting: in this case, even if it is attached,
+            // we must act as if it is not: this blocks e.g. timeline creation/deletion
+            // operations during the split.
+            for (_shard_id, shard) in locked.tenants.range(TenantShardId::tenant_range(tenant_id)) {
+                if !matches!(shard.splitting, SplitState::Idle) {
+                    return Err(ApiError::ResourceUnavailable(
+                        "Tenant shards are currently splitting".into(),
+                    ));
+                }
+            }
+
            self.ensure_attached_schedule(locked, tenant_id)
                .map_err(ApiError::InternalServerError)?
        };
@@ -2095,8 +2205,25 @@ impl Service {
                    &compute_hook,
                    &self.config,
                    &self.persistence,
+                    &self.gate,
+                    &self.cancel,
                )
            })
            .count()
    }
+
+    pub async fn shutdown(&self) {
+        // Note that this already stops processing any results from reconciles: so
+        // we do not expect that our [`TenantState`] objects will reach a neat
+        // final state.
+        self.cancel.cancel();
+
+        // The cancellation tokens in [`crate::reconciler::Reconciler`] are children
+        // of our cancellation token, so we do not need to explicitly cancel each of
+        // them.
+
+        // Background tasks and reconcilers hold gate guards: this waits for them all
+        // to complete.
+        self.gate.close().await;
+    }
 }
--- a/control_plane/attachment_service/src/tenant_state.rs
+++ b/control_plane/attachment_service/src/tenant_state.rs
@@ -7,16 +7,18 @@ use pageserver_api::{
 };
 use tokio::task::JoinHandle;
 use tokio_util::sync::CancellationToken;
+use tracing::{instrument, Instrument};
 use utils::{
    generation::Generation,
    id::NodeId,
    seqwait::{SeqWait, SeqWaitError},
+    sync::gate::Gate,
 };

 use crate::{
    compute_hook::ComputeHook,
    node::Node,
-    persistence::Persistence,
+    persistence::{split_state::SplitState, Persistence},
    reconciler::{attached_location_conf, secondary_location_conf, ReconcileError, Reconciler},
    scheduler::{ScheduleError, Scheduler},
    service, PlacementPolicy, Sequence,
@@ -58,6 +60,11 @@ pub(crate) struct TenantState {
    /// cancellation token has been fired)
    pub(crate) reconciler: Option<ReconcilerHandle>,

+    /// If a tenant is being split, then all shards with that TenantId will have a
+    /// SplitState set, this acts as a guard against other operations such as background
+    /// reconciliation, and timeline creation.
+    pub(crate) splitting: SplitState,
+
    /// Optionally wait for reconciliation to complete up to a particular
    /// sequence number.
    pub(crate) waiter: std::sync::Arc<SeqWait<Sequence, Sequence>>,
@@ -238,6 +245,7 @@ impl TenantState {
            observed: ObservedState::default(),
            config: TenantConfig::default(),
            reconciler: None,
+            splitting: SplitState::Idle,
            sequence: Sequence(1),
            waiter: Arc::new(SeqWait::new(Sequence(0))),
            error_waiter: Arc::new(SeqWait::new(Sequence(0))),
@@ -415,6 +423,8 @@ impl TenantState {
        false
    }

+    #[allow(clippy::too_many_arguments)]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))]
    pub(crate) fn maybe_reconcile(
        &mut self,
        result_tx: tokio::sync::mpsc::UnboundedSender<ReconcileResult>,
@@ -422,6 +432,8 @@ impl TenantState {
        compute_hook: &Arc<ComputeHook>,
        service_config: &service::Config,
        persistence: &Arc<Persistence>,
+        gate: &Gate,
+        cancel: &CancellationToken,
    ) -> Option<ReconcilerWaiter> {
        // If there are any ambiguous observed states, and the nodes they refer to are available,
        // we should reconcile to clean them up.
@@ -443,6 +455,14 @@ impl TenantState {
            return None;
        }

+        // If we are currently splitting, then never start a reconciler task: the splitting logic
+        // requires that shards are not interfered with while it runs. Do this check here rather than
+        // up top, so that we only log this message if we would otherwise have done a reconciliation.
+        if !matches!(self.splitting, SplitState::Idle) {
+            tracing::info!("Refusing to reconcile, splitting in progress");
+            return None;
+        }
+
        // Reconcile already in flight for the current sequence?
        if let Some(handle) = &self.reconciler {
            if handle.sequence == self.sequence {
@@ -460,7 +480,12 @@ impl TenantState {
        // doing our sequence's work.
        let old_handle = self.reconciler.take();

-        let cancel = CancellationToken::new();
+        let Ok(gate_guard) = gate.enter() else {
+            // Shutting down, don't start a reconciler
+            return None;
+        };
+
+        let reconciler_cancel = cancel.child_token();
        let mut reconciler = Reconciler {
            tenant_shard_id: self.tenant_shard_id,
            shard: self.shard,
@@ -471,59 +496,66 @@ impl TenantState {
            pageservers: pageservers.clone(),
            compute_hook: compute_hook.clone(),
            service_config: service_config.clone(),
-            cancel: cancel.clone(),
+            _gate_guard: gate_guard,
+            cancel: reconciler_cancel.clone(),
            persistence: persistence.clone(),
            compute_notify_failure: false,
        };

        let reconcile_seq = self.sequence;

-        tracing::info!("Spawning Reconciler for sequence {}", self.sequence);
+        tracing::info!(seq=%reconcile_seq, "Spawning Reconciler for sequence {}", self.sequence);
        let must_notify = self.pending_compute_notification;
-        let join_handle = tokio::task::spawn(async move {
-            // Wait for any previous reconcile task to complete before we start
-            if let Some(old_handle) = old_handle {
-                old_handle.cancel.cancel();
-                if let Err(e) = old_handle.handle.await {
-                    // We can't do much with this other than log it: the task is done, so
-                    // we may proceed with our work.
-                    tracing::error!("Unexpected join error waiting for reconcile task: {e}");
+        let reconciler_span = tracing::info_span!(parent: None, "reconciler", seq=%reconcile_seq,
+                                                        tenant_id=%reconciler.tenant_shard_id.tenant_id,
+                                                        shard_id=%reconciler.tenant_shard_id.shard_slug());
+        let join_handle = tokio::task::spawn(
+            async move {
+                // Wait for any previous reconcile task to complete before we start
+                if let Some(old_handle) = old_handle {
+                    old_handle.cancel.cancel();
+                    if let Err(e) = old_handle.handle.await {
+                        // We can't do much with this other than log it: the task is done, so
+                        // we may proceed with our work.
+                        tracing::error!("Unexpected join error waiting for reconcile task: {e}");
+                    }
                }
+
+                // Early check for cancellation before doing any work
+                // TODO: wrap all remote API operations in cancellation check
+                // as well.
+                if reconciler.cancel.is_cancelled() {
+                    return;
+                }
+
+                // Attempt to make observed state match intent state
+                let result = reconciler.reconcile().await;
+
+                // If we know we had a pending compute notification from some previous action, send a notification irrespective
+                // of whether the above reconcile() did any work
+                if result.is_ok() && must_notify {
+                    // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
+                    reconciler.compute_notify().await.ok();
+                }
+
+                result_tx
+                    .send(ReconcileResult {
+                        sequence: reconcile_seq,
+                        result,
+                        tenant_shard_id: reconciler.tenant_shard_id,
+                        generation: reconciler.generation,
+                        observed: reconciler.observed,
+                        pending_compute_notification: reconciler.compute_notify_failure,
+                    })
+                    .ok();
            }
-
-            // Early check for cancellation before doing any work
-            // TODO: wrap all remote API operations in cancellation check
-            // as well.
-            if reconciler.cancel.is_cancelled() {
-                return;
-            }
-
-            // Attempt to make observed state match intent state
-            let result = reconciler.reconcile().await;
-
-            // If we know we had a pending compute notification from some previous action, send a notification irrespective
-            // of whether the above reconcile() did any work
-            if result.is_ok() && must_notify {
-                // If this fails we will send the need to retry in [`ReconcileResult::pending_compute_notification`]
-                reconciler.compute_notify().await.ok();
-            }
-
-            result_tx
-                .send(ReconcileResult {
-                    sequence: reconcile_seq,
-                    result,
-                    tenant_shard_id: reconciler.tenant_shard_id,
-                    generation: reconciler.generation,
-                    observed: reconciler.observed,
-                    pending_compute_notification: reconciler.compute_notify_failure,
-                })
-                .ok();
-        });
+            .instrument(reconciler_span),
+        );

        self.reconciler = Some(ReconcilerHandle {
            sequence: self.sequence,
            handle: join_handle,
-            cancel,
+            cancel: reconciler_cancel,
        });

        Some(ReconcilerWaiter {
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -450,7 +450,7 @@ async fn handle_tenant(
                    new_tenant_id: TenantShardId::unsharded(tenant_id),
                    generation: None,
                    shard_parameters: ShardParameters {
-                        count: ShardCount(shard_count),
+                        count: ShardCount::new(shard_count),
                        stripe_size: shard_stripe_size
                            .map(ShardStripeSize)
                            .unwrap_or(ShardParameters::DEFAULT_STRIPE_SIZE),
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -652,7 +652,9 @@ impl Endpoint {
                        }
                        ComputeStatus::Empty
                        | ComputeStatus::ConfigurationPending
-                        | ComputeStatus::Configuration => {
+                        | ComputeStatus::Configuration
+                        | ComputeStatus::TerminationPending
+                        | ComputeStatus::Terminated => {
                            bail!("unexpected compute status: {:?}", state.status)
                        }
                    }
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -400,6 +400,11 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'lazy_slru_download' as bool")?,
+            timeline_get_throttle: settings
+                .remove("timeline_get_throttle")
+                .map(serde_json::from_str)
+                .transpose()
+                .context("parse `timeline_get_throttle` from json")?,
        };
        if !settings.is_empty() {
            bail!("Unrecognized tenant settings: {settings:?}")
@@ -505,6 +510,11 @@ impl PageServerNode {
                    .map(|x| x.parse::<bool>())
                    .transpose()
                    .context("Failed to parse 'lazy_slru_download' as bool")?,
+                timeline_get_throttle: settings
+                    .remove("timeline_get_throttle")
+                    .map(serde_json::from_str)
+                    .transpose()
+                    .context("parse `timeline_get_throttle` from json")?,
            }
        };

--- a/libs/compute_api/src/responses.rs
+++ b/libs/compute_api/src/responses.rs
@@ -52,6 +52,10 @@ pub enum ComputeStatus {
    // compute will exit soon or is waiting for
    // control-plane to terminate it.
    Failed,
+    // Termination requested
+    TerminationPending,
+    // Terminated Postgres
+    Terminated,
 }

 fn rfc3339_serialize<S>(x: &Option<DateTime<Utc>>, s: S) -> Result<S::Ok, S::Error>
--- a/libs/metrics/src/lib.rs
+++ b/libs/metrics/src/lib.rs
@@ -115,7 +115,6 @@ pub fn set_build_info_metric(revision: &str, build_tag: &str) {
 // performed by the process.
 // We know the size of the block, so we can determine the I/O bytes out of it.
 // The value might be not 100% exact, but should be fine for Prometheus metrics in this case.
-#[allow(clippy::unnecessary_cast)]
 fn update_rusage_metrics() {
    let rusage_stats = get_rusage_stats();

--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -214,14 +214,14 @@ impl ShardParameters {
    pub const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);

    pub fn is_unsharded(&self) -> bool {
-        self.count == ShardCount(0)
+        self.count.is_unsharded()
    }
 }

 impl Default for ShardParameters {
    fn default() -> Self {
        Self {
-            count: ShardCount(0),
+            count: ShardCount::new(0),
            stripe_size: Self::DEFAULT_STRIPE_SIZE,
        }
    }
@@ -283,6 +283,7 @@ pub struct TenantConfig {
    pub gc_feedback: Option<bool>,
    pub heatmap_period: Option<String>,
    pub lazy_slru_download: Option<bool>,
+    pub timeline_get_throttle: Option<ThrottleConfig>,
 }

 #[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
@@ -309,6 +310,35 @@ pub struct EvictionPolicyLayerAccessThreshold {
    pub threshold: Duration,
 }

+#[derive(Debug, Serialize, Deserialize, Clone, PartialEq, Eq)]
+pub struct ThrottleConfig {
+    pub task_kinds: Vec<String>, // TaskKind
+    pub initial: usize,
+    #[serde(with = "humantime_serde")]
+    pub refill_interval: Duration,
+    pub refill_amount: NonZeroUsize,
+    pub max: usize,
+    pub fair: bool,
+}
+
+impl ThrottleConfig {
+    pub fn disabled() -> Self {
+        Self {
+            task_kinds: vec![], // effectively disables the throttle
+            // other values don't matter with emtpy `task_kinds`.
+            initial: 0,
+            refill_interval: Duration::from_millis(1),
+            refill_amount: NonZeroUsize::new(1).unwrap(),
+            max: 1,
+            fair: true,
+        }
+    }
+    /// The requests per second allowed  by the given config.
+    pub fn steady_rps(&self) -> f64 {
+        (self.refill_amount.get() as f64) / (self.refill_interval.as_secs_f64()) / 1e3
+    }
+}
+
 /// A flattened analog of a `pagesever::tenant::LocationMode`, which
 /// lists out all possible states (and the virtual "Detached" state)
 /// in a flat form rather than using rust-style enums.
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -13,10 +13,41 @@ use utils::id::TenantId;
 pub struct ShardNumber(pub u8);

 #[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
-pub struct ShardCount(pub u8);
+pub struct ShardCount(u8);

 impl ShardCount {
    pub const MAX: Self = Self(u8::MAX);
+
+    /// The internal value of a ShardCount may be zero, which means "1 shard, but use
+    /// legacy format for TenantShardId that excludes the shard suffix", also known
+    /// as `TenantShardId::unsharded`.
+    ///
+    /// This method returns the actual number of shards, i.e. if our internal value is
+    /// zero, we return 1 (unsharded tenants have 1 shard).
+    pub fn count(&self) -> u8 {
+        if self.0 > 0 {
+            self.0
+        } else {
+            1
+        }
+    }
+
+    /// The literal internal value: this is **not** the number of shards in the
+    /// tenant, as we have a special zero value for legacy unsharded tenants.  Use
+    /// [`Self::count`] if you want to know the cardinality of shards.
+    pub fn literal(&self) -> u8 {
+        self.0
+    }
+
+    pub fn is_unsharded(&self) -> bool {
+        self.0 == 0
+    }
+
+    /// `v` may be zero, or the number of shards in the tenant.  `v` is what
+    /// [`Self::literal`] would return.
+    pub fn new(val: u8) -> Self {
+        Self(val)
+    }
 }

 impl ShardNumber {
@@ -86,7 +117,7 @@ impl TenantShardId {
    }

    pub fn is_unsharded(&self) -> bool {
-        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+        self.shard_number == ShardNumber(0) && self.shard_count.is_unsharded()
    }

    /// Convenience for dropping the tenant_id and just getting the ShardIndex: this
@@ -471,10 +502,12 @@ impl ShardIdentity {
    pub fn is_key_disposable(&self, key: &Key) -> bool {
        if key_is_shard0(key) {
            // Q: Why can't we dispose of shard0 content if we're not shard 0?
-            // A: because the WAL ingestion logic currently ingests some shard 0
-            //    content on all shards, even though it's only read on shard 0.  If we
-            //    dropped it, then subsequent WAL ingest to these keys would encounter
-            //    an error.
+            // A1: because the WAL ingestion logic currently ingests some shard 0
+            //     content on all shards, even though it's only read on shard 0.  If we
+            //     dropped it, then subsequent WAL ingest to these keys would encounter
+            //     an error.
+            // A2: because key_is_shard0 also covers relation size keys, which are written
+            //     on all shards even though they're only maintained accurately on shard 0.
            false
        } else {
            !self.is_key_local(key)
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -3,7 +3,7 @@
 #![allow(non_snake_case)]
 // bindgen creates some unsafe code with no doc comments.
 #![allow(clippy::missing_safety_doc)]
-// noted at 1.63 that in many cases there's a u32 -> u32 transmutes in bindgen code.
+// noted at 1.63 that in many cases there's u32 -> u32 transmutes in bindgen code.
 #![allow(clippy::useless_transmute)]
 // modules included with the postgres_ffi macro depend on the types of the specific version's
 // types, and trigger a too eager lint.
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -15,11 +15,13 @@ aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
 bytes.workspace = true
 camino.workspace = true
+humantime.workspace = true
 hyper = { workspace = true, features = ["stream"] }
 futures.workspace = true
 serde.workspace = true
 serde_json.workspace = true
 tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
+tokio-stream.workspace = true
 tokio-util = { workspace = true, features = ["compat"] }
 toml_edit.workspace = true
 tracing.workspace = true
--- a/libs/remote_storage/src/azure_blob.rs
+++ b/libs/remote_storage/src/azure_blob.rs
@@ -22,16 +22,15 @@ use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerCl
 use bytes::Bytes;
 use futures::stream::Stream;
 use futures_util::StreamExt;
+use futures_util::TryStreamExt;
 use http_types::{StatusCode, Url};
-use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
 use tracing::debug;

-use crate::s3_bucket::RequestKind;
-use crate::TimeTravelError;
 use crate::{
-    AzureConfig, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode, RemotePath,
-    RemoteStorage, StorageMetadata,
+    error::Cancelled, s3_bucket::RequestKind, AzureConfig, ConcurrencyLimiter, Download,
+    DownloadError, Listing, ListingMode, RemotePath, RemoteStorage, StorageMetadata,
+    TimeTravelError, TimeoutOrCancel,
 };

 pub struct AzureBlobStorage {
@@ -39,10 +38,12 @@ pub struct AzureBlobStorage {
    prefix_in_container: Option<String>,
    max_keys_per_list_response: Option<NonZeroU32>,
    concurrency_limiter: ConcurrencyLimiter,
+    // Per-request timeout. Accessible for tests.
+    pub timeout: Duration,
 }

 impl AzureBlobStorage {
-    pub fn new(azure_config: &AzureConfig) -> Result<Self> {
+    pub fn new(azure_config: &AzureConfig, timeout: Duration) -> Result<Self> {
        debug!(
            "Creating azure remote storage for azure container {}",
            azure_config.container_name
@@ -79,6 +80,7 @@ impl AzureBlobStorage {
            prefix_in_container: azure_config.prefix_in_container.to_owned(),
            max_keys_per_list_response,
            concurrency_limiter: ConcurrencyLimiter::new(azure_config.concurrency_limit.get()),
+            timeout,
        })
    }

@@ -121,8 +123,11 @@ impl AzureBlobStorage {
    async fn download_for_builder(
        &self,
        builder: GetBlobBuilder,
+        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
-        let mut response = builder.into_stream();
+        let kind = RequestKind::Get;
+
+        let _permit = self.permit(kind, cancel).await?;

        let mut etag = None;
        let mut last_modified = None;
@@ -130,39 +135,70 @@ impl AzureBlobStorage {
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563

-        let mut bufs = Vec::new();
-        while let Some(part) = response.next().await {
-            let part = part.map_err(to_download_error)?;
-            let etag_str: &str = part.blob.properties.etag.as_ref();
-            if etag.is_none() {
-                etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
+        let download = async {
+            let response = builder
+                // convert to concrete Pageable
+                .into_stream()
+                // convert to TryStream
+                .into_stream()
+                .map_err(to_download_error);
+
+            // apply per request timeout
+            let response = tokio_stream::StreamExt::timeout(response, self.timeout);
+
+            // flatten
+            let response = response.map(|res| match res {
+                Ok(res) => res,
+                Err(_elapsed) => Err(DownloadError::Timeout),
+            });
+
+            let mut response = std::pin::pin!(response);
+
+            let mut bufs = Vec::new();
+            while let Some(part) = response.next().await {
+                let part = part?;
+                let etag_str: &str = part.blob.properties.etag.as_ref();
+                if etag.is_none() {
+                    etag = Some(etag.unwrap_or_else(|| etag_str.to_owned()));
+                }
+                if last_modified.is_none() {
+                    last_modified = Some(part.blob.properties.last_modified.into());
+                }
+                if let Some(blob_meta) = part.blob.metadata {
+                    metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
+                }
+                let data = part
+                    .data
+                    .collect()
+                    .await
+                    .map_err(|e| DownloadError::Other(e.into()))?;
+                bufs.push(data);
            }
-            if last_modified.is_none() {
-                last_modified = Some(part.blob.properties.last_modified.into());
-            }
-            if let Some(blob_meta) = part.blob.metadata {
-                metadata.extend(blob_meta.iter().map(|(k, v)| (k.to_owned(), v.to_owned())));
-            }
-            let data = part
-                .data
-                .collect()
-                .await
-                .map_err(|e| DownloadError::Other(e.into()))?;
-            bufs.push(data);
+            Ok(Download {
+                download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
+                etag,
+                last_modified,
+                metadata: Some(StorageMetadata(metadata)),
+            })
+        };
+
+        tokio::select! {
+            bufs = download => bufs,
+            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
        }
-        Ok(Download {
-            download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
-            etag,
-            last_modified,
-            metadata: Some(StorageMetadata(metadata)),
-        })
    }

-    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
-        self.concurrency_limiter
-            .acquire(kind)
-            .await
-            .expect("semaphore is never closed")
+    async fn permit(
+        &self,
+        kind: RequestKind,
+        cancel: &CancellationToken,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
+        let acquire = self.concurrency_limiter.acquire(kind);
+
+        tokio::select! {
+            permit = acquire => Ok(permit.expect("never closed")),
+            _ = cancel.cancelled() => Err(Cancelled),
+        }
    }
 }

@@ -192,66 +228,87 @@ impl RemoteStorage for AzureBlobStorage {
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<Listing, DownloadError> {
-        // get the passed prefix or if it is not set use prefix_in_bucket value
-        let list_prefix = prefix
-            .map(|p| self.relative_path_to_name(p))
-            .or_else(|| self.prefix_in_container.clone())
-            .map(|mut p| {
-                // required to end with a separator
-                // otherwise request will return only the entry of a prefix
-                if matches!(mode, ListingMode::WithDelimiter)
-                    && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
-                {
-                    p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
-                }
-                p
+        let _permit = self.permit(RequestKind::List, cancel).await?;
+
+        let op = async {
+            // get the passed prefix or if it is not set use prefix_in_bucket value
+            let list_prefix = prefix
+                .map(|p| self.relative_path_to_name(p))
+                .or_else(|| self.prefix_in_container.clone())
+                .map(|mut p| {
+                    // required to end with a separator
+                    // otherwise request will return only the entry of a prefix
+                    if matches!(mode, ListingMode::WithDelimiter)
+                        && !p.ends_with(REMOTE_STORAGE_PREFIX_SEPARATOR)
+                    {
+                        p.push(REMOTE_STORAGE_PREFIX_SEPARATOR);
+                    }
+                    p
+                });
+
+            let mut builder = self.client.list_blobs();
+
+            if let ListingMode::WithDelimiter = mode {
+                builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
+            }
+
+            if let Some(prefix) = list_prefix {
+                builder = builder.prefix(Cow::from(prefix.to_owned()));
+            }
+
+            if let Some(limit) = self.max_keys_per_list_response {
+                builder = builder.max_results(MaxResults::new(limit));
+            }
+
+            let response = builder.into_stream();
+            let response = response.into_stream().map_err(to_download_error);
+            let response = tokio_stream::StreamExt::timeout(response, self.timeout);
+            let response = response.map(|res| match res {
+                Ok(res) => res,
+                Err(_elapsed) => Err(DownloadError::Timeout),
            });

-        let mut builder = self.client.list_blobs();
+            let mut response = std::pin::pin!(response);

-        if let ListingMode::WithDelimiter = mode {
-            builder = builder.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
-        }
+            let mut res = Listing::default();

-        if let Some(prefix) = list_prefix {
-            builder = builder.prefix(Cow::from(prefix.to_owned()));
-        }
+            let mut max_keys = max_keys.map(|mk| mk.get());
+            while let Some(entry) = response.next().await {
+                let entry = entry?;
+                let prefix_iter = entry
+                    .blobs
+                    .prefixes()
+                    .map(|prefix| self.name_to_relative_path(&prefix.name));
+                res.prefixes.extend(prefix_iter);

-        if let Some(limit) = self.max_keys_per_list_response {
-            builder = builder.max_results(MaxResults::new(limit));
-        }
+                let blob_iter = entry
+                    .blobs
+                    .blobs()
+                    .map(|k| self.name_to_relative_path(&k.name));

-        let mut response = builder.into_stream();
-        let mut res = Listing::default();
-        // NonZeroU32 doesn't support subtraction apparently
-        let mut max_keys = max_keys.map(|mk| mk.get());
-        while let Some(l) = response.next().await {
-            let entry = l.map_err(to_download_error)?;
-            let prefix_iter = entry
-                .blobs
-                .prefixes()
-                .map(|prefix| self.name_to_relative_path(&prefix.name));
-            res.prefixes.extend(prefix_iter);
+                for key in blob_iter {
+                    res.keys.push(key);

-            let blob_iter = entry
-                .blobs
-                .blobs()
-                .map(|k| self.name_to_relative_path(&k.name));
-
-            for key in blob_iter {
-                res.keys.push(key);
-                if let Some(mut mk) = max_keys {
-                    assert!(mk > 0);
-                    mk -= 1;
-                    if mk == 0 {
-                        return Ok(res); // limit reached
+                    if let Some(mut mk) = max_keys {
+                        assert!(mk > 0);
+                        mk -= 1;
+                        if mk == 0 {
+                            return Ok(res); // limit reached
+                        }
+                        max_keys = Some(mk);
                    }
-                    max_keys = Some(mk);
                }
            }
+
+            Ok(res)
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(DownloadError::Cancelled),
        }
-        Ok(res)
    }

    async fn upload(
@@ -260,35 +317,52 @@ impl RemoteStorage for AzureBlobStorage {
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Put).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
+        let _permit = self.permit(RequestKind::Put, cancel).await?;

-        let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
-            Box::pin(from);
+        let op = async {
+            let blob_client = self.client.blob_client(self.relative_path_to_name(to));

-        let from = NonSeekableStream::new(from, data_size_bytes);
+            let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+                Box::pin(from);

-        let body = azure_core::Body::SeekableStream(Box::new(from));
+            let from = NonSeekableStream::new(from, data_size_bytes);

-        let mut builder = blob_client.put_block_blob(body);
+            let body = azure_core::Body::SeekableStream(Box::new(from));

-        if let Some(metadata) = metadata {
-            builder = builder.metadata(to_azure_metadata(metadata));
+            let mut builder = blob_client.put_block_blob(body);
+
+            if let Some(metadata) = metadata {
+                builder = builder.metadata(to_azure_metadata(metadata));
+            }
+
+            let fut = builder.into_future();
+            let fut = tokio::time::timeout(self.timeout, fut);
+
+            match fut.await {
+                Ok(Ok(_response)) => Ok(()),
+                Ok(Err(azure)) => Err(azure.into()),
+                Err(_timeout) => Err(TimeoutOrCancel::Cancel.into()),
+            }
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
        }
-
-        let _response = builder.into_future().await?;
-
-        Ok(())
    }

-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
-        let _permit = self.permit(RequestKind::Get).await;
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

        let builder = blob_client.get();

-        self.download_for_builder(builder).await
+        self.download_for_builder(builder, cancel).await
    }

    async fn download_byte_range(
@@ -296,8 +370,8 @@ impl RemoteStorage for AzureBlobStorage {
        from: &RemotePath,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
-        let _permit = self.permit(RequestKind::Get).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(from));

        let mut builder = blob_client.get();
@@ -309,82 +383,113 @@ impl RemoteStorage for AzureBlobStorage {
        };
        builder = builder.range(range);

-        self.download_for_builder(builder).await
+        self.download_for_builder(builder, cancel).await
    }

-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Delete).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(path));
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
+        self.delete_objects(std::array::from_ref(path), cancel)
+            .await
+    }

-        let builder = blob_client.delete();
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Delete, cancel).await?;

-        match builder.into_future().await {
-            Ok(_response) => Ok(()),
-            Err(e) => {
-                if let Some(http_err) = e.as_http_error() {
-                    if http_err.status() == StatusCode::NotFound {
-                        return Ok(());
+        let op = async {
+            // TODO batch requests are also not supported by the SDK
+            // https://github.com/Azure/azure-sdk-for-rust/issues/1068
+            // https://github.com/Azure/azure-sdk-for-rust/issues/1249
+            for path in paths {
+                let blob_client = self.client.blob_client(self.relative_path_to_name(path));
+
+                let request = blob_client.delete().into_future();
+
+                let res = tokio::time::timeout(self.timeout, request).await;
+
+                match res {
+                    Ok(Ok(_response)) => continue,
+                    Ok(Err(e)) => {
+                        if let Some(http_err) = e.as_http_error() {
+                            if http_err.status() == StatusCode::NotFound {
+                                continue;
+                            }
+                        }
+                        return Err(e.into());
                    }
+                    Err(_elapsed) => return Err(TimeoutOrCancel::Timeout.into()),
                }
-                Err(anyhow::Error::new(e))
            }
+
+            Ok(())
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(TimeoutOrCancel::Cancel.into()),
        }
    }

-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        // Permit is already obtained by inner delete function
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let _permit = self.permit(RequestKind::Copy, cancel).await?;

-        // TODO batch requests are also not supported by the SDK
-        // https://github.com/Azure/azure-sdk-for-rust/issues/1068
-        // https://github.com/Azure/azure-sdk-for-rust/issues/1249
-        for path in paths {
-            self.delete(path).await?;
-        }
-        Ok(())
-    }
+        let timeout = tokio::time::sleep(self.timeout);

-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
-        let _permit = self.permit(RequestKind::Copy).await;
-        let blob_client = self.client.blob_client(self.relative_path_to_name(to));
+        let mut copy_status = None;

-        let source_url = format!(
-            "{}/{}",
-            self.client.url()?,
-            self.relative_path_to_name(from)
-        );
-        let builder = blob_client.copy(Url::from_str(&source_url)?);
+        let op = async {
+            let blob_client = self.client.blob_client(self.relative_path_to_name(to));

-        let result = builder.into_future().await?;
+            let source_url = format!(
+                "{}/{}",
+                self.client.url()?,
+                self.relative_path_to_name(from)
+            );

-        let mut copy_status = result.copy_status;
-        let start_time = Instant::now();
-        const MAX_WAIT_TIME: Duration = Duration::from_secs(60);
-        loop {
-            match copy_status {
-                CopyStatus::Aborted => {
-                    anyhow::bail!("Received abort for copy from {from} to {to}.");
+            let builder = blob_client.copy(Url::from_str(&source_url)?);
+            let copy = builder.into_future();
+
+            let result = copy.await?;
+
+            copy_status = Some(result.copy_status);
+            loop {
+                match copy_status.as_ref().expect("we always set it to Some") {
+                    CopyStatus::Aborted => {
+                        anyhow::bail!("Received abort for copy from {from} to {to}.");
+                    }
+                    CopyStatus::Failed => {
+                        anyhow::bail!("Received failure response for copy from {from} to {to}.");
+                    }
+                    CopyStatus::Success => return Ok(()),
+                    CopyStatus::Pending => (),
                }
-                CopyStatus::Failed => {
-                    anyhow::bail!("Received failure response for copy from {from} to {to}.");
-                }
-                CopyStatus::Success => return Ok(()),
-                CopyStatus::Pending => (),
+                // The copy is taking longer. Waiting a second and then re-trying.
+                // TODO estimate time based on copy_progress and adjust time based on that
+                tokio::time::sleep(Duration::from_millis(1000)).await;
+                let properties = blob_client.get_properties().into_future().await?;
+                let Some(status) = properties.blob.properties.copy_status else {
+                    tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
+                    return Ok(());
+                };
+                copy_status = Some(status);
            }
-            // The copy is taking longer. Waiting a second and then re-trying.
-            // TODO estimate time based on copy_progress and adjust time based on that
-            tokio::time::sleep(Duration::from_millis(1000)).await;
-            let properties = blob_client.get_properties().into_future().await?;
-            let Some(status) = properties.blob.properties.copy_status else {
-                tracing::warn!("copy_status for copy is None!, from={from}, to={to}");
-                return Ok(());
-            };
-            if start_time.elapsed() > MAX_WAIT_TIME {
-                anyhow::bail!("Copy from from {from} to {to} took longer than limit MAX_WAIT_TIME={}s. copy_pogress={:?}.",
-                    MAX_WAIT_TIME.as_secs_f32(),
-                    properties.blob.properties.copy_progress,
-                );
-            }
-            copy_status = status;
+        };
+
+        tokio::select! {
+            res = op => res,
+            _ = cancel.cancelled() => Err(anyhow::Error::new(TimeoutOrCancel::Cancel)),
+            _ = timeout => {
+                let e = anyhow::Error::new(TimeoutOrCancel::Timeout);
+                let e = e.context(format!("Timeout, last status: {copy_status:?}"));
+                Err(e)
+            },
        }
    }

--- a/libs/remote_storage/src/error.rs
+++ b/libs/remote_storage/src/error.rs
@@ -0,0 +1,181 @@
+/// Reasons for downloads or listings to fail.
+#[derive(Debug)]
+pub enum DownloadError {
+    /// Validation or other error happened due to user input.
+    BadInput(anyhow::Error),
+    /// The file was not found in the remote storage.
+    NotFound,
+    /// A cancellation token aborted the download, typically during
+    /// tenant detach or process shutdown.
+    Cancelled,
+    /// A timeout happened while executing the request. Possible reasons:
+    /// - stuck tcp connection
+    ///
+    /// Concurrency control is not timed within timeout.
+    Timeout,
+    /// The file was found in the remote storage, but the download failed.
+    Other(anyhow::Error),
+}
+
+impl std::fmt::Display for DownloadError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            DownloadError::BadInput(e) => {
+                write!(f, "Failed to download a remote file due to user input: {e}")
+            }
+            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
+            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
+            DownloadError::Timeout => write!(f, "timeout"),
+            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
+        }
+    }
+}
+
+impl std::error::Error for DownloadError {}
+
+impl DownloadError {
+    /// Returns true if the error should not be retried with backoff
+    pub fn is_permanent(&self) -> bool {
+        use DownloadError::*;
+        match self {
+            BadInput(_) | NotFound | Cancelled => true,
+            Timeout | Other(_) => false,
+        }
+    }
+}
+
+#[derive(Debug)]
+pub enum TimeTravelError {
+    /// Validation or other error happened due to user input.
+    BadInput(anyhow::Error),
+    /// The used remote storage does not have time travel recovery implemented
+    Unimplemented,
+    /// The number of versions/deletion markers is above our limit.
+    TooManyVersions,
+    /// A cancellation token aborted the process, typically during
+    /// request closure or process shutdown.
+    Cancelled,
+    /// Other errors
+    Other(anyhow::Error),
+}
+
+impl std::fmt::Display for TimeTravelError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        match self {
+            TimeTravelError::BadInput(e) => {
+                write!(
+                    f,
+                    "Failed to time travel recover a prefix due to user input: {e}"
+                )
+            }
+            TimeTravelError::Unimplemented => write!(
+                f,
+                "time travel recovery is not implemented for the current storage backend"
+            ),
+            TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
+            TimeTravelError::TooManyVersions => {
+                write!(f, "Number of versions/delete markers above limit")
+            }
+            TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
+        }
+    }
+}
+
+impl std::error::Error for TimeTravelError {}
+
+/// Plain cancelled error.
+///
+/// By design this type does not not implement `std::error::Error` so it cannot be put as the root
+/// cause of `std::io::Error` or `anyhow::Error`. It should never need to be exposed out of this
+/// crate.
+///
+/// It exists to implement permit acquiring in `{Download,TimeTravel}Error` and `anyhow::Error` returning
+/// operations and ensuring that those get converted to proper versions with just `?`.
+#[derive(Debug)]
+pub(crate) struct Cancelled;
+
+impl From<Cancelled> for anyhow::Error {
+    fn from(_: Cancelled) -> Self {
+        anyhow::Error::new(TimeoutOrCancel::Cancel)
+    }
+}
+
+impl From<Cancelled> for TimeTravelError {
+    fn from(_: Cancelled) -> Self {
+        TimeTravelError::Cancelled
+    }
+}
+
+impl From<Cancelled> for TimeoutOrCancel {
+    fn from(_: Cancelled) -> Self {
+        TimeoutOrCancel::Cancel
+    }
+}
+
+impl From<Cancelled> for DownloadError {
+    fn from(_: Cancelled) -> Self {
+        DownloadError::Cancelled
+    }
+}
+
+/// This type is used at as the root cause for timeouts and cancellations with `anyhow::Error` returning
+/// RemoteStorage methods.
+///
+/// For use with `utils::backoff::retry` and `anyhow::Error` returning operations there is
+/// `TimeoutOrCancel::caused_by_cancel` method to query "proper form" errors.
+#[derive(Debug)]
+pub enum TimeoutOrCancel {
+    Timeout,
+    Cancel,
+}
+
+impl std::fmt::Display for TimeoutOrCancel {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        use TimeoutOrCancel::*;
+        match self {
+            Timeout => write!(f, "timeout"),
+            Cancel => write!(f, "cancel"),
+        }
+    }
+}
+
+impl std::error::Error for TimeoutOrCancel {}
+
+impl TimeoutOrCancel {
+    pub fn caused(error: &anyhow::Error) -> Option<&Self> {
+        error.root_cause().downcast_ref()
+    }
+
+    /// Returns true if the error was caused by [`TimeoutOrCancel::Cancel`].
+    pub fn caused_by_cancel(error: &anyhow::Error) -> bool {
+        Self::caused(error).is_some_and(Self::is_cancel)
+    }
+
+    pub fn is_cancel(&self) -> bool {
+        matches!(self, TimeoutOrCancel::Cancel)
+    }
+
+    pub fn is_timeout(&self) -> bool {
+        matches!(self, TimeoutOrCancel::Timeout)
+    }
+}
+
+/// This conversion is used when [`crate::support::DownloadStream`] notices a cancellation or
+/// timeout to wrap it in an `std::io::Error`.
+impl From<TimeoutOrCancel> for std::io::Error {
+    fn from(value: TimeoutOrCancel) -> Self {
+        let e = DownloadError::from(value);
+        std::io::Error::other(e)
+    }
+}
+
+impl From<TimeoutOrCancel> for DownloadError {
+    fn from(value: TimeoutOrCancel) -> Self {
+        use TimeoutOrCancel::*;
+
+        match value {
+            Timeout => DownloadError::Timeout,
+            Cancel => DownloadError::Cancelled,
+        }
+    }
+}
--- a/libs/remote_storage/src/lib.rs
+++ b/libs/remote_storage/src/lib.rs
@@ -10,6 +10,7 @@
 #![deny(clippy::undocumented_unsafe_blocks)]

 mod azure_blob;
+mod error;
 mod local_fs;
 mod s3_bucket;
 mod simulate_failures;
@@ -21,7 +22,7 @@ use std::{
    num::{NonZeroU32, NonZeroUsize},
    pin::Pin,
    sync::Arc,
-    time::SystemTime,
+    time::{Duration, SystemTime},
 };

 use anyhow::{bail, Context};
@@ -41,6 +42,8 @@ pub use self::{
 };
 use s3_bucket::RequestKind;

+pub use error::{DownloadError, TimeTravelError, TimeoutOrCancel};
+
 /// Currently, sync happens with AWS S3, that has two limits on requests per second:
 /// ~200 RPS for IAM services
 /// <https://docs.aws.amazon.com/AmazonRDS/latest/AuroraUserGuide/UsingWithRDS.IAMDBAuth.html>
@@ -158,9 +161,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        let result = self
-            .list(prefix, ListingMode::WithDelimiter, None)
+            .list(prefix, ListingMode::WithDelimiter, None, cancel)
            .await?
            .prefixes;
        Ok(result)
@@ -182,9 +186,10 @@ pub trait RemoteStorage: Send + Sync + 'static {
        &self,
        prefix: Option<&RemotePath>,
        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        let result = self
-            .list(prefix, ListingMode::NoDelimiter, max_keys)
+            .list(prefix, ListingMode::NoDelimiter, max_keys, cancel)
            .await?
            .keys;
        Ok(result)
@@ -195,9 +200,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
        prefix: Option<&RemotePath>,
        _mode: ListingMode,
        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
    ) -> Result<Listing, DownloadError>;

    /// Streams the local file contents into remote into the remote storage entry.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`.
    async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
@@ -206,27 +215,61 @@ pub trait RemoteStorage: Send + Sync + 'static {
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<()>;

-    /// Streams the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Streams the remote storage entry contents.
+    ///
+    /// The returned download stream will obey initial timeout and cancellation signal by erroring
+    /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
+    /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
+    ///
    /// Returns the metadata, if any was stored with the file previously.
-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError>;
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError>;

-    /// Streams a given byte range of the remote storage entry contents into the buffered writer given, returns the filled writer.
+    /// Streams a given byte range of the remote storage entry contents.
+    ///
+    /// The returned download stream will obey initial timeout and cancellation signal by erroring
+    /// on whichever happens first. Only one of the reasons will fail the stream, which is usually
+    /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out.
+    ///
    /// Returns the metadata, if any was stored with the file previously.
    async fn download_byte_range(
        &self,
        from: &RemotePath,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError>;

-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()>;
+    /// Delete a single path from remote storage.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`. In such situation it is unknown if the deletion went through.
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()>;

-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()>;
+    /// Delete a multiple paths from remote storage.
+    ///
+    /// If the operation fails because of timeout or cancellation, the root cause of the error will be
+    /// set to `TimeoutOrCancel`. In such situation it is unknown which deletions, if any, went
+    /// through.
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()>;

    /// Copy a remote object inside a bucket from one path to another.
-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()>;
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()>;

    /// Resets the content of everything with the given prefix to the given state
    async fn time_travel_recover(
@@ -238,7 +281,13 @@ pub trait RemoteStorage: Send + Sync + 'static {
    ) -> Result<(), TimeTravelError>;
 }

-pub type DownloadStream = Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>;
+/// DownloadStream is sensitive to the timeout and cancellation used with the original
+/// [`RemoteStorage::download`] request. The type yields `std::io::Result<Bytes>` to be compatible
+/// with `tokio::io::copy_buf`.
+// This has 'static because safekeepers do not use cancellation tokens (yet)
+pub type DownloadStream =
+    Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>>;
+
 pub struct Download {
    pub download_stream: DownloadStream,
    /// The last time the file was modified (`last-modified` HTTP header)
@@ -257,86 +306,6 @@ impl Debug for Download {
    }
 }

-#[derive(Debug)]
-pub enum DownloadError {
-    /// Validation or other error happened due to user input.
-    BadInput(anyhow::Error),
-    /// The file was not found in the remote storage.
-    NotFound,
-    /// A cancellation token aborted the download, typically during
-    /// tenant detach or process shutdown.
-    Cancelled,
-    /// The file was found in the remote storage, but the download failed.
-    Other(anyhow::Error),
-}
-
-impl std::fmt::Display for DownloadError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            DownloadError::BadInput(e) => {
-                write!(f, "Failed to download a remote file due to user input: {e}")
-            }
-            DownloadError::Cancelled => write!(f, "Cancelled, shutting down"),
-            DownloadError::NotFound => write!(f, "No file found for the remote object id given"),
-            DownloadError::Other(e) => write!(f, "Failed to download a remote file: {e:?}"),
-        }
-    }
-}
-
-impl std::error::Error for DownloadError {}
-
-impl DownloadError {
-    /// Returns true if the error should not be retried with backoff
-    pub fn is_permanent(&self) -> bool {
-        use DownloadError::*;
-        match self {
-            BadInput(_) => true,
-            NotFound => true,
-            Cancelled => true,
-            Other(_) => false,
-        }
-    }
-}
-
-#[derive(Debug)]
-pub enum TimeTravelError {
-    /// Validation or other error happened due to user input.
-    BadInput(anyhow::Error),
-    /// The used remote storage does not have time travel recovery implemented
-    Unimplemented,
-    /// The number of versions/deletion markers is above our limit.
-    TooManyVersions,
-    /// A cancellation token aborted the process, typically during
-    /// request closure or process shutdown.
-    Cancelled,
-    /// Other errors
-    Other(anyhow::Error),
-}
-
-impl std::fmt::Display for TimeTravelError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            TimeTravelError::BadInput(e) => {
-                write!(
-                    f,
-                    "Failed to time travel recover a prefix due to user input: {e}"
-                )
-            }
-            TimeTravelError::Unimplemented => write!(
-                f,
-                "time travel recovery is not implemented for the current storage backend"
-            ),
-            TimeTravelError::Cancelled => write!(f, "Cancelled, shutting down"),
-            TimeTravelError::TooManyVersions => {
-                write!(f, "Number of versions/delete markers above limit")
-            }
-            TimeTravelError::Other(e) => write!(f, "Failed to time travel recover a prefix: {e:?}"),
-        }
-    }
-}
-
-impl std::error::Error for TimeTravelError {}
-
 /// Every storage, currently supported.
 /// Serves as a simple way to pass around the [`RemoteStorage`] without dealing with generics.
 #[derive(Clone)]
@@ -354,12 +323,13 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<Listing, DownloadError> {
        match self {
-            Self::LocalFs(s) => s.list(prefix, mode, max_keys).await,
-            Self::AwsS3(s) => s.list(prefix, mode, max_keys).await,
-            Self::AzureBlob(s) => s.list(prefix, mode, max_keys).await,
-            Self::Unreliable(s) => s.list(prefix, mode, max_keys).await,
+            Self::LocalFs(s) => s.list(prefix, mode, max_keys, cancel).await,
+            Self::AwsS3(s) => s.list(prefix, mode, max_keys, cancel).await,
+            Self::AzureBlob(s) => s.list(prefix, mode, max_keys, cancel).await,
+            Self::Unreliable(s) => s.list(prefix, mode, max_keys, cancel).await,
        }
    }

@@ -372,12 +342,13 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        &self,
        folder: Option<&RemotePath>,
        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        match self {
-            Self::LocalFs(s) => s.list_files(folder, max_keys).await,
-            Self::AwsS3(s) => s.list_files(folder, max_keys).await,
-            Self::AzureBlob(s) => s.list_files(folder, max_keys).await,
-            Self::Unreliable(s) => s.list_files(folder, max_keys).await,
+            Self::LocalFs(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::AwsS3(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::AzureBlob(s) => s.list_files(folder, max_keys, cancel).await,
+            Self::Unreliable(s) => s.list_files(folder, max_keys, cancel).await,
        }
    }

@@ -387,36 +358,43 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
    pub async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        match self {
-            Self::LocalFs(s) => s.list_prefixes(prefix).await,
-            Self::AwsS3(s) => s.list_prefixes(prefix).await,
-            Self::AzureBlob(s) => s.list_prefixes(prefix).await,
-            Self::Unreliable(s) => s.list_prefixes(prefix).await,
+            Self::LocalFs(s) => s.list_prefixes(prefix, cancel).await,
+            Self::AwsS3(s) => s.list_prefixes(prefix, cancel).await,
+            Self::AzureBlob(s) => s.list_prefixes(prefix, cancel).await,
+            Self::Unreliable(s) => s.list_prefixes(prefix, cancel).await,
        }
    }

+    /// See [`RemoteStorage::upload`]
    pub async fn upload(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        match self {
-            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata).await,
-            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata).await,
-            Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata).await,
-            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata).await,
+            Self::LocalFs(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
+            Self::AwsS3(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
+            Self::AzureBlob(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
+            Self::Unreliable(s) => s.upload(from, data_size_bytes, to, metadata, cancel).await,
        }
    }

-    pub async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+    pub async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
        match self {
-            Self::LocalFs(s) => s.download(from).await,
-            Self::AwsS3(s) => s.download(from).await,
-            Self::AzureBlob(s) => s.download(from).await,
-            Self::Unreliable(s) => s.download(from).await,
+            Self::LocalFs(s) => s.download(from, cancel).await,
+            Self::AwsS3(s) => s.download(from, cancel).await,
+            Self::AzureBlob(s) => s.download(from, cancel).await,
+            Self::Unreliable(s) => s.download(from, cancel).await,
        }
    }

@@ -425,54 +403,72 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {
        from: &RemotePath,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        match self {
            Self::LocalFs(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
+                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
                    .await
            }
            Self::AwsS3(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
+                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
                    .await
            }
            Self::AzureBlob(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
+                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
                    .await
            }
            Self::Unreliable(s) => {
-                s.download_byte_range(from, start_inclusive, end_exclusive)
+                s.download_byte_range(from, start_inclusive, end_exclusive, cancel)
                    .await
            }
        }
    }

-    pub async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+    /// See [`RemoteStorage::delete`]
+    pub async fn delete(
+        &self,
+        path: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
        match self {
-            Self::LocalFs(s) => s.delete(path).await,
-            Self::AwsS3(s) => s.delete(path).await,
-            Self::AzureBlob(s) => s.delete(path).await,
-            Self::Unreliable(s) => s.delete(path).await,
+            Self::LocalFs(s) => s.delete(path, cancel).await,
+            Self::AwsS3(s) => s.delete(path, cancel).await,
+            Self::AzureBlob(s) => s.delete(path, cancel).await,
+            Self::Unreliable(s) => s.delete(path, cancel).await,
        }
    }

-    pub async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+    /// See [`RemoteStorage::delete_objects`]
+    pub async fn delete_objects(
+        &self,
+        paths: &[RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
        match self {
-            Self::LocalFs(s) => s.delete_objects(paths).await,
-            Self::AwsS3(s) => s.delete_objects(paths).await,
-            Self::AzureBlob(s) => s.delete_objects(paths).await,
-            Self::Unreliable(s) => s.delete_objects(paths).await,
+            Self::LocalFs(s) => s.delete_objects(paths, cancel).await,
+            Self::AwsS3(s) => s.delete_objects(paths, cancel).await,
+            Self::AzureBlob(s) => s.delete_objects(paths, cancel).await,
+            Self::Unreliable(s) => s.delete_objects(paths, cancel).await,
        }
    }

-    pub async fn copy_object(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+    /// See [`RemoteStorage::copy`]
+    pub async fn copy_object(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
        match self {
-            Self::LocalFs(s) => s.copy(from, to).await,
-            Self::AwsS3(s) => s.copy(from, to).await,
-            Self::AzureBlob(s) => s.copy(from, to).await,
-            Self::Unreliable(s) => s.copy(from, to).await,
+            Self::LocalFs(s) => s.copy(from, to, cancel).await,
+            Self::AwsS3(s) => s.copy(from, to, cancel).await,
+            Self::AzureBlob(s) => s.copy(from, to, cancel).await,
+            Self::Unreliable(s) => s.copy(from, to, cancel).await,
        }
    }

+    /// See [`RemoteStorage::time_travel_recover`].
    pub async fn time_travel_recover(
        &self,
        prefix: Option<&RemotePath>,
@@ -503,10 +499,11 @@ impl<Other: RemoteStorage> GenericRemoteStorage<Arc<Other>> {

 impl GenericRemoteStorage {
    pub fn from_config(storage_config: &RemoteStorageConfig) -> anyhow::Result<Self> {
+        let timeout = storage_config.timeout;
        Ok(match &storage_config.storage {
-            RemoteStorageKind::LocalFs(root) => {
-                info!("Using fs root '{root}' as a remote storage");
-                Self::LocalFs(LocalFs::new(root.clone())?)
+            RemoteStorageKind::LocalFs(path) => {
+                info!("Using fs root '{path}' as a remote storage");
+                Self::LocalFs(LocalFs::new(path.clone(), timeout)?)
            }
            RemoteStorageKind::AwsS3(s3_config) => {
                // The profile and access key id are only printed here for debugging purposes,
@@ -516,12 +513,12 @@ impl GenericRemoteStorage {
                    std::env::var("AWS_ACCESS_KEY_ID").unwrap_or_else(|_| "<none>".into());
                info!("Using s3 bucket '{}' in region '{}' as a remote storage, prefix in bucket: '{:?}', bucket endpoint: '{:?}', profile: {profile}, access_key_id: {access_key_id}",
                      s3_config.bucket_name, s3_config.bucket_region, s3_config.prefix_in_bucket, s3_config.endpoint);
-                Self::AwsS3(Arc::new(S3Bucket::new(s3_config)?))
+                Self::AwsS3(Arc::new(S3Bucket::new(s3_config, timeout)?))
            }
            RemoteStorageKind::AzureContainer(azure_config) => {
                info!("Using azure container '{}' in region '{}' as a remote storage, prefix in container: '{:?}'",
                      azure_config.container_name, azure_config.container_region, azure_config.prefix_in_container);
-                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config)?))
+                Self::AzureBlob(Arc::new(AzureBlobStorage::new(azure_config, timeout)?))
            }
        })
    }
@@ -530,18 +527,15 @@ impl GenericRemoteStorage {
        Self::Unreliable(Arc::new(UnreliableWrapper::new(s, fail_first)))
    }

-    /// Takes storage object contents and its size and uploads to remote storage,
-    /// mapping `from_path` to the corresponding remote object id in the storage.
-    ///
-    /// The storage object does not have to be present on the `from_path`,
-    /// this path is used for the remote object id conversion only.
+    /// See [`RemoteStorage::upload`], which this method calls with `None` as metadata.
    pub async fn upload_storage_object(
        &self,
        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
-        self.upload(from, from_size_bytes, to, None)
+        self.upload(from, from_size_bytes, to, None, cancel)
            .await
            .with_context(|| {
                format!("Failed to upload data of length {from_size_bytes} to storage path {to:?}")
@@ -554,10 +548,11 @@ impl GenericRemoteStorage {
        &self,
        byte_range: Option<(u64, Option<u64>)>,
        from: &RemotePath,
+        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        match byte_range {
-            Some((start, end)) => self.download_byte_range(from, start, end).await,
-            None => self.download(from).await,
+            Some((start, end)) => self.download_byte_range(from, start, end, cancel).await,
+            None => self.download(from, cancel).await,
        }
    }
 }
@@ -572,6 +567,9 @@ pub struct StorageMetadata(HashMap<String, String>);
 pub struct RemoteStorageConfig {
    /// The storage connection configuration.
    pub storage: RemoteStorageKind,
+    /// A common timeout enforced for all requests after concurrency limiter permit has been
+    /// acquired.
+    pub timeout: Duration,
 }

 /// A kind of a remote storage to connect to, with its connection configuration.
@@ -656,6 +654,8 @@ impl Debug for AzureConfig {
 }

 impl RemoteStorageConfig {
+    pub const DEFAULT_TIMEOUT: Duration = std::time::Duration::from_secs(120);
+
    pub fn from_toml(toml: &toml_edit::Item) -> anyhow::Result<Option<RemoteStorageConfig>> {
        let local_path = toml.get("local_path");
        let bucket_name = toml.get("bucket_name");
@@ -685,6 +685,27 @@ impl RemoteStorageConfig {
            .map(|endpoint| parse_toml_string("endpoint", endpoint))
            .transpose()?;

+        let timeout = toml
+            .get("timeout")
+            .map(|timeout| {
+                timeout
+                    .as_str()
+                    .ok_or_else(|| anyhow::Error::msg("timeout was not a string"))
+            })
+            .transpose()
+            .and_then(|timeout| {
+                timeout
+                    .map(humantime::parse_duration)
+                    .transpose()
+                    .map_err(anyhow::Error::new)
+            })
+            .context("parse timeout")?
+            .unwrap_or(Self::DEFAULT_TIMEOUT);
+
+        if timeout < Duration::from_secs(1) {
+            bail!("timeout was specified as {timeout:?} which is too low");
+        }
+
        let storage = match (
            local_path,
            bucket_name,
@@ -746,7 +767,7 @@ impl RemoteStorageConfig {
            }
        };

-        Ok(Some(RemoteStorageConfig { storage }))
+        Ok(Some(RemoteStorageConfig { storage, timeout }))
    }
 }

@@ -842,4 +863,24 @@ mod tests {
        let err = RemotePath::new(Utf8Path::new("/")).expect_err("Should fail on absolute paths");
        assert_eq!(err.to_string(), "Path \"/\" is not relative");
    }
+
+    #[test]
+    fn parse_localfs_config_with_timeout() {
+        let input = "local_path = '.'
+timeout = '5s'";
+
+        let toml = input.parse::<toml_edit::Document>().unwrap();
+
+        let config = RemoteStorageConfig::from_toml(toml.as_item())
+            .unwrap()
+            .expect("it exists");
+
+        assert_eq!(
+            config,
+            RemoteStorageConfig {
+                storage: RemoteStorageKind::LocalFs(Utf8PathBuf::from(".")),
+                timeout: Duration::from_secs(5)
+            }
+        );
+    }
 }
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -5,7 +5,12 @@
 //! volume is mounted to the local FS.

 use std::{
-    borrow::Cow, future::Future, io::ErrorKind, num::NonZeroU32, pin::Pin, time::SystemTime,
+    borrow::Cow,
+    future::Future,
+    io::ErrorKind,
+    num::NonZeroU32,
+    pin::Pin,
+    time::{Duration, SystemTime},
 };

 use anyhow::{bail, ensure, Context};
@@ -20,7 +25,9 @@ use tokio_util::{io::ReaderStream, sync::CancellationToken};
 use tracing::*;
 use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

-use crate::{Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError};
+use crate::{
+    Download, DownloadError, Listing, ListingMode, RemotePath, TimeTravelError, TimeoutOrCancel,
+};

 use super::{RemoteStorage, StorageMetadata};

@@ -29,12 +36,13 @@ const LOCAL_FS_TEMP_FILE_SUFFIX: &str = "___temp";
 #[derive(Debug, Clone)]
 pub struct LocalFs {
    storage_root: Utf8PathBuf,
+    timeout: Duration,
 }

 impl LocalFs {
    /// Attempts to create local FS storage, along with its root directory.
    /// Storage root will be created (if does not exist) and transformed into an absolute path (if passed as relative).
-    pub fn new(mut storage_root: Utf8PathBuf) -> anyhow::Result<Self> {
+    pub fn new(mut storage_root: Utf8PathBuf, timeout: Duration) -> anyhow::Result<Self> {
        if !storage_root.exists() {
            std::fs::create_dir_all(&storage_root).with_context(|| {
                format!("Failed to create all directories in the given root path {storage_root:?}")
@@ -46,7 +54,10 @@ impl LocalFs {
            })?;
        }

-        Ok(Self { storage_root })
+        Ok(Self {
+            storage_root,
+            timeout,
+        })
    }

    // mirrors S3Bucket::s3_object_to_relative_path
@@ -157,80 +168,14 @@ impl LocalFs {

        Ok(files)
    }
-}

-impl RemoteStorage for LocalFs {
-    async fn list(
-        &self,
-        prefix: Option<&RemotePath>,
-        mode: ListingMode,
-        max_keys: Option<NonZeroU32>,
-    ) -> Result<Listing, DownloadError> {
-        let mut result = Listing::default();
-
-        if let ListingMode::NoDelimiter = mode {
-            let keys = self
-                .list_recursive(prefix)
-                .await
-                .map_err(DownloadError::Other)?;
-
-            result.keys = keys
-                .into_iter()
-                .filter(|k| {
-                    let path = k.with_base(&self.storage_root);
-                    !path.is_dir()
-                })
-                .collect();
-            if let Some(max_keys) = max_keys {
-                result.keys.truncate(max_keys.get() as usize);
-            }
-
-            return Ok(result);
-        }
-
-        let path = match prefix {
-            Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
-            None => Cow::Borrowed(&self.storage_root),
-        };
-
-        let prefixes_to_filter = get_all_files(path.as_ref(), false)
-            .await
-            .map_err(DownloadError::Other)?;
-
-        // filter out empty directories to mirror s3 behavior.
-        for prefix in prefixes_to_filter {
-            if prefix.is_dir()
-                && is_directory_empty(&prefix)
-                    .await
-                    .map_err(DownloadError::Other)?
-            {
-                continue;
-            }
-
-            let stripped = prefix
-                .strip_prefix(&self.storage_root)
-                .context("Failed to strip prefix")
-                .and_then(RemotePath::new)
-                .expect(
-                    "We list files for storage root, hence should be able to remote the prefix",
-                );
-
-            if prefix.is_dir() {
-                result.prefixes.push(stripped);
-            } else {
-                result.keys.push(stripped);
-            }
-        }
-
-        Ok(result)
-    }
-
-    async fn upload(
+    async fn upload0(
        &self,
        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        let target_file_path = to.with_base(&self.storage_root);
        create_target_directory(&target_file_path).await?;
@@ -265,9 +210,26 @@ impl RemoteStorage for LocalFs {
        let mut buffer_to_read = data.take(from_size_bytes);

        // alternatively we could just write the bytes to a file, but local_fs is a testing utility
-        let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
-            .await
-            .with_context(|| {
+        let copy = io::copy_buf(&mut buffer_to_read, &mut destination);
+
+        let bytes_read = tokio::select! {
+            biased;
+            _ = cancel.cancelled() => {
+                let file = destination.into_inner();
+                // wait for the inflight operation(s) to complete so that there could be a next
+                // attempt right away and our writes are not directed to their file.
+                file.into_std().await;
+
+                // TODO: leave the temp or not? leaving is probably less racy. enabled truncate at
+                // least.
+                fs::remove_file(temp_file_path).await.context("remove temp_file_path after cancellation or timeout")?;
+                return Err(TimeoutOrCancel::Cancel.into());
+            }
+            read = copy => read,
+        };
+
+        let bytes_read =
+            bytes_read.with_context(|| {
                format!(
                    "Failed to upload file (write temp) to the local storage at '{temp_file_path}'",
                )
@@ -299,6 +261,9 @@ impl RemoteStorage for LocalFs {
            })?;

        if let Some(storage_metadata) = metadata {
+            // FIXME: we must not be using metadata much, since this would forget the old metadata
+            // for new writes? or perhaps metadata is sticky; could consider removing if it's never
+            // used.
            let storage_metadata_path = storage_metadata_path(&target_file_path);
            fs::write(
                &storage_metadata_path,
@@ -315,8 +280,131 @@ impl RemoteStorage for LocalFs {

        Ok(())
    }
+}

-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+impl RemoteStorage for LocalFs {
+    async fn list(
+        &self,
+        prefix: Option<&RemotePath>,
+        mode: ListingMode,
+        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
+    ) -> Result<Listing, DownloadError> {
+        let op = async {
+            let mut result = Listing::default();
+
+            if let ListingMode::NoDelimiter = mode {
+                let keys = self
+                    .list_recursive(prefix)
+                    .await
+                    .map_err(DownloadError::Other)?;
+
+                result.keys = keys
+                    .into_iter()
+                    .filter(|k| {
+                        let path = k.with_base(&self.storage_root);
+                        !path.is_dir()
+                    })
+                    .collect();
+
+                if let Some(max_keys) = max_keys {
+                    result.keys.truncate(max_keys.get() as usize);
+                }
+
+                return Ok(result);
+            }
+
+            let path = match prefix {
+                Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
+                None => Cow::Borrowed(&self.storage_root),
+            };
+
+            let prefixes_to_filter = get_all_files(path.as_ref(), false)
+                .await
+                .map_err(DownloadError::Other)?;
+
+            // filter out empty directories to mirror s3 behavior.
+            for prefix in prefixes_to_filter {
+                if prefix.is_dir()
+                    && is_directory_empty(&prefix)
+                        .await
+                        .map_err(DownloadError::Other)?
+                {
+                    continue;
+                }
+
+                let stripped = prefix
+                    .strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
+                    .and_then(RemotePath::new)
+                    .expect(
+                        "We list files for storage root, hence should be able to remote the prefix",
+                    );
+
+                if prefix.is_dir() {
+                    result.prefixes.push(stripped);
+                } else {
+                    result.keys.push(stripped);
+                }
+            }
+
+            Ok(result)
+        };
+
+        let timeout = async {
+            tokio::time::sleep(self.timeout).await;
+            Err(DownloadError::Timeout)
+        };
+
+        let cancelled = async {
+            cancel.cancelled().await;
+            Err(DownloadError::Cancelled)
+        };
+
+        tokio::select! {
+            res = op => res,
+            res = timeout => res,
+            res = cancelled => res,
+        }
+    }
+
+    async fn upload(
+        &self,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
+        data_size_bytes: usize,
+        to: &RemotePath,
+        metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let cancel = cancel.child_token();
+
+        let op = self.upload0(data, data_size_bytes, to, metadata, &cancel);
+        let mut op = std::pin::pin!(op);
+
+        // race the upload0 to the timeout; if it goes over, do a graceful shutdown
+        let (res, timeout) = tokio::select! {
+            res = &mut op => (res, false),
+            _ = tokio::time::sleep(self.timeout) => {
+                cancel.cancel();
+                (op.await, true)
+            }
+        };
+
+        match res {
+            Err(e) if timeout && TimeoutOrCancel::caused_by_cancel(&e) => {
+                // we caused this cancel (or they happened simultaneously) -- swap it out to
+                // Timeout
+                Err(TimeoutOrCancel::Timeout.into())
+            }
+            res => res,
+        }
+    }
+
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
            let source = ReaderStream::new(
@@ -334,6 +422,10 @@ impl RemoteStorage for LocalFs {
                .read_storage_metadata(&target_path)
                .await
                .map_err(DownloadError::Other)?;
+
+            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+
            Ok(Download {
                metadata,
                last_modified: None,
@@ -350,6 +442,7 @@ impl RemoteStorage for LocalFs {
        from: &RemotePath,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        if let Some(end_exclusive) = end_exclusive {
            if end_exclusive <= start_inclusive {
@@ -391,6 +484,9 @@ impl RemoteStorage for LocalFs {
            let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive);
            let source = ReaderStream::new(source);

+            let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone());
+            let source = crate::support::DownloadStream::new(cancel_or_timeout, source);
+
            Ok(Download {
                metadata,
                last_modified: None,
@@ -402,7 +498,7 @@ impl RemoteStorage for LocalFs {
        }
    }

-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+    async fn delete(&self, path: &RemotePath, _cancel: &CancellationToken) -> anyhow::Result<()> {
        let file_path = path.with_base(&self.storage_root);
        match fs::remove_file(&file_path).await {
            Ok(()) => Ok(()),
@@ -414,14 +510,23 @@ impl RemoteStorage for LocalFs {
        }
    }

-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
        for path in paths {
-            self.delete(path).await?
+            self.delete(path, cancel).await?
        }
        Ok(())
    }

-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        _cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
        let from_path = from.with_base(&self.storage_root);
        let to_path = to.with_base(&self.storage_root);
        create_target_directory(&to_path).await?;
@@ -435,7 +540,6 @@ impl RemoteStorage for LocalFs {
        Ok(())
    }

-    #[allow(clippy::diverging_sub_expression)]
    async fn time_travel_recover(
        &self,
        _prefix: Option<&RemotePath>,
@@ -529,8 +633,9 @@ mod fs_tests {
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
+        let cancel = CancellationToken::new();
        let download = storage
-            .download(remote_storage_path)
+            .download(remote_storage_path, &cancel)
            .await
            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
        ensure!(
@@ -545,16 +650,16 @@ mod fs_tests {

    #[tokio::test]
    async fn upload_file() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;

-        let target_path_1 = upload_dummy_file(&storage, "upload_1", None).await?;
+        let target_path_1 = upload_dummy_file(&storage, "upload_1", None, &cancel).await?;
        assert_eq!(
            storage.list_all().await?,
            vec![target_path_1.clone()],
            "Should list a single file after first upload"
        );

-        let target_path_2 = upload_dummy_file(&storage, "upload_2", None).await?;
+        let target_path_2 = upload_dummy_file(&storage, "upload_2", None, &cancel).await?;
        assert_eq!(
            list_files_sorted(&storage).await?,
            vec![target_path_1.clone(), target_path_2.clone()],
@@ -566,7 +671,7 @@ mod fs_tests {

    #[tokio::test]
    async fn upload_file_negatives() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;

        let id = RemotePath::new(Utf8Path::new("dummy"))?;
        let content = Bytes::from_static(b"12345");
@@ -575,34 +680,34 @@ mod fs_tests {
        // Check that you get an error if the size parameter doesn't match the actual
        // size of the stream.
        storage
-            .upload(content(), 0, &id, None)
+            .upload(content(), 0, &id, None, &cancel)
            .await
            .expect_err("upload with zero size succeeded");
        storage
-            .upload(content(), 4, &id, None)
+            .upload(content(), 4, &id, None, &cancel)
            .await
            .expect_err("upload with too short size succeeded");
        storage
-            .upload(content(), 6, &id, None)
+            .upload(content(), 6, &id, None, &cancel)
            .await
            .expect_err("upload with too large size succeeded");

        // Correct size is 5, this should succeed.
-        storage.upload(content(), 5, &id, None).await?;
+        storage.upload(content(), 5, &id, None, &cancel).await?;

        Ok(())
    }

-    fn create_storage() -> anyhow::Result<LocalFs> {
+    fn create_storage() -> anyhow::Result<(LocalFs, CancellationToken)> {
        let storage_root = tempdir()?.path().to_path_buf();
-        LocalFs::new(storage_root)
+        LocalFs::new(storage_root, Duration::from_secs(120)).map(|s| (s, CancellationToken::new()))
    }

    #[tokio::test]
    async fn download_file() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;

        let contents = read_and_check_metadata(&storage, &upload_target, None).await?;
        assert_eq!(
@@ -612,7 +717,7 @@ mod fs_tests {
        );

        let non_existing_path = "somewhere/else";
-        match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?).await {
+        match storage.download(&RemotePath::new(Utf8Path::new(non_existing_path))?, &cancel).await {
            Err(DownloadError::NotFound) => {} // Should get NotFound for non existing keys
            other => panic!("Should get a NotFound error when downloading non-existing storage files, but got: {other:?}"),
        }
@@ -621,9 +726,9 @@ mod fs_tests {

    #[tokio::test]
    async fn download_file_range_positive() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;

        let full_range_download_contents =
            read_and_check_metadata(&storage, &upload_target, None).await?;
@@ -637,7 +742,12 @@ mod fs_tests {
        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

        let first_part_download = storage
-            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
+            .download_byte_range(
+                &upload_target,
+                0,
+                Some(first_part_local.len() as u64),
+                &cancel,
+            )
            .await?;
        assert!(
            first_part_download.metadata.is_none(),
@@ -655,6 +765,7 @@ mod fs_tests {
                &upload_target,
                first_part_local.len() as u64,
                Some((first_part_local.len() + second_part_local.len()) as u64),
+                &cancel,
            )
            .await?;
        assert!(
@@ -669,7 +780,7 @@ mod fs_tests {
        );

        let suffix_bytes = storage
-            .download_byte_range(&upload_target, 13, None)
+            .download_byte_range(&upload_target, 13, None, &cancel)
            .await?
            .download_stream;
        let suffix_bytes = aggregate(suffix_bytes).await?;
@@ -677,7 +788,7 @@ mod fs_tests {
        assert_eq!(upload_name, suffix);

        let all_bytes = storage
-            .download_byte_range(&upload_target, 0, None)
+            .download_byte_range(&upload_target, 0, None, &cancel)
            .await?
            .download_stream;
        let all_bytes = aggregate(all_bytes).await?;
@@ -689,9 +800,9 @@ mod fs_tests {

    #[tokio::test]
    async fn download_file_range_negative() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;

        let start = 1_000_000_000;
        let end = start + 1;
@@ -700,6 +811,7 @@ mod fs_tests {
                &upload_target,
                start,
                Some(end), // exclusive end
+                &cancel,
            )
            .await
        {
@@ -716,7 +828,7 @@ mod fs_tests {
        let end = 234;
        assert!(start > end, "Should test an incorrect range");
        match storage
-            .download_byte_range(&upload_target, start, Some(end))
+            .download_byte_range(&upload_target, start, Some(end), &cancel)
            .await
        {
            Ok(_) => panic!("Should not allow downloading wrong ranges"),
@@ -733,15 +845,15 @@ mod fs_tests {

    #[tokio::test]
    async fn delete_file() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
        let upload_name = "upload_1";
-        let upload_target = upload_dummy_file(&storage, upload_name, None).await?;
+        let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?;

-        storage.delete(&upload_target).await?;
+        storage.delete(&upload_target, &cancel).await?;
        assert!(storage.list_all().await?.is_empty());

        storage
-            .delete(&upload_target)
+            .delete(&upload_target, &cancel)
            .await
            .expect("Should allow deleting non-existing storage files");

@@ -750,14 +862,14 @@ mod fs_tests {

    #[tokio::test]
    async fn file_with_metadata() -> anyhow::Result<()> {
-        let storage = create_storage()?;
+        let (storage, cancel) = create_storage()?;
        let upload_name = "upload_1";
        let metadata = StorageMetadata(HashMap::from([
            ("one".to_string(), "1".to_string()),
            ("two".to_string(), "2".to_string()),
        ]));
        let upload_target =
-            upload_dummy_file(&storage, upload_name, Some(metadata.clone())).await?;
+            upload_dummy_file(&storage, upload_name, Some(metadata.clone()), &cancel).await?;

        let full_range_download_contents =
            read_and_check_metadata(&storage, &upload_target, Some(&metadata)).await?;
@@ -771,7 +883,12 @@ mod fs_tests {
        let (first_part_local, _) = uploaded_bytes.split_at(3);

        let partial_download_with_metadata = storage
-            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
+            .download_byte_range(
+                &upload_target,
+                0,
+                Some(first_part_local.len() as u64),
+                &cancel,
+            )
            .await?;
        let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
        assert_eq!(
@@ -792,16 +909,20 @@ mod fs_tests {
    #[tokio::test]
    async fn list() -> anyhow::Result<()> {
        // No delimiter: should recursively list everything
-        let storage = create_storage()?;
-        let child = upload_dummy_file(&storage, "grandparent/parent/child", None).await?;
-        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None).await?;
+        let (storage, cancel) = create_storage()?;
+        let child = upload_dummy_file(&storage, "grandparent/parent/child", None, &cancel).await?;
+        let uncle = upload_dummy_file(&storage, "grandparent/uncle", None, &cancel).await?;

-        let listing = storage.list(None, ListingMode::NoDelimiter, None).await?;
+        let listing = storage
+            .list(None, ListingMode::NoDelimiter, None, &cancel)
+            .await?;
        assert!(listing.prefixes.is_empty());
        assert_eq!(listing.keys, [uncle.clone(), child.clone()].to_vec());

        // Delimiter: should only go one deep
-        let listing = storage.list(None, ListingMode::WithDelimiter, None).await?;
+        let listing = storage
+            .list(None, ListingMode::WithDelimiter, None, &cancel)
+            .await?;

        assert_eq!(
            listing.prefixes,
@@ -815,6 +936,7 @@ mod fs_tests {
                Some(&RemotePath::from_string("timelines/some_timeline/grandparent").unwrap()),
                ListingMode::WithDelimiter,
                None,
+                &cancel,
            )
            .await?;
        assert_eq!(
@@ -827,10 +949,75 @@ mod fs_tests {
        Ok(())
    }

+    #[tokio::test]
+    async fn overwrite_shorter_file() -> anyhow::Result<()> {
+        let (storage, cancel) = create_storage()?;
+
+        let path = RemotePath::new("does/not/matter/file".into())?;
+
+        let body = Bytes::from_static(b"long file contents is long");
+        {
+            let len = body.len();
+            let body =
+                futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
+            storage.upload(body, len, &path, None, &cancel).await?;
+        }
+
+        let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
+        assert_eq!(body, read);
+
+        let shorter = Bytes::from_static(b"shorter body");
+        {
+            let len = shorter.len();
+            let body =
+                futures::stream::once(futures::future::ready(std::io::Result::Ok(shorter.clone())));
+            storage.upload(body, len, &path, None, &cancel).await?;
+        }
+
+        let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
+        assert_eq!(shorter, read);
+        Ok(())
+    }
+
+    #[tokio::test]
+    async fn cancelled_upload_can_later_be_retried() -> anyhow::Result<()> {
+        let (storage, cancel) = create_storage()?;
+
+        let path = RemotePath::new("does/not/matter/file".into())?;
+
+        let body = Bytes::from_static(b"long file contents is long");
+        {
+            let len = body.len();
+            let body =
+                futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
+            let cancel = cancel.child_token();
+            cancel.cancel();
+            let e = storage
+                .upload(body, len, &path, None, &cancel)
+                .await
+                .unwrap_err();
+
+            assert!(TimeoutOrCancel::caused_by_cancel(&e));
+        }
+
+        {
+            let len = body.len();
+            let body =
+                futures::stream::once(futures::future::ready(std::io::Result::Ok(body.clone())));
+            storage.upload(body, len, &path, None, &cancel).await?;
+        }
+
+        let read = aggregate(storage.download(&path, &cancel).await?.download_stream).await?;
+        assert_eq!(body, read);
+
+        Ok(())
+    }
+
    async fn upload_dummy_file(
        storage: &LocalFs,
        name: &str,
        metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<RemotePath> {
        let from_path = storage
            .storage_root
@@ -852,7 +1039,9 @@ mod fs_tests {

        let file = tokio_util::io::ReaderStream::new(file);

-        storage.upload(file, size, &relative_path, metadata).await?;
+        storage
+            .upload(file, size, &relative_path, metadata, cancel)
+            .await?;
        Ok(relative_path)
    }

--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -11,7 +11,7 @@ use std::{
    pin::Pin,
    sync::Arc,
    task::{Context, Poll},
-    time::SystemTime,
+    time::{Duration, SystemTime},
 };

 use anyhow::{anyhow, Context as _};
@@ -46,9 +46,9 @@ use utils::backoff;

 use super::StorageMetadata;
 use crate::{
-    support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError, Listing, ListingMode,
-    RemotePath, RemoteStorage, S3Config, TimeTravelError, MAX_KEYS_PER_DELETE,
-    REMOTE_STORAGE_PREFIX_SEPARATOR,
+    error::Cancelled, support::PermitCarrying, ConcurrencyLimiter, Download, DownloadError,
+    Listing, ListingMode, RemotePath, RemoteStorage, S3Config, TimeTravelError, TimeoutOrCancel,
+    MAX_KEYS_PER_DELETE, REMOTE_STORAGE_PREFIX_SEPARATOR,
 };

 pub(super) mod metrics;
@@ -63,6 +63,8 @@ pub struct S3Bucket {
    prefix_in_bucket: Option<String>,
    max_keys_per_list_response: Option<i32>,
    concurrency_limiter: ConcurrencyLimiter,
+    // Per-request timeout. Accessible for tests.
+    pub timeout: Duration,
 }

 struct GetObjectRequest {
@@ -72,7 +74,7 @@ struct GetObjectRequest {
 }
 impl S3Bucket {
    /// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
-    pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
+    pub fn new(aws_config: &S3Config, timeout: Duration) -> anyhow::Result<Self> {
        tracing::debug!(
            "Creating s3 remote storage for S3 bucket {}",
            aws_config.bucket_name
@@ -152,6 +154,7 @@ impl S3Bucket {
            max_keys_per_list_response: aws_config.max_keys_per_list_response,
            prefix_in_bucket,
            concurrency_limiter: ConcurrencyLimiter::new(aws_config.concurrency_limit.get()),
+            timeout,
        })
    }

@@ -185,40 +188,55 @@ impl S3Bucket {
        }
    }

-    async fn permit(&self, kind: RequestKind) -> tokio::sync::SemaphorePermit<'_> {
+    async fn permit(
+        &self,
+        kind: RequestKind,
+        cancel: &CancellationToken,
+    ) -> Result<tokio::sync::SemaphorePermit<'_>, Cancelled> {
        let started_at = start_counting_cancelled_wait(kind);
-        let permit = self
-            .concurrency_limiter
-            .acquire(kind)
-            .await
-            .expect("semaphore is never closed");
+        let acquire = self.concurrency_limiter.acquire(kind);
+
+        let permit = tokio::select! {
+            permit = acquire => permit.expect("semaphore is never closed"),
+            _ = cancel.cancelled() => return Err(Cancelled),
+        };

        let started_at = ScopeGuard::into_inner(started_at);
        metrics::BUCKET_METRICS
            .wait_seconds
            .observe_elapsed(kind, started_at);

-        permit
+        Ok(permit)
    }

-    async fn owned_permit(&self, kind: RequestKind) -> tokio::sync::OwnedSemaphorePermit {
+    async fn owned_permit(
+        &self,
+        kind: RequestKind,
+        cancel: &CancellationToken,
+    ) -> Result<tokio::sync::OwnedSemaphorePermit, Cancelled> {
        let started_at = start_counting_cancelled_wait(kind);
-        let permit = self
-            .concurrency_limiter
-            .acquire_owned(kind)
-            .await
-            .expect("semaphore is never closed");
+        let acquire = self.concurrency_limiter.acquire_owned(kind);
+
+        let permit = tokio::select! {
+            permit = acquire => permit.expect("semaphore is never closed"),
+            _ = cancel.cancelled() => return Err(Cancelled),
+        };

        let started_at = ScopeGuard::into_inner(started_at);
        metrics::BUCKET_METRICS
            .wait_seconds
            .observe_elapsed(kind, started_at);
-        permit
+        Ok(permit)
    }

-    async fn download_object(&self, request: GetObjectRequest) -> Result<Download, DownloadError> {
+    async fn download_object(
+        &self,
+        request: GetObjectRequest,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
        let kind = RequestKind::Get;
-        let permit = self.owned_permit(kind).await;
+
+        let permit = self.owned_permit(kind, cancel).await?;

        let started_at = start_measuring_requests(kind);

@@ -228,8 +246,13 @@ impl S3Bucket {
            .bucket(request.bucket)
            .key(request.key)
            .set_range(request.range)
-            .send()
-            .await;
+            .send();
+
+        let get_object = tokio::select! {
+            res = get_object => res,
+            _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
+            _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
+        };

        let started_at = ScopeGuard::into_inner(started_at);

@@ -259,6 +282,10 @@ impl S3Bucket {
            }
        };

+        // even if we would have no timeout left, continue anyways. the caller can decide to ignore
+        // the errors considering timeouts and cancellation.
+        let remaining = self.timeout.saturating_sub(started_at.elapsed());
+
        let metadata = object_output.metadata().cloned().map(StorageMetadata);
        let etag = object_output.e_tag;
        let last_modified = object_output.last_modified.and_then(|t| t.try_into().ok());
@@ -268,6 +295,9 @@ impl S3Bucket {
        let body = PermitCarrying::new(permit, body);
        let body = TimedDownload::new(started_at, body);

+        let cancel_or_timeout = crate::support::cancel_or_timeout(remaining, cancel.clone());
+        let body = crate::support::DownloadStream::new(cancel_or_timeout, body);
+
        Ok(Download {
            metadata,
            etag,
@@ -278,33 +308,44 @@ impl S3Bucket {

    async fn delete_oids(
        &self,
-        kind: RequestKind,
+        _permit: &tokio::sync::SemaphorePermit<'_>,
        delete_objects: &[ObjectIdentifier],
+        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
+        let kind = RequestKind::Delete;
+        let mut cancel = std::pin::pin!(cancel.cancelled());
+
        for chunk in delete_objects.chunks(MAX_KEYS_PER_DELETE) {
            let started_at = start_measuring_requests(kind);

-            let resp = self
+            let req = self
                .client
                .delete_objects()
                .bucket(self.bucket_name.clone())
                .delete(
                    Delete::builder()
                        .set_objects(Some(chunk.to_vec()))
-                        .build()?,
+                        .build()
+                        .context("build request")?,
                )
-                .send()
-                .await;
+                .send();
+
+            let resp = tokio::select! {
+                resp = req => resp,
+                _ = tokio::time::sleep(self.timeout) => return Err(TimeoutOrCancel::Timeout.into()),
+                _ = &mut cancel => return Err(TimeoutOrCancel::Cancel.into()),
+            };

            let started_at = ScopeGuard::into_inner(started_at);
            metrics::BUCKET_METRICS
                .req_seconds
                .observe_elapsed(kind, &resp, started_at);

-            let resp = resp?;
+            let resp = resp.context("request deletion")?;
            metrics::BUCKET_METRICS
                .deleted_objects_total
                .inc_by(chunk.len() as u64);
+
            if let Some(errors) = resp.errors {
                // Log a bounded number of the errors within the response:
                // these requests can carry 1000 keys so logging each one
@@ -320,9 +361,10 @@ impl S3Bucket {
                    );
                }

-                return Err(anyhow::format_err!(
-                    "Failed to delete {} objects",
-                    errors.len()
+                return Err(anyhow::anyhow!(
+                    "Failed to delete {}/{} objects",
+                    errors.len(),
+                    chunk.len(),
                ));
            }
        }
@@ -410,6 +452,7 @@ impl RemoteStorage for S3Bucket {
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
    ) -> Result<Listing, DownloadError> {
        let kind = RequestKind::List;
        // s3 sdk wants i32
@@ -431,10 +474,11 @@ impl RemoteStorage for S3Bucket {
                p
            });

+        let _permit = self.permit(kind, cancel).await?;
+
        let mut continuation_token = None;

        loop {
-            let _guard = self.permit(kind).await;
            let started_at = start_measuring_requests(kind);

            // min of two Options, returning Some if one is value and another is
@@ -456,9 +500,15 @@ impl RemoteStorage for S3Bucket {
                request = request.delimiter(REMOTE_STORAGE_PREFIX_SEPARATOR.to_string());
            }

-            let response = request
-                .send()
-                .await
+            let request = request.send();
+
+            let response = tokio::select! {
+                res = request => res,
+                _ = tokio::time::sleep(self.timeout) => return Err(DownloadError::Timeout),
+                _ = cancel.cancelled() => return Err(DownloadError::Cancelled),
+            };
+
+            let response = response
                .context("Failed to list S3 prefixes")
                .map_err(DownloadError::Other);

@@ -511,16 +561,17 @@ impl RemoteStorage for S3Bucket {
        from_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        let kind = RequestKind::Put;
-        let _guard = self.permit(kind).await;
+        let _permit = self.permit(kind, cancel).await?;

        let started_at = start_measuring_requests(kind);

        let body = Body::wrap_stream(from);
        let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));

-        let res = self
+        let upload = self
            .client
            .put_object()
            .bucket(self.bucket_name.clone())
@@ -528,22 +579,40 @@ impl RemoteStorage for S3Bucket {
            .set_metadata(metadata.map(|m| m.0))
            .content_length(from_size_bytes.try_into()?)
            .body(bytes_stream)
-            .send()
-            .await;
+            .send();

-        let started_at = ScopeGuard::into_inner(started_at);
-        metrics::BUCKET_METRICS
-            .req_seconds
-            .observe_elapsed(kind, &res, started_at);
+        let upload = tokio::time::timeout(self.timeout, upload);

-        res?;
+        let res = tokio::select! {
+            res = upload => res,
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };

-        Ok(())
+        if let Ok(inner) = &res {
+            // do not incl. timeouts as errors in metrics but cancellations
+            let started_at = ScopeGuard::into_inner(started_at);
+            metrics::BUCKET_METRICS
+                .req_seconds
+                .observe_elapsed(kind, inner, started_at);
+        }
+
+        match res {
+            Ok(Ok(_put)) => Ok(()),
+            Ok(Err(sdk)) => Err(sdk.into()),
+            Err(_timeout) => Err(TimeoutOrCancel::Timeout.into()),
+        }
    }

-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
        let kind = RequestKind::Copy;
-        let _guard = self.permit(kind).await;
+        let _permit = self.permit(kind, cancel).await?;
+
+        let timeout = tokio::time::sleep(self.timeout);

        let started_at = start_measuring_requests(kind);

@@ -554,14 +623,19 @@ impl RemoteStorage for S3Bucket {
            self.relative_path_to_s3_object(from)
        );

-        let res = self
+        let op = self
            .client
            .copy_object()
            .bucket(self.bucket_name.clone())
            .key(self.relative_path_to_s3_object(to))
            .copy_source(copy_source)
-            .send()
-            .await;
+            .send();
+
+        let res = tokio::select! {
+            res = op => res,
+            _ = timeout => return Err(TimeoutOrCancel::Timeout.into()),
+            _ = cancel.cancelled() => return Err(TimeoutOrCancel::Cancel.into()),
+        };

        let started_at = ScopeGuard::into_inner(started_at);
        metrics::BUCKET_METRICS
@@ -573,14 +647,21 @@ impl RemoteStorage for S3Bucket {
        Ok(())
    }

-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
        // if prefix is not none then download file `prefix/from`
        // if prefix is none then download file `from`
-        self.download_object(GetObjectRequest {
-            bucket: self.bucket_name.clone(),
-            key: self.relative_path_to_s3_object(from),
-            range: None,
-        })
+        self.download_object(
+            GetObjectRequest {
+                bucket: self.bucket_name.clone(),
+                key: self.relative_path_to_s3_object(from),
+                range: None,
+            },
+            cancel,
+        )
        .await
    }

@@ -589,6 +670,7 @@ impl RemoteStorage for S3Bucket {
        from: &RemotePath,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35
        // and needs both ends to be exclusive
@@ -598,31 +680,39 @@ impl RemoteStorage for S3Bucket {
            None => format!("bytes={start_inclusive}-"),
        });

-        self.download_object(GetObjectRequest {
-            bucket: self.bucket_name.clone(),
-            key: self.relative_path_to_s3_object(from),
-            range,
-        })
+        self.download_object(
+            GetObjectRequest {
+                bucket: self.bucket_name.clone(),
+                key: self.relative_path_to_s3_object(from),
+                range,
+            },
+            cancel,
+        )
        .await
    }
-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
-        let kind = RequestKind::Delete;
-        let _guard = self.permit(kind).await;

+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
+        let kind = RequestKind::Delete;
+        let permit = self.permit(kind, cancel).await?;
        let mut delete_objects = Vec::with_capacity(paths.len());
        for path in paths {
            let obj_id = ObjectIdentifier::builder()
                .set_key(Some(self.relative_path_to_s3_object(path)))
-                .build()?;
+                .build()
+                .context("convert path to oid")?;
            delete_objects.push(obj_id);
        }

-        self.delete_oids(kind, &delete_objects).await
+        self.delete_oids(&permit, &delete_objects, cancel).await
    }

-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
        let paths = std::array::from_ref(path);
-        self.delete_objects(paths).await
+        self.delete_objects(paths, cancel).await
    }

    async fn time_travel_recover(
@@ -633,7 +723,7 @@ impl RemoteStorage for S3Bucket {
        cancel: &CancellationToken,
    ) -> Result<(), TimeTravelError> {
        let kind = RequestKind::TimeTravel;
-        let _guard = self.permit(kind).await;
+        let permit = self.permit(kind, cancel).await?;

        let timestamp = DateTime::from(timestamp);
        let done_if_after = DateTime::from(done_if_after);
@@ -647,7 +737,7 @@ impl RemoteStorage for S3Bucket {

        let warn_threshold = 3;
        let max_retries = 10;
-        let is_permanent = |_e: &_| false;
+        let is_permanent = |e: &_| matches!(e, TimeTravelError::Cancelled);

        let mut key_marker = None;
        let mut version_id_marker = None;
@@ -656,15 +746,19 @@ impl RemoteStorage for S3Bucket {
        loop {
            let response = backoff::retry(
                || async {
-                    self.client
+                    let op = self
+                        .client
                        .list_object_versions()
                        .bucket(self.bucket_name.clone())
                        .set_prefix(prefix.clone())
                        .set_key_marker(key_marker.clone())
                        .set_version_id_marker(version_id_marker.clone())
-                        .send()
-                        .await
-                        .map_err(|e| TimeTravelError::Other(e.into()))
+                        .send();
+
+                    tokio::select! {
+                        res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
+                        _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
+                    }
                },
                is_permanent,
                warn_threshold,
@@ -786,14 +880,18 @@ impl RemoteStorage for S3Bucket {

                        backoff::retry(
                            || async {
-                                self.client
+                                let op = self
+                                    .client
                                    .copy_object()
                                    .bucket(self.bucket_name.clone())
                                    .key(key)
                                    .copy_source(&source_id)
-                                    .send()
-                                    .await
-                                    .map_err(|e| TimeTravelError::Other(e.into()))
+                                    .send();
+
+                                tokio::select! {
+                                    res = op => res.map_err(|e| TimeTravelError::Other(e.into())),
+                                    _ = cancel.cancelled() => Err(TimeTravelError::Cancelled),
+                                }
                            },
                            is_permanent,
                            warn_threshold,
@@ -824,10 +922,18 @@ impl RemoteStorage for S3Bucket {
                    let oid = ObjectIdentifier::builder()
                        .key(key.to_owned())
                        .build()
-                        .map_err(|e| TimeTravelError::Other(anyhow::Error::new(e)))?;
-                    self.delete_oids(kind, &[oid])
+                        .map_err(|e| TimeTravelError::Other(e.into()))?;
+
+                    self.delete_oids(&permit, &[oid], cancel)
                        .await
-                        .map_err(TimeTravelError::Other)?;
+                        .map_err(|e| {
+                            // delete_oid0 will use TimeoutOrCancel
+                            if TimeoutOrCancel::caused_by_cancel(&e) {
+                                TimeTravelError::Cancelled
+                            } else {
+                                TimeTravelError::Other(e)
+                            }
+                        })?;
                }
            }
        }
@@ -963,7 +1069,8 @@ mod tests {
                concurrency_limit: NonZeroUsize::new(100).unwrap(),
                max_keys_per_list_response: Some(5),
            };
-            let storage = S3Bucket::new(&config).expect("remote storage init");
+            let storage =
+                S3Bucket::new(&config, std::time::Duration::ZERO).expect("remote storage init");
            for (test_path_idx, test_path) in all_paths.iter().enumerate() {
                let result = storage.relative_path_to_s3_object(test_path);
                let expected = expected_outputs[prefix_idx][test_path_idx];
--- a/libs/remote_storage/src/simulate_failures.rs
+++ b/libs/remote_storage/src/simulate_failures.rs
@@ -90,11 +90,16 @@ impl UnreliableWrapper {
        }
    }

-    async fn delete_inner(&self, path: &RemotePath, attempt: bool) -> anyhow::Result<()> {
+    async fn delete_inner(
+        &self,
+        path: &RemotePath,
+        attempt: bool,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
        if attempt {
            self.attempt(RemoteOp::Delete(path.clone()))?;
        }
-        self.inner.delete(path).await
+        self.inner.delete(path, cancel).await
    }
 }

@@ -105,20 +110,22 @@ impl RemoteStorage for UnreliableWrapper {
    async fn list_prefixes(
        &self,
        prefix: Option<&RemotePath>,
+        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
            .map_err(DownloadError::Other)?;
-        self.inner.list_prefixes(prefix).await
+        self.inner.list_prefixes(prefix, cancel).await
    }

    async fn list_files(
        &self,
        folder: Option<&RemotePath>,
        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
    ) -> Result<Vec<RemotePath>, DownloadError> {
        self.attempt(RemoteOp::ListPrefixes(folder.cloned()))
            .map_err(DownloadError::Other)?;
-        self.inner.list_files(folder, max_keys).await
+        self.inner.list_files(folder, max_keys, cancel).await
    }

    async fn list(
@@ -126,10 +133,11 @@ impl RemoteStorage for UnreliableWrapper {
        prefix: Option<&RemotePath>,
        mode: ListingMode,
        max_keys: Option<NonZeroU32>,
+        cancel: &CancellationToken,
    ) -> Result<Listing, DownloadError> {
        self.attempt(RemoteOp::ListPrefixes(prefix.cloned()))
            .map_err(DownloadError::Other)?;
-        self.inner.list(prefix, mode, max_keys).await
+        self.inner.list(prefix, mode, max_keys, cancel).await
    }

    async fn upload(
@@ -140,15 +148,22 @@ impl RemoteStorage for UnreliableWrapper {
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
+        cancel: &CancellationToken,
    ) -> anyhow::Result<()> {
        self.attempt(RemoteOp::Upload(to.clone()))?;
-        self.inner.upload(data, data_size_bytes, to, metadata).await
+        self.inner
+            .upload(data, data_size_bytes, to, metadata, cancel)
+            .await
    }

-    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
+    async fn download(
+        &self,
+        from: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> Result<Download, DownloadError> {
        self.attempt(RemoteOp::Download(from.clone()))
            .map_err(DownloadError::Other)?;
-        self.inner.download(from).await
+        self.inner.download(from, cancel).await
    }

    async fn download_byte_range(
@@ -156,6 +171,7 @@ impl RemoteStorage for UnreliableWrapper {
        from: &RemotePath,
        start_inclusive: u64,
        end_exclusive: Option<u64>,
+        cancel: &CancellationToken,
    ) -> Result<Download, DownloadError> {
        // Note: We treat any download_byte_range as an "attempt" of the same
        // operation. We don't pay attention to the ranges. That's good enough
@@ -163,20 +179,24 @@ impl RemoteStorage for UnreliableWrapper {
        self.attempt(RemoteOp::Download(from.clone()))
            .map_err(DownloadError::Other)?;
        self.inner
-            .download_byte_range(from, start_inclusive, end_exclusive)
+            .download_byte_range(from, start_inclusive, end_exclusive, cancel)
            .await
    }

-    async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
-        self.delete_inner(path, true).await
+    async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> {
+        self.delete_inner(path, true, cancel).await
    }

-    async fn delete_objects<'a>(&self, paths: &'a [RemotePath]) -> anyhow::Result<()> {
+    async fn delete_objects<'a>(
+        &self,
+        paths: &'a [RemotePath],
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
        self.attempt(RemoteOp::DeleteObjects(paths.to_vec()))?;
        let mut error_counter = 0;
        for path in paths {
            // Dont record attempt because it was already recorded above
-            if (self.delete_inner(path, false).await).is_err() {
+            if (self.delete_inner(path, false, cancel).await).is_err() {
                error_counter += 1;
            }
        }
@@ -189,11 +209,16 @@ impl RemoteStorage for UnreliableWrapper {
        Ok(())
    }

-    async fn copy(&self, from: &RemotePath, to: &RemotePath) -> anyhow::Result<()> {
+    async fn copy(
+        &self,
+        from: &RemotePath,
+        to: &RemotePath,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<()> {
        // copy is equivalent to download + upload
        self.attempt(RemoteOp::Download(from.clone()))?;
        self.attempt(RemoteOp::Upload(to.clone()))?;
-        self.inner.copy_object(from, to).await
+        self.inner.copy_object(from, to, cancel).await
    }

    async fn time_travel_recover(
--- a/libs/remote_storage/src/support.rs
+++ b/libs/remote_storage/src/support.rs
@@ -1,9 +1,15 @@
 use std::{
+    future::Future,
    pin::Pin,
    task::{Context, Poll},
+    time::Duration,
 };

+use bytes::Bytes;
 use futures_util::Stream;
+use tokio_util::sync::CancellationToken;
+
+use crate::TimeoutOrCancel;

 pin_project_lite::pin_project! {
    /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
@@ -31,3 +37,133 @@ impl<S: Stream> Stream for PermitCarrying<S> {
        self.inner.size_hint()
    }
 }
+
+pin_project_lite::pin_project! {
+    pub(crate) struct DownloadStream<F, S> {
+        hit: bool,
+        #[pin]
+        cancellation: F,
+        #[pin]
+        inner: S,
+    }
+}
+
+impl<F, S> DownloadStream<F, S> {
+    pub(crate) fn new(cancellation: F, inner: S) -> Self {
+        Self {
+            cancellation,
+            hit: false,
+            inner,
+        }
+    }
+}
+
+/// See documentation on [`crate::DownloadStream`] on rationale why `std::io::Error` is used.
+impl<E, F, S> Stream for DownloadStream<F, S>
+where
+    std::io::Error: From<E>,
+    F: Future<Output = E>,
+    S: Stream<Item = std::io::Result<Bytes>>,
+{
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        let this = self.project();
+
+        if !*this.hit {
+            if let Poll::Ready(e) = this.cancellation.poll(cx) {
+                *this.hit = true;
+                let e = Err(std::io::Error::from(e));
+                return Poll::Ready(Some(e));
+            }
+        }
+
+        this.inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
+    }
+}
+
+/// Fires only on the first cancel or timeout, not on both.
+pub(crate) async fn cancel_or_timeout(
+    timeout: Duration,
+    cancel: CancellationToken,
+) -> TimeoutOrCancel {
+    tokio::select! {
+        _ = tokio::time::sleep(timeout) => TimeoutOrCancel::Timeout,
+        _ = cancel.cancelled() => TimeoutOrCancel::Cancel,
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use crate::DownloadError;
+    use futures::stream::StreamExt;
+
+    #[tokio::test(start_paused = true)]
+    async fn cancelled_download_stream() {
+        let inner = futures::stream::pending();
+        let timeout = Duration::from_secs(120);
+        let cancel = CancellationToken::new();
+
+        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
+        let mut stream = std::pin::pin!(stream);
+
+        let mut first = stream.next();
+
+        tokio::select! {
+            _ = &mut first => unreachable!("we haven't yet cancelled nor is timeout passed"),
+            _ = tokio::time::sleep(Duration::from_secs(1)) => {},
+        }
+
+        cancel.cancel();
+
+        let e = first.await.expect("there must be some").unwrap_err();
+        assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
+        let inner = e.get_ref().expect("inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
+            "{inner:?}"
+        );
+
+        tokio::select! {
+            _ = stream.next() => unreachable!("no timeout ever happens as we were already cancelled"),
+            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
+        }
+    }
+
+    #[tokio::test(start_paused = true)]
+    async fn timeouted_download_stream() {
+        let inner = futures::stream::pending();
+        let timeout = Duration::from_secs(120);
+        let cancel = CancellationToken::new();
+
+        let stream = DownloadStream::new(cancel_or_timeout(timeout, cancel.clone()), inner);
+        let mut stream = std::pin::pin!(stream);
+
+        // because the stream uses 120s timeout we are paused, we advance to 120s right away.
+        let first = stream.next();
+
+        let e = first.await.expect("there must be some").unwrap_err();
+        assert!(matches!(e.kind(), std::io::ErrorKind::Other), "{e:?}");
+        let inner = e.get_ref().expect("inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Timeout)),
+            "{inner:?}"
+        );
+
+        cancel.cancel();
+
+        tokio::select! {
+            _ = stream.next() => unreachable!("no cancellation ever happens because we already timed out"),
+            _ = tokio::time::sleep(Duration::from_secs(121)) => {},
+        }
+    }
+}
--- a/libs/remote_storage/tests/common/mod.rs
+++ b/libs/remote_storage/tests/common/mod.rs
@@ -10,6 +10,7 @@ use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{Download, GenericRemoteStorage, RemotePath};
 use tokio::task::JoinSet;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info};

 static LOGGING_DONE: OnceCell<()> = OnceCell::new();
@@ -58,8 +59,12 @@ pub(crate) async fn upload_simple_remote_data(
 ) -> ControlFlow<HashSet<RemotePath>, HashSet<RemotePath>> {
    info!("Creating {upload_tasks_count} remote files");
    let mut upload_tasks = JoinSet::new();
+    let cancel = CancellationToken::new();
+
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
+        let cancel = cancel.clone();
+
        upload_tasks.spawn(async move {
            let blob_path = PathBuf::from(format!("folder{}/blob_{}.txt", i / 7, i));
            let blob_path = RemotePath::new(
@@ -69,7 +74,9 @@ pub(crate) async fn upload_simple_remote_data(
            debug!("Creating remote item {i} at path {blob_path:?}");

            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, len, &blob_path, None).await?;
+            task_client
+                .upload(data, len, &blob_path, None, &cancel)
+                .await?;

            Ok::<_, anyhow::Error>(blob_path)
        });
@@ -107,13 +114,15 @@ pub(crate) async fn cleanup(
        "Removing {} objects from the remote storage during cleanup",
        objects_to_delete.len()
    );
+    let cancel = CancellationToken::new();
    let mut delete_tasks = JoinSet::new();
    for object_to_delete in objects_to_delete {
        let task_client = Arc::clone(client);
+        let cancel = cancel.clone();
        delete_tasks.spawn(async move {
            debug!("Deleting remote item at path {object_to_delete:?}");
            task_client
-                .delete(&object_to_delete)
+                .delete(&object_to_delete, &cancel)
                .await
                .with_context(|| format!("{object_to_delete:?} removal"))
        });
@@ -141,8 +150,12 @@ pub(crate) async fn upload_remote_data(
 ) -> ControlFlow<Uploads, Uploads> {
    info!("Creating {upload_tasks_count} remote files");
    let mut upload_tasks = JoinSet::new();
+    let cancel = CancellationToken::new();
+
    for i in 1..upload_tasks_count + 1 {
        let task_client = Arc::clone(client);
+        let cancel = cancel.clone();
+
        upload_tasks.spawn(async move {
            let prefix = format!("{base_prefix_str}/sub_prefix_{i}/");
            let blob_prefix = RemotePath::new(Utf8Path::new(&prefix))
@@ -152,7 +165,9 @@ pub(crate) async fn upload_remote_data(

            let (data, data_len) =
                upload_stream(format!("remote blob data {i}").into_bytes().into());
-            task_client.upload(data, data_len, &blob_path, None).await?;
+            task_client
+                .upload(data, data_len, &blob_path, None, &cancel)
+                .await?;

            Ok::<_, anyhow::Error>((blob_prefix, blob_path))
        });
--- a/libs/remote_storage/tests/common/tests.rs
+++ b/libs/remote_storage/tests/common/tests.rs
@@ -4,6 +4,7 @@ use remote_storage::RemotePath;
 use std::sync::Arc;
 use std::{collections::HashSet, num::NonZeroU32};
 use test_context::test_context;
+use tokio_util::sync::CancellationToken;
 use tracing::debug;

 use crate::common::{download_to_vec, upload_stream, wrap_stream};
@@ -45,13 +46,15 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
        }
    };

+    let cancel = CancellationToken::new();
+
    let test_client = Arc::clone(&ctx.enabled.client);
    let expected_remote_prefixes = ctx.remote_prefixes.clone();

    let base_prefix = RemotePath::new(Utf8Path::new(ctx.enabled.base_prefix))
        .context("common_prefix construction")?;
    let root_remote_prefixes = test_client
-        .list_prefixes(None)
+        .list_prefixes(None, &cancel)
        .await
        .context("client list root prefixes failure")?
        .into_iter()
@@ -62,7 +65,7 @@ async fn pagination_should_work(ctx: &mut MaybeEnabledStorageWithTestBlobs) -> a
    );

    let nested_remote_prefixes = test_client
-        .list_prefixes(Some(&base_prefix))
+        .list_prefixes(Some(&base_prefix), &cancel)
        .await
        .context("client list nested prefixes failure")?
        .into_iter()
@@ -99,11 +102,12 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
            anyhow::bail!("S3 init failed: {e:?}")
        }
    };
+    let cancel = CancellationToken::new();
    let test_client = Arc::clone(&ctx.enabled.client);
    let base_prefix =
        RemotePath::new(Utf8Path::new("folder1")).context("common_prefix construction")?;
    let root_files = test_client
-        .list_files(None, None)
+        .list_files(None, None, &cancel)
        .await
        .context("client list root files failure")?
        .into_iter()
@@ -117,13 +121,13 @@ async fn list_files_works(ctx: &mut MaybeEnabledStorageWithSimpleTestBlobs) -> a
    // Test that max_keys limit works. In total there are about 21 files (see
    // upload_simple_remote_data call in test_real_s3.rs).
    let limited_root_files = test_client
-        .list_files(None, Some(NonZeroU32::new(2).unwrap()))
+        .list_files(None, Some(NonZeroU32::new(2).unwrap()), &cancel)
        .await
        .context("client list root files failure")?;
    assert_eq!(limited_root_files.len(), 2);

    let nested_remote_files = test_client
-        .list_files(Some(&base_prefix), None)
+        .list_files(Some(&base_prefix), None, &cancel)
        .await
        .context("client list nested files failure")?
        .into_iter()
@@ -150,12 +154,17 @@ async fn delete_non_exising_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Resu
        MaybeEnabledStorage::Disabled => return Ok(()),
    };

+    let cancel = CancellationToken::new();
+
    let path = RemotePath::new(Utf8Path::new(
        format!("{}/for_sure_there_is_nothing_there_really", ctx.base_prefix).as_str(),
    ))
    .with_context(|| "RemotePath conversion")?;

-    ctx.client.delete(&path).await.expect("should succeed");
+    ctx.client
+        .delete(&path, &cancel)
+        .await
+        .expect("should succeed");

    Ok(())
 }
@@ -168,6 +177,8 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
        MaybeEnabledStorage::Disabled => return Ok(()),
    };

+    let cancel = CancellationToken::new();
+
    let path1 = RemotePath::new(Utf8Path::new(format!("{}/path1", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

@@ -178,21 +189,21 @@ async fn delete_objects_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<(
        .with_context(|| "RemotePath conversion")?;

    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-    ctx.client.upload(data, len, &path1, None).await?;
+    ctx.client.upload(data, len, &path1, None, &cancel).await?;

    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
-    ctx.client.upload(data, len, &path2, None).await?;
+    ctx.client.upload(data, len, &path2, None, &cancel).await?;

    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-    ctx.client.upload(data, len, &path3, None).await?;
+    ctx.client.upload(data, len, &path3, None, &cancel).await?;

-    ctx.client.delete_objects(&[path1, path2]).await?;
+    ctx.client.delete_objects(&[path1, path2], &cancel).await?;

-    let prefixes = ctx.client.list_prefixes(None).await?;
+    let prefixes = ctx.client.list_prefixes(None, &cancel).await?;

    assert_eq!(prefixes.len(), 1);

-    ctx.client.delete_objects(&[path3]).await?;
+    ctx.client.delete_objects(&[path3], &cancel).await?;

    Ok(())
 }
@@ -204,6 +215,8 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<
        return Ok(());
    };

+    let cancel = CancellationToken::new();
+
    let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
        .with_context(|| "RemotePath conversion")?;

@@ -211,47 +224,56 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<

    let (data, len) = wrap_stream(orig.clone());

-    ctx.client.upload(data, len, &path, None).await?;
+    ctx.client.upload(data, len, &path, None, &cancel).await?;

    // Normal download request
-    let dl = ctx.client.download(&path).await?;
+    let dl = ctx.client.download(&path, &cancel).await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);

    // Full range (end specified)
    let dl = ctx
        .client
-        .download_byte_range(&path, 0, Some(len as u64))
+        .download_byte_range(&path, 0, Some(len as u64), &cancel)
        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);

    // partial range (end specified)
-    let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 4, Some(10), &cancel)
+        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig[4..10]);

    // partial range (end beyond real end)
    let dl = ctx
        .client
-        .download_byte_range(&path, 8, Some(len as u64 * 100))
+        .download_byte_range(&path, 8, Some(len as u64 * 100), &cancel)
        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig[8..]);

    // Partial range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 4, None).await?;
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 4, None, &cancel)
+        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig[4..]);

    // Full range (end unspecified)
-    let dl = ctx.client.download_byte_range(&path, 0, None).await?;
+    let dl = ctx
+        .client
+        .download_byte_range(&path, 0, None, &cancel)
+        .await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);

    debug!("Cleanup: deleting file at path {path:?}");
    ctx.client
-        .delete(&path)
+        .delete(&path, &cancel)
        .await
        .with_context(|| format!("{path:?} removal"))?;

@@ -265,6 +287,8 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {
        return Ok(());
    };

+    let cancel = CancellationToken::new();
+
    let path = RemotePath::new(Utf8Path::new(
        format!("{}/file_to_copy", ctx.base_prefix).as_str(),
    ))
@@ -278,18 +302,18 @@ async fn copy_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result<()> {

    let (data, len) = wrap_stream(orig.clone());

-    ctx.client.upload(data, len, &path, None).await?;
+    ctx.client.upload(data, len, &path, None, &cancel).await?;

    // Normal download request
-    ctx.client.copy_object(&path, &path_dest).await?;
+    ctx.client.copy_object(&path, &path_dest, &cancel).await?;

-    let dl = ctx.client.download(&path_dest).await?;
+    let dl = ctx.client.download(&path_dest, &cancel).await?;
    let buf = download_to_vec(dl).await?;
    assert_eq!(&buf, &orig);

    debug!("Cleanup: deleting file at path {path:?}");
    ctx.client
-        .delete_objects(&[path.clone(), path_dest.clone()])
+        .delete_objects(&[path.clone(), path_dest.clone()], &cancel)
        .await
        .with_context(|| format!("{path:?} removal"))?;

--- a/libs/remote_storage/tests/test_real_azure.rs
+++ b/libs/remote_storage/tests/test_real_azure.rs
@@ -1,9 +1,9 @@
-use std::collections::HashSet;
 use std::env;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
 use std::time::UNIX_EPOCH;
+use std::{collections::HashSet, time::Duration};

 use anyhow::Context;
 use remote_storage::{
@@ -39,6 +39,17 @@ impl EnabledAzure {
            base_prefix: BASE_PREFIX,
        }
    }
+
+    #[allow(unused)] // this will be needed when moving the timeout integration tests back
+    fn configure_request_timeout(&mut self, timeout: Duration) {
+        match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
+            GenericRemoteStorage::AzureBlob(azure) => {
+                let azure = Arc::get_mut(azure).expect("inner Arc::get_mut");
+                azure.timeout = timeout;
+            }
+            _ => unreachable!(),
+        }
+    }
 }

 enum MaybeEnabledStorage {
@@ -213,6 +224,7 @@ fn create_azure_client(
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
        }),
+        timeout: Duration::from_secs(120),
    };
    Ok(Arc::new(
        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
--- a/libs/remote_storage/tests/test_real_s3.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -1,5 +1,6 @@
 use std::env;
 use std::fmt::{Debug, Display};
+use std::future::Future;
 use std::num::NonZeroUsize;
 use std::ops::ControlFlow;
 use std::sync::Arc;
@@ -9,9 +10,10 @@ use std::{collections::HashSet, time::SystemTime};
 use crate::common::{download_to_vec, upload_stream};
 use anyhow::Context;
 use camino::Utf8Path;
-use futures_util::Future;
+use futures_util::StreamExt;
 use remote_storage::{
-    GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
+    DownloadError, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
+    S3Config,
 };
 use test_context::test_context;
 use test_context::AsyncTestContext;
@@ -27,7 +29,6 @@ use common::{cleanup, ensure_logging_ready, upload_remote_data, upload_simple_re
 use utils::backoff;

 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
-
 const BASE_PREFIX: &str = "test";

 #[test_context(MaybeEnabledStorage)]
@@ -69,8 +70,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
        ret
    }

-    async fn list_files(client: &Arc<GenericRemoteStorage>) -> anyhow::Result<HashSet<RemotePath>> {
-        Ok(retry(|| client.list_files(None, None))
+    async fn list_files(
+        client: &Arc<GenericRemoteStorage>,
+        cancel: &CancellationToken,
+    ) -> anyhow::Result<HashSet<RemotePath>> {
+        Ok(retry(|| client.list_files(None, None, cancel))
            .await
            .context("list root files failure")?
            .into_iter()
@@ -90,11 +94,11 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:

    retry(|| {
        let (data, len) = upload_stream("remote blob data1".as_bytes().into());
-        ctx.client.upload(data, len, &path1, None)
+        ctx.client.upload(data, len, &path1, None, &cancel)
    })
    .await?;

-    let t0_files = list_files(&ctx.client).await?;
+    let t0_files = list_files(&ctx.client, &cancel).await?;
    let t0 = time_point().await;
    println!("at t0: {t0_files:?}");

@@ -102,17 +106,17 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:

    retry(|| {
        let (data, len) = upload_stream(old_data.as_bytes().into());
-        ctx.client.upload(data, len, &path2, None)
+        ctx.client.upload(data, len, &path2, None, &cancel)
    })
    .await?;

-    let t1_files = list_files(&ctx.client).await?;
+    let t1_files = list_files(&ctx.client, &cancel).await?;
    let t1 = time_point().await;
    println!("at t1: {t1_files:?}");

    // A little check to ensure that our clock is not too far off from the S3 clock
    {
-        let dl = retry(|| ctx.client.download(&path2)).await?;
+        let dl = retry(|| ctx.client.download(&path2, &cancel)).await?;
        let last_modified = dl.last_modified.unwrap();
        let half_wt = WAIT_TIME.mul_f32(0.5);
        let t0_hwt = t0 + half_wt;
@@ -125,7 +129,7 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:

    retry(|| {
        let (data, len) = upload_stream("remote blob data3".as_bytes().into());
-        ctx.client.upload(data, len, &path3, None)
+        ctx.client.upload(data, len, &path3, None, &cancel)
    })
    .await?;

@@ -133,12 +137,12 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:

    retry(|| {
        let (data, len) = upload_stream(new_data.as_bytes().into());
-        ctx.client.upload(data, len, &path2, None)
+        ctx.client.upload(data, len, &path2, None, &cancel)
    })
    .await?;

-    retry(|| ctx.client.delete(&path1)).await?;
-    let t2_files = list_files(&ctx.client).await?;
+    retry(|| ctx.client.delete(&path1, &cancel)).await?;
+    let t2_files = list_files(&ctx.client, &cancel).await?;
    let t2 = time_point().await;
    println!("at t2: {t2_files:?}");

@@ -147,10 +151,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    ctx.client
        .time_travel_recover(None, t2, t_final, &cancel)
        .await?;
-    let t2_files_recovered = list_files(&ctx.client).await?;
+    let t2_files_recovered = list_files(&ctx.client, &cancel).await?;
    println!("after recovery to t2: {t2_files_recovered:?}");
    assert_eq!(t2_files, t2_files_recovered);
-    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    let path2_recovered_t2 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
    assert_eq!(path2_recovered_t2, new_data.as_bytes());

    // after recovery to t1: path1 is back, path2 has the old content
@@ -158,10 +162,10 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    ctx.client
        .time_travel_recover(None, t1, t_final, &cancel)
        .await?;
-    let t1_files_recovered = list_files(&ctx.client).await?;
+    let t1_files_recovered = list_files(&ctx.client, &cancel).await?;
    println!("after recovery to t1: {t1_files_recovered:?}");
    assert_eq!(t1_files, t1_files_recovered);
-    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2).await?).await?;
+    let path2_recovered_t1 = download_to_vec(ctx.client.download(&path2, &cancel).await?).await?;
    assert_eq!(path2_recovered_t1, old_data.as_bytes());

    // after recovery to t0: everything is gone except for path1
@@ -169,14 +173,14 @@ async fn s3_time_travel_recovery_works(ctx: &mut MaybeEnabledStorage) -> anyhow:
    ctx.client
        .time_travel_recover(None, t0, t_final, &cancel)
        .await?;
-    let t0_files_recovered = list_files(&ctx.client).await?;
+    let t0_files_recovered = list_files(&ctx.client, &cancel).await?;
    println!("after recovery to t0: {t0_files_recovered:?}");
    assert_eq!(t0_files, t0_files_recovered);

    // cleanup

    let paths = &[path1, path2, path3];
-    retry(|| ctx.client.delete_objects(paths)).await?;
+    retry(|| ctx.client.delete_objects(paths, &cancel)).await?;

    Ok(())
 }
@@ -197,6 +201,16 @@ impl EnabledS3 {
            base_prefix: BASE_PREFIX,
        }
    }
+
+    fn configure_request_timeout(&mut self, timeout: Duration) {
+        match Arc::get_mut(&mut self.client).expect("outer Arc::get_mut") {
+            GenericRemoteStorage::AwsS3(s3) => {
+                let s3 = Arc::get_mut(s3).expect("inner Arc::get_mut");
+                s3.timeout = timeout;
+            }
+            _ => unreachable!(),
+        }
+    }
 }

 enum MaybeEnabledStorage {
@@ -370,8 +384,169 @@ fn create_s3_client(
            concurrency_limit: NonZeroUsize::new(100).unwrap(),
            max_keys_per_list_response,
        }),
+        timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
    };
    Ok(Arc::new(
        GenericRemoteStorage::from_config(&remote_storage_config).context("remote storage init")?,
    ))
 }
+
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn download_is_timeouted(ctx: &mut MaybeEnabledStorage) {
+    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
+        return;
+    };
+
+    let cancel = CancellationToken::new();
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/file_to_copy", ctx.base_prefix).as_str(),
+    ))
+    .unwrap();
+
+    let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
+
+    let timeout = std::time::Duration::from_secs(5);
+
+    ctx.configure_request_timeout(timeout);
+
+    let started_at = std::time::Instant::now();
+    let mut stream = ctx
+        .client
+        .download(&path, &cancel)
+        .await
+        .expect("download succeeds")
+        .download_stream;
+
+    if started_at.elapsed().mul_f32(0.9) >= timeout {
+        tracing::warn!(
+            elapsed_ms = started_at.elapsed().as_millis(),
+            "timeout might be too low, consumed most of it during headers"
+        );
+    }
+
+    let first = stream
+        .next()
+        .await
+        .expect("should have the first blob")
+        .expect("should have succeeded");
+
+    tracing::info!(len = first.len(), "downloaded first chunk");
+
+    assert!(
+        first.len() < len,
+        "uploaded file is too small, we downloaded all on first chunk"
+    );
+
+    tokio::time::sleep(timeout).await;
+
+    {
+        let started_at = std::time::Instant::now();
+        let next = stream
+            .next()
+            .await
+            .expect("stream should not have ended yet");
+
+        tracing::info!(
+            next.is_err = next.is_err(),
+            elapsed_ms = started_at.elapsed().as_millis(),
+            "received item after timeout"
+        );
+
+        let e = next.expect_err("expected an error, but got a chunk?");
+
+        let inner = e.get_ref().expect("std::io::Error::inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Timeout)),
+            "{inner:?}"
+        );
+    }
+
+    ctx.configure_request_timeout(RemoteStorageConfig::DEFAULT_TIMEOUT);
+
+    ctx.client.delete_objects(&[path], &cancel).await.unwrap()
+}
+
+#[test_context(MaybeEnabledStorage)]
+#[tokio::test]
+async fn download_is_cancelled(ctx: &mut MaybeEnabledStorage) {
+    let MaybeEnabledStorage::Enabled(ctx) = ctx else {
+        return;
+    };
+
+    let cancel = CancellationToken::new();
+
+    let path = RemotePath::new(Utf8Path::new(
+        format!("{}/file_to_copy", ctx.base_prefix).as_str(),
+    ))
+    .unwrap();
+
+    let len = upload_large_enough_file(&ctx.client, &path, &cancel).await;
+
+    {
+        let mut stream = ctx
+            .client
+            .download(&path, &cancel)
+            .await
+            .expect("download succeeds")
+            .download_stream;
+
+        let first = stream
+            .next()
+            .await
+            .expect("should have the first blob")
+            .expect("should have succeeded");
+
+        tracing::info!(len = first.len(), "downloaded first chunk");
+
+        assert!(
+            first.len() < len,
+            "uploaded file is too small, we downloaded all on first chunk"
+        );
+
+        cancel.cancel();
+
+        let next = stream.next().await.expect("stream should have more");
+
+        let e = next.expect_err("expected an error, but got a chunk?");
+
+        let inner = e.get_ref().expect("std::io::Error::inner should be set");
+        assert!(
+            inner
+                .downcast_ref::<DownloadError>()
+                .is_some_and(|e| matches!(e, DownloadError::Cancelled)),
+            "{inner:?}"
+        );
+    }
+
+    let cancel = CancellationToken::new();
+
+    ctx.client.delete_objects(&[path], &cancel).await.unwrap();
+}
+
+/// Upload a long enough file so that we cannot download it in single chunk
+///
+/// For s3 the first chunk seems to be less than 10kB, so this has a bit of a safety margin
+async fn upload_large_enough_file(
+    client: &GenericRemoteStorage,
+    path: &RemotePath,
+    cancel: &CancellationToken,
+) -> usize {
+    let header = bytes::Bytes::from_static("remote blob data content".as_bytes());
+    let body = bytes::Bytes::from(vec![0u8; 1024]);
+    let contents = std::iter::once(header).chain(std::iter::repeat(body).take(128));
+
+    let len = contents.clone().fold(0, |acc, next| acc + next.len());
+
+    let contents = futures::stream::iter(contents.map(std::io::Result::Ok));
+
+    client
+        .upload(contents, len, path, None, cancel)
+        .await
+        .expect("upload succeeds");
+
+    len
+}
--- a/libs/utils/Cargo.toml
+++ b/libs/utils/Cargo.toml
@@ -25,6 +25,7 @@ hyper = { workspace = true, features = ["full"] }
 fail.workspace = true
 futures = { workspace = true}
 jsonwebtoken.workspace = true
+leaky-bucket.workspace = true
 nix.workspace = true
 once_cell.workspace = true
 pin-project-lite.workspace = true
--- a/libs/utils/benches/benchmarks.rs
+++ b/libs/utils/benches/benchmarks.rs
@@ -1,5 +1,3 @@
-#![allow(unused)]
-
 use criterion::{criterion_group, criterion_main, Criterion};
 use utils::id;

--- a/libs/utils/src/auth.rs
+++ b/libs/utils/src/auth.rs
@@ -29,6 +29,9 @@ pub enum Scope {
    // Should only be used e.g. for status check.
    // Currently also used for connection from any pageserver to any safekeeper.
    SafekeeperData,
+    // The scope used by pageservers in upcalls to storage controller and cloud control plane
+    #[serde(rename = "generations_api")]
+    GenerationsApi,
 }

 /// JWT payload. See docs/authentication.md for the format
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -12,6 +12,7 @@ testing = ["fail/failpoints"]

 [dependencies]
 anyhow.workspace = true
+arc-swap.workspace = true
 async-compression.workspace = true
 async-stream.workspace = true
 async-trait.workspace = true
@@ -35,6 +36,7 @@ humantime.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
+leaky-bucket.workspace = true
 md5.workspace = true
 nix.workspace = true
 # hack to get the number of worker threads tokio uses
@@ -82,7 +84,7 @@ workspace_hack.workspace = true
 reqwest.workspace = true
 rpds.workspace = true
 enum-map.workspace = true
-enumset.workspace = true
+enumset = { workspace = true, features = ["serde"]}
 strum.workspace = true
 strum_macros.workspace = true

--- a/pageserver/benches/bench_walredo.rs
+++ b/pageserver/benches/bench_walredo.rs
@@ -6,14 +6,28 @@
 //! There are two sets of inputs; `short` and `medium`. They were collected on postgres v14 by
 //! logging what happens when a sequential scan is requested on a small table, then picking out two
 //! suitable from logs.
+//!
+//!
+//! Reference data (git blame to see commit) on an i3en.3xlarge
+// ```text
+//! short/short/1           time:   [39.175 µs 39.348 µs 39.536 µs]
+//! short/short/2           time:   [51.227 µs 51.487 µs 51.755 µs]
+//! short/short/4           time:   [76.048 µs 76.362 µs 76.674 µs]
+//! short/short/8           time:   [128.94 µs 129.82 µs 130.74 µs]
+//! short/short/16          time:   [227.84 µs 229.00 µs 230.28 µs]
+//! short/short/32          time:   [455.97 µs 457.81 µs 459.90 µs]
+//! short/short/64          time:   [902.46 µs 904.84 µs 907.32 µs]
+//! short/short/128         time:   [1.7416 ms 1.7487 ms 1.7561 ms]
+//! ``

-use std::sync::{Arc, Barrier};
+use std::sync::Arc;

 use bytes::{Buf, Bytes};
 use pageserver::{
    config::PageServerConf, repository::Key, walrecord::NeonWalRecord, walredo::PostgresRedoManager,
 };
 use pageserver_api::shard::TenantShardId;
+use tokio::task::JoinSet;
 use utils::{id::TenantId, lsn::Lsn};

 use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion};
@@ -39,11 +53,11 @@ fn redo_scenarios(c: &mut Criterion) {
            .build()
            .unwrap();
        tracing::info!("executing first");
-        short().execute(rt.handle(), &manager).unwrap();
+        rt.block_on(short().execute(&manager)).unwrap();
        tracing::info!("first executed");
    }

-    let thread_counts = [1, 2, 4, 8, 16];
+    let thread_counts = [1, 2, 4, 8, 16, 32, 64, 128];

    let mut group = c.benchmark_group("short");
    group.sampling_mode(criterion::SamplingMode::Flat);
@@ -74,114 +88,69 @@ fn redo_scenarios(c: &mut Criterion) {
    drop(group);
 }

-/// Sets up `threads` number of requesters to `request_redo`, with the given input.
+/// Sets up a multi-threaded tokio runtime with default worker thread count,
+/// then, spawn `requesters` tasks that repeatedly:
+/// - get input from `input_factor()`
+/// - call `manager.request_redo()` with their input
+///
+/// This stress-tests the scalability of a single walredo manager at high tokio-level concurrency.
+///
+/// Using tokio's default worker thread count means the results will differ on machines
+/// with different core countrs. We don't care about that, the performance will always
+/// be different on different hardware. To compare performance of different software versions,
+/// use the same hardware.
 fn add_multithreaded_walredo_requesters(
    b: &mut criterion::Bencher,
-    threads: u32,
+    nrequesters: usize,
    manager: &Arc<PostgresRedoManager>,
    input_factory: fn() -> Request,
 ) {
-    assert_ne!(threads, 0);
+    assert_ne!(nrequesters, 0);

-    if threads == 1 {
-        let rt = tokio::runtime::Builder::new_current_thread()
-            .enable_all()
-            .build()
-            .unwrap();
-        let handle = rt.handle();
-        b.iter_batched_ref(
-            || Some(input_factory()),
-            |input| execute_all(input.take(), handle, manager),
-            criterion::BatchSize::PerIteration,
-        );
-    } else {
-        let (work_tx, work_rx) = std::sync::mpsc::sync_channel(threads as usize);
+    let rt = tokio::runtime::Builder::new_multi_thread()
+        .enable_all()
+        .build()
+        .unwrap();

-        let work_rx = std::sync::Arc::new(std::sync::Mutex::new(work_rx));
+    let barrier = Arc::new(tokio::sync::Barrier::new(nrequesters + 1));

-        let barrier = Arc::new(Barrier::new(threads as usize + 1));
-
-        let jhs = (0..threads)
-            .map(|_| {
-                std::thread::spawn({
-                    let manager = manager.clone();
-                    let barrier = barrier.clone();
-                    let work_rx = work_rx.clone();
-                    move || {
-                        let rt = tokio::runtime::Builder::new_current_thread()
-                            .enable_all()
-                            .build()
-                            .unwrap();
-                        let handle = rt.handle();
-                        loop {
-                            // queue up and wait if we want to go another round
-                            if work_rx.lock().unwrap().recv().is_err() {
-                                break;
-                            }
-
-                            let input = Some(input_factory());
-
-                            barrier.wait();
-
-                            execute_all(input, handle, &manager).unwrap();
-
-                            barrier.wait();
-                        }
-                    }
-                })
-            })
-            .collect::<Vec<_>>();
-
-        let _jhs = JoinOnDrop(jhs);
-
-        b.iter_batched(
-            || {
-                for _ in 0..threads {
-                    work_tx.send(()).unwrap()
-                }
-            },
-            |()| {
-                // start the work
-                barrier.wait();
-
-                // wait for work to complete
-                barrier.wait();
-            },
-            criterion::BatchSize::PerIteration,
-        );
-
-        drop(work_tx);
+    let mut requesters = JoinSet::new();
+    for _ in 0..nrequesters {
+        let _entered = rt.enter();
+        let manager = manager.clone();
+        let barrier = barrier.clone();
+        requesters.spawn(async move {
+            loop {
+                let input = input_factory();
+                barrier.wait().await;
+                let page = input.execute(&manager).await.unwrap();
+                assert_eq!(page.remaining(), 8192);
+                barrier.wait().await;
+            }
+        });
    }
-}

-struct JoinOnDrop(Vec<std::thread::JoinHandle<()>>);
+    let do_one_iteration = || {
+        rt.block_on(async {
+            barrier.wait().await;
+            // wait for work to complete
+            barrier.wait().await;
+        })
+    };

-impl Drop for JoinOnDrop {
-    // it's not really needless because we want join all then check for panicks
-    #[allow(clippy::needless_collect)]
-    fn drop(&mut self) {
-        // first join all
-        let results = self.0.drain(..).map(|jh| jh.join()).collect::<Vec<_>>();
-        // then check the results; panicking here is not great, but it does get the message across
-        // to the user, and sets an exit value.
-        results.into_iter().try_for_each(|res| res).unwrap();
-    }
-}
+    b.iter_batched(
+        || {
+            // warmup
+            do_one_iteration();
+        },
+        |()| {
+            // work loop
+            do_one_iteration();
+        },
+        criterion::BatchSize::PerIteration,
+    );

-fn execute_all<I>(
-    input: I,
-    handle: &tokio::runtime::Handle,
-    manager: &PostgresRedoManager,
-) -> anyhow::Result<()>
-where
-    I: IntoIterator<Item = Request>,
-{
-    // just fire all requests as fast as possible
-    input.into_iter().try_for_each(|req| {
-        let page = req.execute(handle, manager)?;
-        assert_eq!(page.remaining(), 8192);
-        anyhow::Ok(())
-    })
+    rt.block_on(requesters.shutdown());
 }

 criterion_group!(benches, redo_scenarios);
@@ -493,11 +462,7 @@ struct Request {
 }

 impl Request {
-    fn execute(
-        self,
-        rt: &tokio::runtime::Handle,
-        manager: &PostgresRedoManager,
-    ) -> anyhow::Result<Bytes> {
+    async fn execute(self, manager: &PostgresRedoManager) -> anyhow::Result<Bytes> {
        let Request {
            key,
            lsn,
@@ -506,6 +471,8 @@ impl Request {
            pg_version,
        } = self;

-        rt.block_on(manager.request_redo(key, lsn, base_img, records, pg_version))
+        manager
+            .request_redo(key, lsn, base_img, records, pg_version)
+            .await
    }
 }
--- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
+++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs
@@ -1,6 +1,5 @@
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use futures::future::join_all;
 use pageserver_api::key::{is_rel_block_key, key_to_rel_block, Key};
 use pageserver_api::keyspace::KeySpaceAccum;
 use pageserver_api::models::PagestreamGetPageRequest;
@@ -10,11 +9,10 @@ use utils::id::TenantTimelineId;
 use utils::lsn::Lsn;

 use rand::prelude::*;
-use tokio::sync::Barrier;
 use tokio::task::JoinSet;
-use tracing::{info, instrument};
+use tracing::info;

-use std::collections::{HashMap, HashSet};
+use std::collections::HashSet;
 use std::future::Future;
 use std::num::NonZeroUsize;
 use std::pin::Pin;
@@ -38,8 +36,12 @@ pub(crate) struct Args {
    num_clients: NonZeroUsize,
    #[clap(long)]
    runtime: Option<humantime::Duration>,
+    /// Each client sends requests at the given rate.
+    ///
+    /// If a request takes too long and we should be issuing a new request already,
+    /// we skip that request and account it as `MISSED`.
    #[clap(long)]
-    per_target_rate_limit: Option<usize>,
+    per_client_rate: Option<usize>,
    /// Probability for sending `latest=true` in the request (uniform distribution).
    #[clap(long, default_value = "1")]
    req_latest_probability: f64,
@@ -61,12 +63,16 @@ pub(crate) struct Args {
 #[derive(Debug, Default)]
 struct LiveStats {
    completed_requests: AtomicU64,
+    missed: AtomicU64,
 }

 impl LiveStats {
-    fn inc(&self) {
+    fn request_done(&self) {
        self.completed_requests.fetch_add(1, Ordering::Relaxed);
    }
+    fn missed(&self, n: u64) {
+        self.missed.fetch_add(n, Ordering::Relaxed);
+    }
 }

 #[derive(Clone, serde::Serialize, serde::Deserialize)]
@@ -220,13 +226,12 @@ async fn main_impl(

    let live_stats = Arc::new(LiveStats::default());

-    let num_client_tasks = args.num_clients.get() * timelines.len();
    let num_live_stats_dump = 1;
-    let num_work_sender_tasks = 1;
+    let num_work_sender_tasks = args.num_clients.get() * timelines.len();
    let num_main_impl = 1;

    let start_work_barrier = Arc::new(tokio::sync::Barrier::new(
-        num_client_tasks + num_live_stats_dump + num_work_sender_tasks + num_main_impl,
+        num_live_stats_dump + num_work_sender_tasks + num_main_impl,
    ));

    tokio::spawn({
@@ -238,10 +243,12 @@ async fn main_impl(
                let start = std::time::Instant::now();
                tokio::time::sleep(std::time::Duration::from_secs(1)).await;
                let completed_requests = stats.completed_requests.swap(0, Ordering::Relaxed);
+                let missed = stats.missed.swap(0, Ordering::Relaxed);
                let elapsed = start.elapsed();
                info!(
-                    "RPS: {:.0}",
-                    completed_requests as f64 / elapsed.as_secs_f64()
+                    "RPS: {:.0}   MISSED: {:.0}",
+                    completed_requests as f64 / elapsed.as_secs_f64(),
+                    missed as f64 / elapsed.as_secs_f64()
                );
            }
        }
@@ -249,127 +256,105 @@ async fn main_impl(

    let cancel = CancellationToken::new();

-    let mut work_senders: HashMap<WorkerId, _> = HashMap::new();
-    let mut tasks = Vec::new();
+    let rps_period = args
+        .per_client_rate
+        .map(|rps_limit| Duration::from_secs_f64(1.0 / (rps_limit as f64)));
+    let make_worker: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> = &|worker_id| {
+        let live_stats = live_stats.clone();
+        let start_work_barrier = start_work_barrier.clone();
+        let ranges: Vec<KeyRange> = all_ranges
+            .iter()
+            .filter(|r| r.timeline == worker_id.timeline)
+            .cloned()
+            .collect();
+        let weights =
+            rand::distributions::weighted::WeightedIndex::new(ranges.iter().map(|v| v.len()))
+                .unwrap();
+
+        let cancel = cancel.clone();
+        Box::pin(async move {
+            let client =
+                pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
+                    .await
+                    .unwrap();
+            let mut client = client
+                .pagestream(worker_id.timeline.tenant_id, worker_id.timeline.timeline_id)
+                .await
+                .unwrap();
+
+            start_work_barrier.wait().await;
+            let client_start = Instant::now();
+            let mut ticks_processed = 0;
+            while !cancel.is_cancelled() {
+                // Detect if a request took longer than the RPS rate
+                if let Some(period) = &rps_period {
+                    let periods_passed_until_now =
+                        usize::try_from(client_start.elapsed().as_micros() / period.as_micros())
+                            .unwrap();
+
+                    if periods_passed_until_now > ticks_processed {
+                        live_stats.missed((periods_passed_until_now - ticks_processed) as u64);
+                    }
+                    ticks_processed = periods_passed_until_now;
+                }
+
+                let start = Instant::now();
+                let req = {
+                    let mut rng = rand::thread_rng();
+                    let r = &ranges[weights.sample(&mut rng)];
+                    let key: i128 = rng.gen_range(r.start..r.end);
+                    let key = Key::from_i128(key);
+                    assert!(is_rel_block_key(&key));
+                    let (rel_tag, block_no) =
+                        key_to_rel_block(key).expect("we filter non-rel-block keys out above");
+                    PagestreamGetPageRequest {
+                        latest: rng.gen_bool(args.req_latest_probability),
+                        lsn: r.timeline_lsn,
+                        rel: rel_tag,
+                        blkno: block_no,
+                    }
+                };
+                client.getpage(req).await.unwrap();
+                let end = Instant::now();
+                live_stats.request_done();
+                ticks_processed += 1;
+                STATS.with(|stats| {
+                    stats
+                        .borrow()
+                        .lock()
+                        .unwrap()
+                        .observe(end.duration_since(start))
+                        .unwrap();
+                });
+
+                if let Some(period) = &rps_period {
+                    let next_at = client_start
+                        + Duration::from_micros(
+                            (ticks_processed) as u64 * u64::try_from(period.as_micros()).unwrap(),
+                        );
+                    tokio::time::sleep_until(next_at.into()).await;
+                }
+            }
+        })
+    };
+
+    info!("spawning workers");
+    let mut workers = JoinSet::new();
    for timeline in timelines.iter().cloned() {
        for num_client in 0..args.num_clients.get() {
-            let (sender, receiver) = tokio::sync::mpsc::channel(10); // TODO: not sure what the implications of this are
            let worker_id = WorkerId {
                timeline,
                num_client,
            };
-            work_senders.insert(worker_id, sender);
-            tasks.push(tokio::spawn(client(
-                args,
-                worker_id,
-                Arc::clone(&start_work_barrier),
-                receiver,
-                Arc::clone(&live_stats),
-                cancel.clone(),
-            )));
+            workers.spawn(make_worker(worker_id));
        }
    }
-
-    let work_sender: Pin<Box<dyn Send + Future<Output = ()>>> = {
-        let start_work_barrier = start_work_barrier.clone();
-        let cancel = cancel.clone();
-        match args.per_target_rate_limit {
-            None => Box::pin(async move {
-                let weights = rand::distributions::weighted::WeightedIndex::new(
-                    all_ranges.iter().map(|v| v.len()),
-                )
-                .unwrap();
-
-                start_work_barrier.wait().await;
-
-                while !cancel.is_cancelled() {
-                    let (timeline, req) = {
-                        let mut rng = rand::thread_rng();
-                        let r = &all_ranges[weights.sample(&mut rng)];
-                        let key: i128 = rng.gen_range(r.start..r.end);
-                        let key = Key::from_i128(key);
-                        let (rel_tag, block_no) =
-                            key_to_rel_block(key).expect("we filter non-rel-block keys out above");
-                        (
-                            WorkerId {
-                                timeline: r.timeline,
-                                num_client: rng.gen_range(0..args.num_clients.get()),
-                            },
-                            PagestreamGetPageRequest {
-                                latest: rng.gen_bool(args.req_latest_probability),
-                                lsn: r.timeline_lsn,
-                                rel: rel_tag,
-                                blkno: block_no,
-                            },
-                        )
-                    };
-                    let sender = work_senders.get(&timeline).unwrap();
-                    // TODO: what if this blocks?
-                    if sender.send(req).await.is_err() {
-                        assert!(cancel.is_cancelled(), "client has gone away unexpectedly");
-                    }
-                }
-            }),
-            Some(rps_limit) => Box::pin(async move {
-                let period = Duration::from_secs_f64(1.0 / (rps_limit as f64));
-                let make_task: &dyn Fn(WorkerId) -> Pin<Box<dyn Send + Future<Output = ()>>> =
-                    &|worker_id| {
-                        let sender = work_senders.get(&worker_id).unwrap();
-                        let ranges: Vec<KeyRange> = all_ranges
-                            .iter()
-                            .filter(|r| r.timeline == worker_id.timeline)
-                            .cloned()
-                            .collect();
-                        let weights = rand::distributions::weighted::WeightedIndex::new(
-                            ranges.iter().map(|v| v.len()),
-                        )
-                        .unwrap();
-
-                        let cancel = cancel.clone();
-                        Box::pin(async move {
-                            let mut ticker = tokio::time::interval(period);
-                            ticker.set_missed_tick_behavior(
-                                /* TODO review this choice */
-                                tokio::time::MissedTickBehavior::Burst,
-                            );
-                            while !cancel.is_cancelled() {
-                                ticker.tick().await;
-                                let req = {
-                                    let mut rng = rand::thread_rng();
-                                    let r = &ranges[weights.sample(&mut rng)];
-                                    let key: i128 = rng.gen_range(r.start..r.end);
-                                    let key = Key::from_i128(key);
-                                    assert!(is_rel_block_key(&key));
-                                    let (rel_tag, block_no) = key_to_rel_block(key)
-                                        .expect("we filter non-rel-block keys out above");
-                                    PagestreamGetPageRequest {
-                                        latest: rng.gen_bool(args.req_latest_probability),
-                                        lsn: r.timeline_lsn,
-                                        rel: rel_tag,
-                                        blkno: block_no,
-                                    }
-                                };
-                                if sender.send(req).await.is_err() {
-                                    assert!(
-                                        cancel.is_cancelled(),
-                                        "client has gone away unexpectedly"
-                                    );
-                                }
-                            }
-                        })
-                    };
-
-                let tasks: Vec<_> = work_senders.keys().map(|tl| make_task(*tl)).collect();
-
-                start_work_barrier.wait().await;
-
-                join_all(tasks).await;
-            }),
+    let workers = async move {
+        while let Some(res) = workers.join_next().await {
+            res.unwrap();
        }
    };

-    let work_sender_task = tokio::spawn(work_sender);
-
    info!("waiting for everything to become ready");
    start_work_barrier.wait().await;
    info!("work started");
@@ -377,20 +362,13 @@ async fn main_impl(
        tokio::time::sleep(runtime.into()).await;
        info!("runtime over, signalling cancellation");
        cancel.cancel();
-        work_sender_task.await.unwrap();
+        workers.await;
        info!("work sender exited");
    } else {
-        work_sender_task.await.unwrap();
+        workers.await;
        unreachable!("work sender never terminates");
    }

-    info!("joining clients");
-    for t in tasks {
-        t.await.unwrap();
-    }
-
-    info!("all clients stopped");
-
    let output = Output {
        total: {
            let mut agg_stats = request_stats::Stats::new();
@@ -407,49 +385,3 @@ async fn main_impl(

    anyhow::Ok(())
 }
-
-#[instrument(skip_all)]
-async fn client(
-    args: &'static Args,
-    id: WorkerId,
-    start_work_barrier: Arc<Barrier>,
-    mut work: tokio::sync::mpsc::Receiver<PagestreamGetPageRequest>,
-    live_stats: Arc<LiveStats>,
-    cancel: CancellationToken,
-) {
-    let WorkerId {
-        timeline,
-        num_client: _,
-    } = id;
-    let client = pageserver_client::page_service::Client::new(args.page_service_connstring.clone())
-        .await
-        .unwrap();
-    let mut client = client
-        .pagestream(timeline.tenant_id, timeline.timeline_id)
-        .await
-        .unwrap();
-
-    let do_requests = async {
-        start_work_barrier.wait().await;
-        while let Some(req) = work.recv().await {
-            let start = Instant::now();
-            client
-                .getpage(req)
-                .await
-                .with_context(|| format!("getpage for {timeline}"))
-                .unwrap();
-            let elapsed = start.elapsed();
-            live_stats.inc();
-            STATS.with(|stats| {
-                stats.borrow().lock().unwrap().observe(elapsed).unwrap();
-            });
-        }
-    };
-    tokio::select! {
-        res = do_requests => { res },
-        _ = cancel.cancelled() => {
-            // fallthrough to shutdown
-        }
-    }
-    client.shutdown().await;
-}
--- a/pageserver/src/auth.rs
+++ b/pageserver/src/auth.rs
@@ -14,8 +14,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
        }
        (Scope::PageServerApi, None) => Ok(()), // access to management api for PageServerApi scope
        (Scope::PageServerApi, Some(_)) => Ok(()), // access to tenant api using PageServerApi scope
-        (Scope::SafekeeperData, _) => Err(AuthError(
-            "SafekeeperData scope makes no sense for Pageserver".into(),
+        (Scope::SafekeeperData | Scope::GenerationsApi, _) => Err(AuthError(
+            format!(
+                "JWT scope '{:?}' is ineligible for Pageserver auth",
+                claims.scope
+            )
+            .into(),
        )),
    }
 }
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -1359,6 +1359,7 @@ broker_endpoint = '{broker_endpoint}'
                parsed_remote_storage_config,
                RemoteStorageConfig {
                    storage: RemoteStorageKind::LocalFs(local_storage_path.clone()),
+                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
                "Remote storage config should correctly parse the local FS config and fill other storage defaults"
            );
@@ -1426,6 +1427,7 @@ broker_endpoint = '{broker_endpoint}'
                        concurrency_limit: s3_concurrency_limit,
                        max_keys_per_list_response: None,
                    }),
+                    timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
                },
                "Remote storage config should correctly parse the S3 config"
            );
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -834,7 +834,6 @@ mod test {
    }

    impl ControlPlaneGenerationsApi for MockControlPlane {
-        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
            unimplemented!()
        }
@@ -868,6 +867,7 @@ mod test {
        let remote_fs_dir = harness.conf.workdir.join("remote_fs").canonicalize_utf8()?;
        let storage_config = RemoteStorageConfig {
            storage: RemoteStorageKind::LocalFs(remote_fs_dir.clone()),
+            timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
        };
        let storage = GenericRemoteStorage::from_config(&storage_config).unwrap();

@@ -1171,6 +1171,7 @@ pub(crate) mod mock {
    pub struct ConsumerState {
        rx: tokio::sync::mpsc::UnboundedReceiver<ListWriterQueueMessage>,
        executor_rx: tokio::sync::mpsc::Receiver<DeleterMessage>,
+        cancel: CancellationToken,
    }

    impl ConsumerState {
@@ -1184,7 +1185,7 @@ pub(crate) mod mock {
                match msg {
                    DeleterMessage::Delete(objects) => {
                        for path in objects {
-                            match remote_storage.delete(&path).await {
+                            match remote_storage.delete(&path, &self.cancel).await {
                                Ok(_) => {
                                    debug!("Deleted {path}");
                                }
@@ -1217,7 +1218,7 @@ pub(crate) mod mock {

                        for path in objects {
                            info!("Executing deletion {path}");
-                            match remote_storage.delete(&path).await {
+                            match remote_storage.delete(&path, &self.cancel).await {
                                Ok(_) => {
                                    debug!("Deleted {path}");
                                }
@@ -1267,7 +1268,11 @@ pub(crate) mod mock {
                executor_tx,
                executed,
                remote_storage,
-                consumer: std::sync::Mutex::new(ConsumerState { rx, executor_rx }),
+                consumer: std::sync::Mutex::new(ConsumerState {
+                    rx,
+                    executor_rx,
+                    cancel: CancellationToken::new(),
+                }),
                lsn_table: Arc::new(std::sync::RwLock::new(VisibleLsnUpdates::new())),
            }
        }
--- a/pageserver/src/deletion_queue/deleter.rs
+++ b/pageserver/src/deletion_queue/deleter.rs
@@ -8,6 +8,7 @@

 use remote_storage::GenericRemoteStorage;
 use remote_storage::RemotePath;
+use remote_storage::TimeoutOrCancel;
 use remote_storage::MAX_KEYS_PER_DELETE;
 use std::time::Duration;
 use tokio_util::sync::CancellationToken;
@@ -71,9 +72,11 @@ impl Deleter {
                    Err(anyhow::anyhow!("failpoint: deletion-queue-before-execute"))
                });

-                self.remote_storage.delete_objects(&self.accumulator).await
+                self.remote_storage
+                    .delete_objects(&self.accumulator, &self.cancel)
+                    .await
            },
-            |_| false,
+            TimeoutOrCancel::caused_by_cancel,
            3,
            10,
            "executing deletion batch",
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -351,7 +351,6 @@ pub enum IterationOutcome<U> {
    Finished(IterationOutcomeFinished<U>),
 }

-#[allow(dead_code)]
 #[derive(Debug, Serialize)]
 pub struct IterationOutcomeFinished<U> {
    /// The actual usage observed before we started the iteration.
@@ -366,7 +365,6 @@ pub struct IterationOutcomeFinished<U> {
 }

 #[derive(Debug, Serialize)]
-#[allow(dead_code)]
 struct AssumedUsage<U> {
    /// The expected value for `after`, after phase 2.
    projected_after: U,
@@ -374,14 +372,12 @@ struct AssumedUsage<U> {
    failed: LayerCount,
 }

-#[allow(dead_code)]
 #[derive(Debug, Serialize)]
 struct PlannedUsage<U> {
    respecting_tenant_min_resident_size: U,
    fallback_to_global_lru: Option<U>,
 }

-#[allow(dead_code)]
 #[derive(Debug, Default, Serialize)]
 struct LayerCount {
    file_sizes: u64,
@@ -565,7 +561,6 @@ pub(crate) struct EvictionSecondaryLayer {
 #[derive(Clone)]
 pub(crate) enum EvictionLayer {
    Attached(Layer),
-    #[allow(dead_code)]
    Secondary(EvictionSecondaryLayer),
 }

@@ -1105,7 +1100,6 @@ mod filesystem_level_usage {
    use super::DiskUsageEvictionTaskConfig;

    #[derive(Debug, Clone, Copy)]
-    #[allow(dead_code)]
    pub struct Usage<'a> {
        config: &'a DiskUsageEvictionTaskConfig,

--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -83,12 +83,12 @@ use utils::{
 // This is not functionally necessary (clients will retry), but avoids generating a lot of
 // failed API calls while tenants are activating.
 #[cfg(not(feature = "testing"))]
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);

 // Tests run on slow/oversubscribed nodes, and may need to wait much longer for tenants to
 // finish attaching, if calls to remote storage are slow.
 #[cfg(feature = "testing")]
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
+pub(crate) const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);

 pub struct State {
    conf: &'static PageServerConf,
@@ -571,10 +571,16 @@ async fn timeline_list_handler(
        parse_query_param(&request, "force-await-initial-logical-size")?;
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

+    let state = get_state(&request);
    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);

    let response_data = async {
-        let tenant = mgr::get_tenant(tenant_shard_id, true)?;
+        let tenant = state
+            .tenant_manager
+            .get_attached_tenant_shard(tenant_shard_id, false)?;
+
+        tenant.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await?;
+
        let timelines = tenant.list_timelines();

        let mut response_data = Vec::with_capacity(timelines.len());
@@ -1136,7 +1142,7 @@ async fn tenant_shard_split_handler(

    let new_shards = state
        .tenant_manager
-        .shard_split(tenant_shard_id, ShardCount(req.new_shard_count), &ctx)
+        .shard_split(tenant_shard_id, ShardCount::new(req.new_shard_count), &ctx)
        .await
        .map_err(ApiError::InternalServerError)?;

@@ -1951,6 +1957,7 @@ async fn put_io_engine_handler(
    mut r: Request<Body>,
    _cancel: CancellationToken,
 ) -> Result<Response<Body>, ApiError> {
+    check_permission(&r, None)?;
    let kind: crate::virtual_file::IoEngineKind = json_request(&mut r).await?;
    crate::virtual_file::io_engine::set(kind);
    json_response(StatusCode::OK, ())
@@ -2214,7 +2221,7 @@ pub fn make_router(
        )
        .get(
            "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
-            |r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
+            |r| api_handler(r, timeline_collect_keyspace),
        )
        .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler))
        .any(handler_404))
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -2496,6 +2496,56 @@ pub mod tokio_epoll_uring {
    }
 }

+pub(crate) mod tenant_throttling {
+    use metrics::{register_int_counter_vec, IntCounter};
+    use once_cell::sync::Lazy;
+
+    use crate::tenant::{self, throttle::Metric};
+
+    pub(crate) struct TimelineGet {
+        wait_time: IntCounter,
+        count: IntCounter,
+    }
+
+    pub(crate) static TIMELINE_GET: Lazy<TimelineGet> = Lazy::new(|| {
+        static WAIT_USECS: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+            register_int_counter_vec!(
+            "pageserver_tenant_throttling_wait_usecs_sum_global",
+            "Sum of microseconds that tenants spent waiting for a tenant throttle of a given kind.",
+            &["kind"]
+        )
+            .unwrap()
+        });
+
+        static WAIT_COUNT: Lazy<metrics::IntCounterVec> = Lazy::new(|| {
+            register_int_counter_vec!(
+                "pageserver_tenant_throttling_count_global",
+                "Count of tenant throttlings, by kind of throttle.",
+                &["kind"]
+            )
+            .unwrap()
+        });
+
+        let kind = "timeline_get";
+        TimelineGet {
+            wait_time: WAIT_USECS.with_label_values(&[kind]),
+            count: WAIT_COUNT.with_label_values(&[kind]),
+        }
+    });
+
+    impl Metric for &'static TimelineGet {
+        #[inline(always)]
+        fn observe_throttling(
+            &self,
+            tenant::throttle::Observation { wait_time }: &tenant::throttle::Observation,
+        ) {
+            let val = u64::try_from(wait_time.as_micros()).unwrap();
+            self.wait_time.inc_by(val);
+            self.count.inc();
+        }
+    }
+}
+
 pub fn preinitialize_metrics() {
    // Python tests need these and on some we do alerting.
    //
@@ -2557,4 +2607,5 @@ pub fn preinitialize_metrics() {

    // Custom
    Lazy::force(&RECONSTRUCT_TIME);
+    Lazy::force(&tenant_throttling::TIMELINE_GET);
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -26,7 +26,7 @@ use pageserver_api::models::{
    PagestreamNblocksResponse,
 };
 use pageserver_api::shard::ShardIndex;
-use pageserver_api::shard::{ShardCount, ShardNumber};
+use pageserver_api::shard::ShardNumber;
 use postgres_backend::{self, is_expected_io_error, AuthType, PostgresBackend, QueryError};
 use pq_proto::framed::ConnectionError;
 use pq_proto::FeStartupPacket;
@@ -998,7 +998,7 @@ impl PageServerHandler {
    ) -> Result<&Arc<Timeline>, Key> {
        let key = if let Some((first_idx, first_timeline)) = self.shard_timelines.iter().next() {
            // Fastest path: single sharded case
-            if first_idx.shard_count < ShardCount(2) {
+            if first_idx.shard_count.count() == 1 {
                return Ok(&first_timeline.timeline);
            }

--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -156,6 +156,7 @@ impl Timeline {
            pending_updates: HashMap::new(),
            pending_deletions: Vec::new(),
            pending_nblocks: 0,
+            pending_aux_files: None,
            pending_directory_entries: Vec::new(),
            lsn,
        }
@@ -870,6 +871,14 @@ pub struct DatadirModification<'a> {
    pending_updates: HashMap<Key, Vec<(Lsn, Value)>>,
    pending_deletions: Vec<(Range<Key>, Lsn)>,
    pending_nblocks: i64,
+
+    // If we already wrote any aux file changes in this modification, stash the latest dir.  If set,
+    // [`Self::put_file`] may assume that it is safe to emit a delta rather than checking
+    // if AUX_FILES_KEY is already set.
+    pending_aux_files: Option<AuxFilesDirectory>,
+
+    /// For special "directory" keys that store key-value maps, track the size of the map
+    /// if it was updated in this modification.
    pending_directory_entries: Vec<(DirectoryKind, usize)>,
 }

@@ -1384,31 +1393,76 @@ impl<'a> DatadirModification<'a> {
        content: &[u8],
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        let mut dir = match self.get(AUX_FILES_KEY, ctx).await {
-            Ok(buf) => AuxFilesDirectory::des(&buf)?,
-            Err(e) => {
-                // This is expected: historical databases do not have the key.
-                debug!("Failed to get info about AUX files: {}", e);
-                AuxFilesDirectory {
-                    files: HashMap::new(),
+        let file_path = path.to_string();
+        let content = if content.is_empty() {
+            None
+        } else {
+            Some(Bytes::copy_from_slice(content))
+        };
+
+        let dir = if let Some(mut dir) = self.pending_aux_files.take() {
+            // We already updated aux files in `self`: emit a delta and update our latest value
+
+            self.put(
+                AUX_FILES_KEY,
+                Value::WalRecord(NeonWalRecord::AuxFile {
+                    file_path: file_path.clone(),
+                    content: content.clone(),
+                }),
+            );
+
+            dir.upsert(file_path, content);
+            dir
+        } else {
+            // Check if the AUX_FILES_KEY is initialized
+            match self.get(AUX_FILES_KEY, ctx).await {
+                Ok(dir_bytes) => {
+                    let mut dir = AuxFilesDirectory::des(&dir_bytes)?;
+                    // Key is already set, we may append a delta
+                    self.put(
+                        AUX_FILES_KEY,
+                        Value::WalRecord(NeonWalRecord::AuxFile {
+                            file_path: file_path.clone(),
+                            content: content.clone(),
+                        }),
+                    );
+                    dir.upsert(file_path, content);
+                    dir
+                }
+                Err(
+                    e @ (PageReconstructError::AncestorStopping(_)
+                    | PageReconstructError::Cancelled
+                    | PageReconstructError::AncestorLsnTimeout(_)),
+                ) => {
+                    // Important that we do not interpret a shutdown error as "not found" and thereby
+                    // reset the map.
+                    return Err(e.into());
+                }
+                // FIXME: PageReconstructError doesn't have an explicit variant for key-not-found, so
+                // we are assuming that all _other_ possible errors represents a missing key.  If some
+                // other error occurs, we may incorrectly reset the map of aux files.
+                Err(PageReconstructError::Other(_) | PageReconstructError::WalRedo(_)) => {
+                    // Key is missing, we must insert an image as the basis for subsequent deltas.
+
+                    let mut dir = AuxFilesDirectory {
+                        files: HashMap::new(),
+                    };
+                    dir.upsert(file_path, content);
+                    self.put(
+                        AUX_FILES_KEY,
+                        Value::Image(Bytes::from(
+                            AuxFilesDirectory::ser(&dir).context("serialize")?,
+                        )),
+                    );
+                    dir
                }
            }
        };
-        let path = path.to_string();
-        if content.is_empty() {
-            dir.files.remove(&path);
-        } else {
-            dir.files.insert(path, Bytes::copy_from_slice(content));
-        }
+
        self.pending_directory_entries
            .push((DirectoryKind::AuxFiles, dir.files.len()));
+        self.pending_aux_files = Some(dir);

-        self.put(
-            AUX_FILES_KEY,
-            Value::Image(Bytes::from(
-                AuxFilesDirectory::ser(&dir).context("serialize")?,
-            )),
-        );
        Ok(())
    }

@@ -1618,8 +1672,18 @@ struct RelDirectory {
 }

 #[derive(Debug, Serialize, Deserialize, Default)]
-struct AuxFilesDirectory {
-    files: HashMap<String, Bytes>,
+pub(crate) struct AuxFilesDirectory {
+    pub(crate) files: HashMap<String, Bytes>,
+}
+
+impl AuxFilesDirectory {
+    pub(crate) fn upsert(&mut self, key: String, value: Option<Bytes>) {
+        if let Some(value) = value {
+            self.files.insert(key, value);
+        } else {
+            self.files.remove(&key);
+        }
+    }
 }

 #[derive(Debug, Serialize, Deserialize)]
@@ -1655,8 +1719,60 @@ static ZERO_PAGE: Bytes = Bytes::from_static(&[0u8; BLCKSZ as usize]);
 #[allow(clippy::bool_assert_comparison)]
 #[cfg(test)]
 mod tests {
-    //use super::repo_harness::*;
-    //use super::*;
+    use hex_literal::hex;
+    use utils::id::TimelineId;
+
+    use super::*;
+
+    use crate::{tenant::harness::TenantHarness, DEFAULT_PG_VERSION};
+
+    /// Test a round trip of aux file updates, from DatadirModification to reading back from the Timeline
+    #[tokio::test]
+    async fn aux_files_round_trip() -> anyhow::Result<()> {
+        let name = "aux_files_round_trip";
+        let harness = TenantHarness::create(name)?;
+
+        pub const TIMELINE_ID: TimelineId =
+            TimelineId::from_array(hex!("11223344556677881122334455667788"));
+
+        let (tenant, ctx) = harness.load().await;
+        let tline = tenant
+            .create_empty_timeline(TIMELINE_ID, Lsn(0), DEFAULT_PG_VERSION, &ctx)
+            .await?;
+        let tline = tline.raw_timeline().unwrap();
+
+        // First modification: insert two keys
+        let mut modification = tline.begin_modification(Lsn(0x1000));
+        modification.put_file("foo/bar1", b"content1", &ctx).await?;
+        modification.set_lsn(Lsn(0x1008))?;
+        modification.put_file("foo/bar2", b"content2", &ctx).await?;
+        modification.commit(&ctx).await?;
+        let expect_1008 = HashMap::from([
+            ("foo/bar1".to_string(), Bytes::from_static(b"content1")),
+            ("foo/bar2".to_string(), Bytes::from_static(b"content2")),
+        ]);
+
+        let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
+        assert_eq!(readback, expect_1008);
+
+        // Second modification: update one key, remove the other
+        let mut modification = tline.begin_modification(Lsn(0x2000));
+        modification.put_file("foo/bar1", b"content3", &ctx).await?;
+        modification.set_lsn(Lsn(0x2008))?;
+        modification.put_file("foo/bar2", b"", &ctx).await?;
+        modification.commit(&ctx).await?;
+        let expect_2008 =
+            HashMap::from([("foo/bar1".to_string(), Bytes::from_static(b"content3"))]);
+
+        let readback = tline.list_aux_files(Lsn(0x2008), &ctx).await?;
+        assert_eq!(readback, expect_2008);
+
+        // Reading back in time works
+        let readback = tline.list_aux_files(Lsn(0x1008), &ctx).await?;
+        assert_eq!(readback, expect_1008);
+
+        Ok(())
+    }

    /*
        fn assert_current_logical_size<R: Repository>(timeline: &DatadirTimeline<R>, lsn: Lsn) {
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -30,10 +30,6 @@
 //! only a single tenant or timeline.
 //!

-// Clippy 1.60 incorrectly complains about the tokio::task_local!() macro.
-// Silence it. See https://github.com/rust-lang/rust-clippy/issues/9224.
-#![allow(clippy::declare_interior_mutable_const)]
-
 use std::collections::HashMap;
 use std::fmt;
 use std::future::Future;
@@ -192,6 +188,7 @@ task_local! {
    serde::Serialize,
    serde::Deserialize,
    strum_macros::IntoStaticStr,
+    strum_macros::EnumString,
 )]
 pub enum TaskKind {
    // Pageserver startup, i.e., `main`
@@ -312,7 +309,6 @@ struct MutableTaskState {
 }

 struct PageServerTask {
-    #[allow(dead_code)] // unused currently
    task_id: PageserverTaskId,

    kind: TaskKind,
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -9,8 +9,8 @@
 //! may lead to a data loss.
 //!
 use anyhow::bail;
-use pageserver_api::models;
 use pageserver_api::models::EvictionPolicy;
+use pageserver_api::models::{self, ThrottleConfig};
 use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
 use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
@@ -251,7 +251,7 @@ impl LocationConf {
        } else {
            ShardIdentity::new(
                ShardNumber(conf.shard_number),
-                ShardCount(conf.shard_count),
+                ShardCount::new(conf.shard_count),
                ShardStripeSize(conf.shard_stripe_size),
            )?
        };
@@ -285,7 +285,7 @@ impl Default for LocationConf {
 ///
 /// For storing and transmitting individual tenant's configuration, see
 /// TenantConfOpt.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
 pub struct TenantConf {
    // Flush out an inmemory layer, if it's holding WAL older than this
    // This puts a backstop on how much WAL needs to be re-digested if the
@@ -348,11 +348,13 @@ pub struct TenantConf {

    /// If true then SLRU segments are dowloaded on demand, if false SLRU segments are included in basebackup
    pub lazy_slru_download: bool,
+
+    pub timeline_get_throttle: pageserver_api::models::ThrottleConfig,
 }

 /// Same as TenantConf, but this struct preserves the information about
 /// which parameters are set and which are not.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize, Default)]
+#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, Default)]
 pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
@@ -437,6 +439,9 @@ pub struct TenantConfOpt {
    #[serde(skip_serializing_if = "Option::is_none")]
    #[serde(default)]
    pub lazy_slru_download: Option<bool>,
+
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub timeline_get_throttle: Option<pageserver_api::models::ThrottleConfig>,
 }

 impl TenantConfOpt {
@@ -485,6 +490,10 @@ impl TenantConfOpt {
            lazy_slru_download: self
                .lazy_slru_download
                .unwrap_or(global_conf.lazy_slru_download),
+            timeline_get_throttle: self
+                .timeline_get_throttle
+                .clone()
+                .unwrap_or(global_conf.timeline_get_throttle),
        }
    }
 }
@@ -524,6 +533,7 @@ impl Default for TenantConf {
            gc_feedback: false,
            heatmap_period: Duration::ZERO,
            lazy_slru_download: false,
+            timeline_get_throttle: crate::tenant::throttle::Config::disabled(),
        }
    }
 }
@@ -596,6 +606,7 @@ impl From<TenantConfOpt> for models::TenantConfig {
            gc_feedback: value.gc_feedback,
            heatmap_period: value.heatmap_period.map(humantime),
            lazy_slru_download: value.lazy_slru_download,
+            timeline_get_throttle: value.timeline_get_throttle.map(ThrottleConfig::from),
        }
    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -3,7 +3,7 @@ use std::sync::Arc;
 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::{models::TenantState, shard::TenantShardId};
-use remote_storage::{GenericRemoteStorage, RemotePath};
+use remote_storage::{GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
 use tracing::{error, instrument, Instrument};
@@ -84,17 +84,17 @@ async fn create_remote_delete_mark(
            let data = bytes::Bytes::from_static(data);
            let stream = futures::stream::once(futures::future::ready(Ok(data)));
            remote_storage
-                .upload(stream, 0, &remote_mark_path, None)
+                .upload(stream, 0, &remote_mark_path, None, cancel)
                .await
        },
-        |_e| false,
+        TimeoutOrCancel::caused_by_cancel,
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_REMOTE_OP_RETRIES,
        "mark_upload",
        cancel,
    )
    .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
    .and_then(|x| x)
    .context("mark_upload")?;

@@ -184,15 +184,15 @@ async fn remove_tenant_remote_delete_mark(
    if let Some(remote_storage) = remote_storage {
        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
        backoff::retry(
-            || async { remote_storage.delete(&path).await },
-            |_e| false,
+            || async { remote_storage.delete(&path, cancel).await },
+            TimeoutOrCancel::caused_by_cancel,
            FAILED_UPLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "remove_tenant_remote_delete_mark",
            cancel,
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
        .and_then(|x| x)
        .context("remove_tenant_remote_delete_mark")?;
    }
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -36,7 +36,6 @@ use crate::{
 pub const VALUE_SZ: usize = 5;
 pub const MAX_VALUE: u64 = 0x007f_ffff_ffff;

-#[allow(dead_code)]
 pub const PAGE_SZ: usize = 8192;

 #[derive(Clone, Copy, Debug)]
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -6,6 +6,7 @@ use crate::context::RequestContext;
 use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::{self, VirtualFile};
+use bytes::BytesMut;
 use camino::Utf8PathBuf;
 use pageserver_api::shard::TenantShardId;
 use std::cmp::min;
@@ -26,7 +27,10 @@ pub struct EphemeralFile {
    /// An ephemeral file is append-only.
    /// We keep the last page, which can still be modified, in [`Self::mutable_tail`].
    /// The other pages, which can no longer be modified, are accessed through the page cache.
-    mutable_tail: [u8; PAGE_SZ],
+    ///
+    /// None <=> IO is ongoing.
+    /// Size is fixed to PAGE_SZ at creation time and must not be changed.
+    mutable_tail: Option<BytesMut>,
 }

 impl EphemeralFile {
@@ -60,7 +64,7 @@ impl EphemeralFile {
            _timeline_id: timeline_id,
            file,
            len: 0,
-            mutable_tail: [0u8; PAGE_SZ],
+            mutable_tail: Some(BytesMut::zeroed(PAGE_SZ)),
        })
    }

@@ -103,7 +107,13 @@ impl EphemeralFile {
            };
        } else {
            debug_assert_eq!(blknum as u64, self.len / PAGE_SZ as u64);
-            Ok(BlockLease::EphemeralFileMutableTail(&self.mutable_tail))
+            Ok(BlockLease::EphemeralFileMutableTail(
+                self.mutable_tail
+                    .as_deref()
+                    .expect("we're not doing IO, it must be Some()")
+                    .try_into()
+                    .expect("we ensure that it's always PAGE_SZ"),
+            ))
        }
    }

@@ -135,21 +145,27 @@ impl EphemeralFile {
            ) -> Result<(), io::Error> {
                let mut src_remaining = src;
                while !src_remaining.is_empty() {
-                    let dst_remaining = &mut self.ephemeral_file.mutable_tail[self.off..];
+                    let dst_remaining = &mut self
+                        .ephemeral_file
+                        .mutable_tail
+                        .as_deref_mut()
+                        .expect("IO is not yet ongoing")[self.off..];
                    let n = min(dst_remaining.len(), src_remaining.len());
                    dst_remaining[..n].copy_from_slice(&src_remaining[..n]);
                    self.off += n;
                    src_remaining = &src_remaining[n..];
                    if self.off == PAGE_SZ {
-                        match self
+                        let mutable_tail = std::mem::take(&mut self.ephemeral_file.mutable_tail)
+                            .expect("IO is not yet ongoing");
+                        let (mutable_tail, res) = self
                            .ephemeral_file
                            .file
-                            .write_all_at(
-                                &self.ephemeral_file.mutable_tail,
-                                self.blknum as u64 * PAGE_SZ as u64,
-                            )
-                            .await
-                        {
+                            .write_all_at(mutable_tail, self.blknum as u64 * PAGE_SZ as u64)
+                            .await;
+                        // TODO: If we panic before we can put the mutable_tail back, subsequent calls will fail.
+                        // I.e., the IO isn't retryable if we panic.
+                        self.ephemeral_file.mutable_tail = Some(mutable_tail);
+                        match res {
                            Ok(_) => {
                                // Pre-warm the page cache with what we just wrote.
                                // This isn't necessary for coherency/correctness, but it's how we've always done it.
@@ -169,7 +185,12 @@ impl EphemeralFile {
                                    Ok(page_cache::ReadBufResult::NotFound(mut write_guard)) => {
                                        let buf: &mut [u8] = write_guard.deref_mut();
                                        debug_assert_eq!(buf.len(), PAGE_SZ);
-                                        buf.copy_from_slice(&self.ephemeral_file.mutable_tail);
+                                        buf.copy_from_slice(
+                                            self.ephemeral_file
+                                                .mutable_tail
+                                                .as_deref()
+                                                .expect("IO is not ongoing"),
+                                        );
                                        let _ = write_guard.mark_valid();
                                        // pre-warm successful
                                    }
@@ -181,7 +202,11 @@ impl EphemeralFile {
                                // Zero the buffer for re-use.
                                // Zeroing is critical for correcntess because the write_blob code below
                                // and similarly read_blk expect zeroed pages.
-                                self.ephemeral_file.mutable_tail.fill(0);
+                                self.ephemeral_file
+                                    .mutable_tail
+                                    .as_deref_mut()
+                                    .expect("IO is not ongoing")
+                                    .fill(0);
                                // This block is done, move to next one.
                                self.blknum += 1;
                                self.off = 0;
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -294,17 +294,6 @@ pub enum LoadMetadataError {
    Decode(#[from] anyhow::Error),
 }

-pub fn load_metadata(
-    conf: &'static PageServerConf,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-) -> Result<TimelineMetadata, LoadMetadataError> {
-    let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
-    let metadata_bytes = std::fs::read(metadata_path)?;
-
-    Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,6 +2,7 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
+use futures::stream::StreamExt;
 use itertools::Itertools;
 use pageserver_api::key::Key;
 use pageserver_api::models::ShardParameters;
@@ -31,6 +32,7 @@ use crate::control_plane_client::{
    ControlPlaneClient, ControlPlaneGenerationsApi, RetryForeverError,
 };
 use crate::deletion_queue::DeletionQueueClient;
+use crate::http::routes::ACTIVE_TENANT_TIMEOUT;
 use crate::metrics::{TENANT, TENANT_MANAGER as METRICS};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::{
@@ -483,7 +485,7 @@ pub async fn init_tenant_mgr(
                            TenantSlot::Secondary(SecondaryTenant::new(
                                tenant_shard_id,
                                location_conf.shard,
-                                location_conf.tenant_conf,
+                                location_conf.tenant_conf.clone(),
                                &SecondaryLocationConfig { warm: false },
                            )),
                        );
@@ -793,7 +795,7 @@ pub(crate) async fn set_new_tenant_config(
    info!("configuring tenant {tenant_id}");
    let tenant = get_tenant(tenant_shard_id, true)?;

-    if tenant.tenant_shard_id().shard_count > ShardCount(0) {
+    if !tenant.tenant_shard_id().shard_count.is_unsharded() {
        // Note that we use ShardParameters::default below.
        return Err(SetNewTenantConfigError::Other(anyhow::anyhow!(
            "This API may only be used on single-sharded tenants, use the /location_config API for sharded tenants"
@@ -804,7 +806,7 @@ pub(crate) async fn set_new_tenant_config(
    // API to use is the location_config/ endpoint, which lets the caller provide
    // the full LocationConf.
    let location_conf = LocationConf::attached_single(
-        new_tenant_conf,
+        new_tenant_conf.clone(),
        tenant.generation,
        &ShardParameters::default(),
    );
@@ -1375,7 +1377,7 @@ impl TenantManager {
        result
    }

-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.0))]
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), new_shard_count=%new_shard_count.literal()))]
    pub(crate) async fn shard_split(
        &self,
        tenant_shard_id: TenantShardId,
@@ -1385,11 +1387,10 @@ impl TenantManager {
        let tenant = get_tenant(tenant_shard_id, true)?;

        // Plan: identify what the new child shards will be
-        let effective_old_shard_count = std::cmp::max(tenant_shard_id.shard_count.0, 1);
-        if new_shard_count <= ShardCount(effective_old_shard_count) {
+        if new_shard_count.count() <= tenant_shard_id.shard_count.count() {
            anyhow::bail!("Requested shard count is not an increase");
        }
-        let expansion_factor = new_shard_count.0 / effective_old_shard_count;
+        let expansion_factor = new_shard_count.count() / tenant_shard_id.shard_count.count();
        if !expansion_factor.is_power_of_two() {
            anyhow::bail!("Requested split is not a power of two");
        }
@@ -1439,8 +1440,10 @@ impl TenantManager {
            }
        };

-        // TODO: hardlink layers from the parent into the child shard directories so that they don't immediately re-download
-        // TODO: erase the dentries from the parent
+        // Optimization: hardlink layers from the parent into the children, so that they don't have to
+        // re-download & duplicate the data referenced in their initial IndexPart
+        self.shard_split_hardlink(parent, child_shards.clone())
+            .await?;

        // Take a snapshot of where the parent's WAL ingest had got to: we will wait for
        // child shards to reach this point.
@@ -1464,7 +1467,7 @@ impl TenantManager {
                    attach_mode: AttachmentMode::Single,
                }),
                shard: child_shard_identity,
-                tenant_conf: parent_tenant_conf,
+                tenant_conf: parent_tenant_conf.clone(),
            };

            self.upsert_location(
@@ -1479,13 +1482,24 @@ impl TenantManager {

        // Phase 4: wait for child chards WAL ingest to catch up to target LSN
        for child_shard_id in &child_shards {
+            let child_shard_id = *child_shard_id;
            let child_shard = {
                let locked = TENANTS.read().unwrap();
                let peek_slot =
-                    tenant_map_peek_slot(&locked, child_shard_id, TenantSlotPeekMode::Read)?;
+                    tenant_map_peek_slot(&locked, &child_shard_id, TenantSlotPeekMode::Read)?;
                peek_slot.and_then(|s| s.get_attached()).cloned()
            };
            if let Some(t) = child_shard {
+                // Wait for the child shard to become active: this should be very quick because it only
+                // has to download the index_part that we just uploaded when creating it.
+                if let Err(e) = t.wait_to_become_active(ACTIVE_TENANT_TIMEOUT).await {
+                    // This is not fatal: we have durably created the child shard.  It just makes the
+                    // split operation less seamless for clients, as we will may detach the parent
+                    // shard before the child shards are fully ready to serve requests.
+                    tracing::warn!("Failed to wait for shard {child_shard_id} to activate: {e}");
+                    continue;
+                }
+
                let timelines = t.timelines.lock().unwrap().clone();
                for timeline in timelines.values() {
                    let Some(target_lsn) = target_lsns.get(&timeline.timeline_id) else {
@@ -1517,7 +1531,7 @@ impl TenantManager {
            }
        }

-        // Phase 5: Shut down the parent shard.
+        // Phase 5: Shut down the parent shard, and erase it from disk
        let (_guard, progress) = completion::channel();
        match parent.shutdown(progress, false).await {
            Ok(()) => {}
@@ -1525,6 +1539,24 @@ impl TenantManager {
                other.wait().await;
            }
        }
+        let local_tenant_directory = self.conf.tenant_path(&tenant_shard_id);
+        let tmp_path = safe_rename_tenant_dir(&local_tenant_directory)
+            .await
+            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))?;
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::MgmtRequest,
+            None,
+            None,
+            "tenant_files_delete",
+            false,
+            async move {
+                fs::remove_dir_all(tmp_path.as_path())
+                    .await
+                    .with_context(|| format!("tenant directory {:?} deletion", tmp_path))
+            },
+        );
+
        parent_slot_guard.drop_old_value()?;

        // Phase 6: Release the InProgress on the parent shard
@@ -1532,6 +1564,130 @@ impl TenantManager {

        Ok(child_shards)
    }
+
+    /// Part of [`Self::shard_split`]: hard link parent shard layers into child shards, as an optimization
+    /// to avoid the children downloading them again.
+    ///
+    /// For each resident layer in the parent shard, we will hard link it into all of the child shards.
+    async fn shard_split_hardlink(
+        &self,
+        parent_shard: &Tenant,
+        child_shards: Vec<TenantShardId>,
+    ) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_id();
+
+        let parent_path = self.conf.tenant_path(parent_shard.get_tenant_shard_id());
+        let (parent_timelines, parent_layers) = {
+            let mut parent_layers = Vec::new();
+            let timelines = parent_shard.timelines.lock().unwrap().clone();
+            let parent_timelines = timelines.keys().cloned().collect::<Vec<_>>();
+            for timeline in timelines.values() {
+                let timeline_layers = timeline
+                    .layers
+                    .read()
+                    .await
+                    .resident_layers()
+                    .collect::<Vec<_>>()
+                    .await;
+                for layer in timeline_layers {
+                    let relative_path = layer
+                        .local_path()
+                        .strip_prefix(&parent_path)
+                        .context("Removing prefix from parent layer path")?;
+                    parent_layers.push(relative_path.to_owned());
+                }
+            }
+            debug_assert!(
+                !parent_layers.is_empty(),
+                "shutdown cannot empty the layermap"
+            );
+            (parent_timelines, parent_layers)
+        };
+
+        let mut child_prefixes = Vec::new();
+        let mut create_dirs = Vec::new();
+
+        for child in child_shards {
+            let child_prefix = self.conf.tenant_path(&child);
+            create_dirs.push(child_prefix.clone());
+            create_dirs.extend(
+                parent_timelines
+                    .iter()
+                    .map(|t| self.conf.timeline_path(&child, t)),
+            );
+
+            child_prefixes.push(child_prefix);
+        }
+
+        // Since we will do a large number of small filesystem metadata operations, batch them into
+        // spawn_blocking calls rather than doing each one as a tokio::fs round-trip.
+        let jh = tokio::task::spawn_blocking(move || -> anyhow::Result<usize> {
+            for dir in &create_dirs {
+                if let Err(e) = std::fs::create_dir_all(dir) {
+                    // Ignore AlreadyExists errors, drop out on all other errors
+                    match e.kind() {
+                        std::io::ErrorKind::AlreadyExists => {}
+                        _ => {
+                            return Err(anyhow::anyhow!(e).context(format!("Creating {dir}")));
+                        }
+                    }
+                }
+            }
+
+            for child_prefix in child_prefixes {
+                for relative_layer in &parent_layers {
+                    let parent_path = parent_path.join(relative_layer);
+                    let child_path = child_prefix.join(relative_layer);
+                    if let Err(e) = std::fs::hard_link(&parent_path, &child_path) {
+                        match e.kind() {
+                            std::io::ErrorKind::AlreadyExists => {}
+                            std::io::ErrorKind::NotFound => {
+                                tracing::info!(
+                                    "Layer {} not found during hard-linking, evicted during split?",
+                                    relative_layer
+                                );
+                            }
+                            _ => {
+                                return Err(anyhow::anyhow!(e).context(format!(
+                                    "Hard linking {relative_layer} into {child_prefix}"
+                                )))
+                            }
+                        }
+                    }
+                }
+            }
+
+            // Durability is not required for correctness, but if we crashed during split and
+            // then came restarted with empty timeline dirs, it would be very inefficient to
+            // re-populate from remote storage.
+            for dir in create_dirs {
+                if let Err(e) = crashsafe::fsync(&dir) {
+                    // Something removed a newly created timeline dir out from underneath us?  Extremely
+                    // unexpected, but not worth panic'ing over as this whole function is just an
+                    // optimization.
+                    tracing::warn!("Failed to fsync directory {dir}: {e}")
+                }
+            }
+
+            Ok(parent_layers.len())
+        });
+
+        match jh.await {
+            Ok(Ok(layer_count)) => {
+                tracing::info!(count = layer_count, "Hard linked layers into child shards");
+            }
+            Ok(Err(e)) => {
+                // This is an optimization, so we tolerate failure.
+                tracing::warn!("Error hard-linking layers, proceeding anyway: {e}")
+            }
+            Err(e) => {
+                // This is something totally unexpected like a panic, so bail out.
+                anyhow::bail!("Error joining hard linking task: {e}");
+            }
+        }
+
+        Ok(())
+    }
 }

 #[derive(Debug, thiserror::Error)]
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -196,14 +196,12 @@ pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
    self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
-use utils::timeout::{timeout_cancellable, TimeoutCancellableError};

 use std::collections::{HashMap, VecDeque};
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
-use std::time::Duration;

-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath, TimeoutOrCancel};
 use std::ops::DerefMut;
 use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
@@ -263,11 +261,6 @@ pub(crate) const INITDB_PRESERVED_PATH: &str = "initdb-preserved.tar.zst";
 /// Default buffer size when interfacing with [`tokio::fs::File`].
 pub(crate) const BUFFER_SIZE: usize = 32 * 1024;

-/// This timeout is intended to deal with hangs in lower layers, e.g. stuck TCP flows.  It is not
-/// intended to be snappy enough for prompt shutdown, as we have a CancellationToken for that.
-pub(crate) const UPLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-pub(crate) const DOWNLOAD_TIMEOUT: Duration = Duration::from_secs(120);
-
 pub enum MaybeDeletedIndexPart {
    IndexPart(IndexPart),
    Deleted(IndexPart),
@@ -331,40 +324,6 @@ pub struct RemoteTimelineClient {
    cancel: CancellationToken,
 }

-/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to anyhow.
-///
-/// This is a convenience for the various upload functions.  In future
-/// the anyhow::Error result should be replaced with a more structured type that
-/// enables callers to avoid handling shutdown as an error.
-async fn upload_cancellable<F>(cancel: &CancellationToken, future: F) -> anyhow::Result<()>
-where
-    F: std::future::Future<Output = anyhow::Result<()>>,
-{
-    match timeout_cancellable(UPLOAD_TIMEOUT, cancel, future).await {
-        Ok(Ok(())) => Ok(()),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => Err(anyhow::anyhow!("Timeout")),
-        Err(TimeoutCancellableError::Cancelled) => Err(anyhow::anyhow!("Shutting down")),
-    }
-}
-/// Wrapper for timeout_cancellable that flattens result and converts TimeoutCancellableError to DownloaDError.
-async fn download_cancellable<F, R>(
-    cancel: &CancellationToken,
-    future: F,
-) -> Result<R, DownloadError>
-where
-    F: std::future::Future<Output = Result<R, DownloadError>>,
-{
-    match timeout_cancellable(DOWNLOAD_TIMEOUT, cancel, future).await {
-        Ok(Ok(r)) => Ok(r),
-        Ok(Err(e)) => Err(e),
-        Err(TimeoutCancellableError::Timeout) => {
-            Err(DownloadError::Other(anyhow::anyhow!("Timed out")))
-        }
-        Err(TimeoutCancellableError::Cancelled) => Err(DownloadError::Cancelled),
-    }
-}
-
 impl RemoteTimelineClient {
    ///
    /// Create a remote storage client for given timeline
@@ -1050,7 +1009,7 @@ impl RemoteTimelineClient {
            &self.cancel,
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
        .and_then(|x| x)?;

        // all good, disarm the guard and mark as success
@@ -1082,14 +1041,14 @@ impl RemoteTimelineClient {
                upload::preserve_initdb_archive(&self.storage_impl, tenant_id, timeline_id, cancel)
                    .await
            },
-            |_e| false,
+            TimeoutOrCancel::caused_by_cancel,
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "preserve_initdb_tar_zst",
            &cancel.clone(),
        )
        .await
-        .ok_or_else(|| anyhow::anyhow!("Cancellled"))
+        .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
        .and_then(|x| x)
        .context("backing up initdb archive")?;
        Ok(())
@@ -1151,7 +1110,7 @@ impl RemoteTimelineClient {
        let remaining = download_retry(
            || async {
                self.storage_impl
-                    .list_files(Some(&timeline_storage_path), None)
+                    .list_files(Some(&timeline_storage_path), None, &cancel)
                    .await
            },
            "list remaining files",
@@ -1445,6 +1404,10 @@ impl RemoteTimelineClient {
                Ok(()) => {
                    break;
                }
+                Err(e) if TimeoutOrCancel::caused_by_cancel(&e) => {
+                    // loop around to do the proper stopping
+                    continue;
+                }
                Err(e) => {
                    let retries = task.retries.fetch_add(1, Ordering::SeqCst);

--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -11,16 +11,14 @@ use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::shard::TenantShardId;
 use tokio::fs::{self, File, OpenOptions};
 use tokio::io::{AsyncSeekExt, AsyncWriteExt};
+use tokio_util::io::StreamReader;
 use tokio_util::sync::CancellationToken;
 use tracing::warn;
-use utils::timeout::timeout_cancellable;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
 use crate::span::debug_assert_current_span_has_tenant_and_timeline_id;
-use crate::tenant::remote_timeline_client::{
-    download_cancellable, remote_layer_path, remote_timelines_path, DOWNLOAD_TIMEOUT,
-};
+use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_path};
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::Generation;
 use crate::virtual_file::on_fatal_io_error;
@@ -83,15 +81,13 @@ pub async fn download_layer_file<'a>(
                .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
                .map_err(DownloadError::Other)?;

-            // Cancellation safety: it is safe to cancel this future, because it isn't writing to a local
-            // file: the write to local file doesn't start until after the request header is returned
-            // and we start draining the body stream below
-            let download = download_cancellable(cancel, storage.download(&remote_path))
+            let download = storage
+                .download(&remote_path, cancel)
                .await
                .with_context(|| {
                    format!(
-                    "open a download stream for layer with remote storage path '{remote_path:?}'"
-                )
+                        "open a download stream for layer with remote storage path '{remote_path:?}'"
+                    )
                })
                .map_err(DownloadError::Other)?;

@@ -100,43 +96,26 @@ pub async fn download_layer_file<'a>(

            let mut reader = tokio_util::io::StreamReader::new(download.download_stream);

-            // Cancellation safety: it is safe to cancel this future because it is writing into a temporary file,
-            // and we will unlink the temporary file if there is an error.  This unlink is important because we
-            // are in a retry loop, and we wouldn't want to leave behind a rogue write I/O to a file that
-            // we will imminiently try and write to again.
-            let bytes_amount: u64 = match timeout_cancellable(
-                DOWNLOAD_TIMEOUT,
-                cancel,
-                tokio::io::copy_buf(&mut reader, &mut destination_file),
-            )
-            .await
-            .with_context(|| {
-                format!(
+            let bytes_amount = tokio::io::copy_buf(&mut reader, &mut destination_file)
+                .await
+                .with_context(|| format!(
                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                )
-            })
-            .map_err(DownloadError::Other)?
-            {
-                Ok(b) => Ok(b),
+                ))
+                .map_err(DownloadError::Other);
+
+            match bytes_amount {
+                Ok(bytes_amount) => {
+                    let destination_file = destination_file.into_inner();
+                    Ok((destination_file, bytes_amount))
+                }
                Err(e) => {
-                    // Remove incomplete files: on restart Timeline would do this anyway, but we must
-                    // do it here for the retry case.
                    if let Err(e) = tokio::fs::remove_file(&temp_file_path).await {
                        on_fatal_io_error(&e, &format!("Removing temporary file {temp_file_path}"));
                    }
+
                    Err(e)
                }
            }
-            .with_context(|| {
-                format!(
-                    "download layer at remote path '{remote_path:?}' into file {temp_file_path:?}"
-                )
-            })
-            .map_err(DownloadError::Other)?;
-
-            let destination_file = destination_file.into_inner();
-
-            Ok((destination_file, bytes_amount))
        },
        &format!("download {remote_path:?}"),
        cancel,
@@ -218,9 +197,11 @@ pub async fn list_remote_timelines(

    let listing = download_retry_forever(
        || {
-            download_cancellable(
+            storage.list(
+                Some(&remote_path),
+                ListingMode::WithDelimiter,
+                None,
                &cancel,
-                storage.list(Some(&remote_path), ListingMode::WithDelimiter, None),
            )
        },
        &format!("list timelines for {tenant_shard_id}"),
@@ -259,26 +240,23 @@ async fn do_download_index_part(
    index_generation: Generation,
    cancel: &CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
-    use futures::stream::StreamExt;
-
    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

    let index_part_bytes = download_retry_forever(
        || async {
-            // Cancellation: if is safe to cancel this future because we're just downloading into
-            // a memory buffer, not touching local disk.
-            let index_part_download =
-                download_cancellable(cancel, storage.download(&remote_path)).await?;
+            let download = storage.download(&remote_path, cancel).await?;

-            let mut index_part_bytes = Vec::new();
-            let mut stream = std::pin::pin!(index_part_download.download_stream);
-            while let Some(chunk) = stream.next().await {
-                let chunk = chunk
-                    .with_context(|| format!("download index part at {remote_path:?}"))
-                    .map_err(DownloadError::Other)?;
-                index_part_bytes.extend_from_slice(&chunk[..]);
-            }
-            Ok(index_part_bytes)
+            let mut bytes = Vec::new();
+
+            let stream = download.download_stream;
+            let mut stream = StreamReader::new(stream);
+
+            tokio::io::copy_buf(&mut stream, &mut bytes)
+                .await
+                .with_context(|| format!("download index part at {remote_path:?}"))
+                .map_err(DownloadError::Other)?;
+
+            Ok(bytes)
        },
        &format!("download {remote_path:?}"),
        cancel,
@@ -373,7 +351,7 @@ pub(super) async fn download_index_part(
    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());

    let indices = download_retry(
-        || async { storage.list_files(Some(&index_prefix), None).await },
+        || async { storage.list_files(Some(&index_prefix), None, cancel).await },
        "list index_part files",
        cancel,
    )
@@ -446,11 +424,10 @@ pub(crate) async fn download_initdb_tar_zst(
                .with_context(|| format!("tempfile creation {temp_path}"))
                .map_err(DownloadError::Other)?;

-            let download = match download_cancellable(cancel, storage.download(&remote_path)).await
-            {
+            let download = match storage.download(&remote_path, cancel).await {
                Ok(dl) => dl,
                Err(DownloadError::NotFound) => {
-                    download_cancellable(cancel, storage.download(&remote_preserved_path)).await?
+                    storage.download(&remote_preserved_path, cancel).await?
                }
                Err(other) => Err(other)?,
            };
@@ -460,6 +437,7 @@ pub(crate) async fn download_initdb_tar_zst(
            // TODO: this consumption of the response body should be subject to timeout + cancellation, but
            // not without thinking carefully about how to recover safely from cancelling a write to
            // local storage (e.g. by writing into a temp file as we do in download_layer)
+            // FIXME: flip the weird error wrapping
            tokio::io::copy_buf(&mut download, &mut writer)
                .await
                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -16,7 +16,7 @@ use crate::{
    config::PageServerConf,
    tenant::remote_timeline_client::{
        index::IndexPart, remote_index_path, remote_initdb_archive_path,
-        remote_initdb_preserved_archive_path, remote_path, upload_cancellable,
+        remote_initdb_preserved_archive_path, remote_path,
    },
 };
 use remote_storage::{GenericRemoteStorage, TimeTravelError};
@@ -49,16 +49,15 @@ pub(crate) async fn upload_index_part<'a>(
    let index_part_bytes = bytes::Bytes::from(index_part_bytes);

    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
-    upload_cancellable(
-        cancel,
-        storage.upload_storage_object(
+    storage
+        .upload_storage_object(
            futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
            index_part_size,
            &remote_path,
-        ),
-    )
-    .await
-    .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
+            cancel,
+        )
+        .await
+        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }

 /// Attempts to upload given layer files.
@@ -115,11 +114,10 @@ pub(super) async fn upload_timeline_layer<'a>(

    let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);

-    upload_cancellable(cancel, storage.upload(reader, fs_size, &storage_path, None))
+    storage
+        .upload(reader, fs_size, &storage_path, None, cancel)
        .await
-        .with_context(|| format!("upload layer from local path '{source_path}'"))?;
-
-    Ok(())
+        .with_context(|| format!("upload layer from local path '{source_path}'"))
 }

 /// Uploads the given `initdb` data to the remote storage.
@@ -139,12 +137,10 @@ pub(crate) async fn upload_initdb_dir(
    let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);

    let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
-    upload_cancellable(
-        cancel,
-        storage.upload_storage_object(file, size as usize, &remote_path),
-    )
-    .await
-    .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
+    storage
+        .upload_storage_object(file, size as usize, &remote_path, cancel)
+        .await
+        .with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
 }

 pub(crate) async fn preserve_initdb_archive(
@@ -155,7 +151,8 @@ pub(crate) async fn preserve_initdb_archive(
 ) -> anyhow::Result<()> {
    let source_path = remote_initdb_archive_path(tenant_id, timeline_id);
    let dest_path = remote_initdb_preserved_archive_path(tenant_id, timeline_id);
-    upload_cancellable(cancel, storage.copy_object(&source_path, &dest_path))
+    storage
+        .copy_object(&source_path, &dest_path, cancel)
        .await
        .with_context(|| format!("backing up initdb archive for '{tenant_id} / {timeline_id}'"))
 }
--- a/pageserver/src/tenant/secondary.rs
+++ b/pageserver/src/tenant/secondary.rs
@@ -133,7 +133,7 @@ impl SecondaryTenant {
    }

    pub(crate) fn set_tenant_conf(&self, config: &TenantConfOpt) {
-        *(self.tenant_conf.lock().unwrap()) = *config;
+        *(self.tenant_conf.lock().unwrap()) = config.clone();
    }

    /// For API access: generate a LocationConfig equivalent to the one that would be used to
@@ -144,13 +144,13 @@ impl SecondaryTenant {

        let conf = models::LocationConfigSecondary { warm: conf.warm };

-        let tenant_conf = *self.tenant_conf.lock().unwrap();
+        let tenant_conf = self.tenant_conf.lock().unwrap().clone();
        models::LocationConfig {
            mode: models::LocationConfigMode::Secondary,
            generation: None,
            secondary_conf: Some(conf),
            shard_number: self.tenant_shard_id.shard_number.0,
-            shard_count: self.tenant_shard_id.shard_count.0,
+            shard_count: self.tenant_shard_id.shard_count.literal(),
            shard_stripe_size: self.shard_identity.stripe_size.0,
            tenant_conf: tenant_conf.into(),
        }
--- a/pageserver/src/tenant/secondary/downloader.rs
+++ b/pageserver/src/tenant/secondary/downloader.rs
@@ -523,12 +523,13 @@ impl<'a> TenantDownloader<'a> {
        tracing::debug!("Downloading heatmap for secondary tenant",);

        let heatmap_path = remote_heatmap_path(tenant_shard_id);
+        let cancel = &self.secondary_state.cancel;

        let heatmap_bytes = backoff::retry(
            || async {
                let download = self
                    .remote_storage
-                    .download(&heatmap_path)
+                    .download(&heatmap_path, cancel)
                    .await
                    .map_err(UpdateError::from)?;
                let mut heatmap_bytes = Vec::new();
@@ -540,7 +541,7 @@ impl<'a> TenantDownloader<'a> {
            FAILED_DOWNLOAD_WARN_THRESHOLD,
            FAILED_REMOTE_OP_RETRIES,
            "download heatmap",
-            &self.secondary_state.cancel,
+            cancel,
        )
        .await
        .ok_or_else(|| UpdateError::Cancelled)
--- a/pageserver/src/tenant/secondary/heatmap_uploader.rs
+++ b/pageserver/src/tenant/secondary/heatmap_uploader.rs
@@ -21,18 +21,17 @@ use futures::Future;
 use md5;
 use pageserver_api::shard::TenantShardId;
 use rand::Rng;
-use remote_storage::GenericRemoteStorage;
+use remote_storage::{GenericRemoteStorage, TimeoutOrCancel};

 use super::{
+    heatmap::HeatMapTenant,
    scheduler::{self, JobGenerator, RunningJob, SchedulingResult, TenantBackgroundJobs},
-    CommandRequest,
+    CommandRequest, UploadCommand,
 };
 use tokio_util::sync::CancellationToken;
 use tracing::{info_span, instrument, Instrument};
 use utils::{backoff, completion::Barrier, yielding_loop::yielding_loop};

-use super::{heatmap::HeatMapTenant, UploadCommand};
-
 pub(super) async fn heatmap_uploader_task(
    tenant_manager: Arc<TenantManager>,
    remote_storage: GenericRemoteStorage,
@@ -417,10 +416,10 @@ async fn upload_tenant_heatmap(
        || async {
            let bytes = futures::stream::once(futures::future::ready(Ok(bytes.clone())));
            remote_storage
-                .upload_storage_object(bytes, size, &path)
+                .upload_storage_object(bytes, size, &path, cancel)
                .await
        },
-        |_| false,
+        TimeoutOrCancel::caused_by_cancel,
        3,
        u32::MAX,
        "Uploading heatmap",
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -9,6 +9,7 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
 use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
+use crate::tenant::throttle::Stats;
 use crate::tenant::timeline::CompactionError;
 use crate::tenant::{Tenant, TenantState};
 use tokio_util::sync::CancellationToken;
@@ -139,6 +140,8 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
    // How many errors we have seen consequtively
    let mut error_run_count = 0;

+    let mut last_throttle_flag_reset_at = Instant::now();
+
    TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
    async {
        let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
@@ -203,6 +206,27 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
                walredo_mgr.maybe_quiesce(period * 10);
            }

+            // TODO: move this (and walredo quiesce) to a separate task that isn't affected by the back-off,
+            // so we get some upper bound guarantee on when walredo quiesce / this throttling reporting here happens.
+            info_span!(parent: None, "timeline_get_throttle", tenant_id=%tenant.tenant_shard_id, shard_id=%tenant.tenant_shard_id.shard_slug()).in_scope(|| {
+                let now = Instant::now();
+                let prev = std::mem::replace(&mut last_throttle_flag_reset_at, now);
+                let Stats { count_accounted, count_throttled, sum_throttled_usecs } = tenant.timeline_get_throttle.reset_stats();
+                if count_throttled == 0 {
+                    return;
+                }
+                let allowed_rps = tenant.timeline_get_throttle.steady_rps();
+                let delta = now - prev;
+                warn!(
+                    n_seconds=%format_args!("{:.3}",
+                    delta.as_secs_f64()),
+                    count_accounted,
+                    count_throttled,
+                    sum_throttled_usecs,
+                    allowed_rps=%format_args!("{allowed_rps:.0}"),
+                    "shard was throttled in the last n_seconds")
+            });
+
            // Sleep
            if tokio::time::timeout(sleep_duration, cancel.cancelled())
                .await
--- a/pageserver/src/tenant/throttle.rs
+++ b/pageserver/src/tenant/throttle.rs
@@ -0,0 +1,162 @@
+use std::{
+    str::FromStr,
+    sync::{
+        atomic::{AtomicU64, Ordering},
+        Arc,
+    },
+    time::{Duration, Instant},
+};
+
+use arc_swap::ArcSwap;
+use enumset::EnumSet;
+use tracing::error;
+
+use crate::{context::RequestContext, task_mgr::TaskKind};
+
+/// Throttle for `async` functions.
+///
+/// Runtime reconfigurable.
+///
+/// To share a throttle among multiple entities, wrap it in an [`Arc`].
+///
+/// The intial use case for this is tenant-wide throttling of getpage@lsn requests.
+pub struct Throttle<M: Metric> {
+    inner: ArcSwap<Inner>,
+    metric: M,
+    /// will be turned into [`Stats::count_accounted`]
+    count_accounted: AtomicU64,
+    /// will be turned into [`Stats::count_throttled`]
+    count_throttled: AtomicU64,
+    /// will be turned into [`Stats::sum_throttled_usecs`]
+    sum_throttled_usecs: AtomicU64,
+}
+
+pub struct Inner {
+    task_kinds: EnumSet<TaskKind>,
+    rate_limiter: Arc<leaky_bucket::RateLimiter>,
+    config: Config,
+}
+
+pub type Config = pageserver_api::models::ThrottleConfig;
+
+pub struct Observation {
+    pub wait_time: Duration,
+}
+pub trait Metric {
+    fn observe_throttling(&self, observation: &Observation);
+}
+
+/// See [`Throttle::reset_stats`].
+pub struct Stats {
+    // Number of requests that were subject to throttling, i.e., requests of the configured [`Config::task_kinds`].
+    pub count_accounted: u64,
+    // Subset of the `accounted` requests that were actually throttled.
+    // Note that the numbers are stored as two independent atomics, so, there might be a slight drift.
+    pub count_throttled: u64,
+    // Sum of microseconds that throttled requests spent waiting for throttling.
+    pub sum_throttled_usecs: u64,
+}
+
+impl<M> Throttle<M>
+where
+    M: Metric,
+{
+    pub fn new(config: Config, metric: M) -> Self {
+        Self {
+            inner: ArcSwap::new(Arc::new(Self::new_inner(config))),
+            metric,
+            count_accounted: AtomicU64::new(0),
+            count_throttled: AtomicU64::new(0),
+            sum_throttled_usecs: AtomicU64::new(0),
+        }
+    }
+    fn new_inner(config: Config) -> Inner {
+        let Config {
+            task_kinds,
+            initial,
+            refill_interval,
+            refill_amount,
+            max,
+            fair,
+        } = &config;
+        let task_kinds: EnumSet<TaskKind> = task_kinds
+            .iter()
+            .filter_map(|s| match TaskKind::from_str(s) {
+                Ok(v) => Some(v),
+                Err(e) => {
+                    // TODO: avoid this failure mode
+                    error!(
+                        "cannot parse task kind, ignoring for rate limiting {}",
+                        utils::error::report_compact_sources(&e)
+                    );
+                    None
+                }
+            })
+            .collect();
+        Inner {
+            task_kinds,
+            rate_limiter: Arc::new(
+                leaky_bucket::RateLimiter::builder()
+                    .initial(*initial)
+                    .interval(*refill_interval)
+                    .refill(refill_amount.get())
+                    .max(*max)
+                    .fair(*fair)
+                    .build(),
+            ),
+            config,
+        }
+    }
+    pub fn reconfigure(&self, config: Config) {
+        self.inner.store(Arc::new(Self::new_inner(config)));
+    }
+
+    /// The [`Throttle`] keeps an internal flag that is true if there was ever any actual throttling.
+    /// This method allows retrieving & resetting that flag.
+    /// Useful for periodic reporting.
+    pub fn reset_stats(&self) -> Stats {
+        let count_accounted = self.count_accounted.swap(0, Ordering::Relaxed);
+        let count_throttled = self.count_throttled.swap(0, Ordering::Relaxed);
+        let sum_throttled_usecs = self.sum_throttled_usecs.swap(0, Ordering::Relaxed);
+        Stats {
+            count_accounted,
+            count_throttled,
+            sum_throttled_usecs,
+        }
+    }
+
+    /// See [`Config::steady_rps`].
+    pub fn steady_rps(&self) -> f64 {
+        self.inner.load().config.steady_rps()
+    }
+
+    pub async fn throttle(&self, ctx: &RequestContext, key_count: usize) {
+        let inner = self.inner.load_full(); // clones the `Inner` Arc
+        if !inner.task_kinds.contains(ctx.task_kind()) {
+            return;
+        };
+        let start = std::time::Instant::now();
+        let mut did_throttle = false;
+        let acquire = inner.rate_limiter.acquire(key_count);
+        // turn off runtime-induced preemption (aka coop) so our `did_throttle` is accurate
+        let acquire = tokio::task::unconstrained(acquire);
+        let mut acquire = std::pin::pin!(acquire);
+        std::future::poll_fn(|cx| {
+            use std::future::Future;
+            let poll = acquire.as_mut().poll(cx);
+            did_throttle = did_throttle || poll.is_pending();
+            poll
+        })
+        .await;
+        self.count_accounted.fetch_add(1, Ordering::Relaxed);
+        if did_throttle {
+            self.count_throttled.fetch_add(1, Ordering::Relaxed);
+            let now = Instant::now();
+            let wait_time = now - start;
+            self.sum_throttled_usecs
+                .fetch_add(wait_time.as_micros() as u64, Ordering::Relaxed);
+            let observation = Observation { wait_time };
+            self.metric.observe_throttling(&observation);
+        }
+    }
+}
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -164,6 +164,9 @@ fn drop_wlock<T>(rlock: tokio::sync::RwLockWriteGuard<'_, T>) {
 pub struct TimelineResources {
    pub remote_client: Option<RemoteTimelineClient>,
    pub deletion_queue_client: DeletionQueueClient,
+    pub timeline_get_throttle: Arc<
+        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
+    >,
 }

 pub struct Timeline {
@@ -355,6 +358,11 @@ pub struct Timeline {
    ///
    /// Timeline deletion will acquire both compaction and gc locks in whatever order.
    gc_lock: tokio::sync::Mutex<()>,
+
+    /// Cloned from [`super::Tenant::timeline_get_throttle`] on construction.
+    timeline_get_throttle: Arc<
+        crate::tenant::throttle::Throttle<&'static crate::metrics::tenant_throttling::TimelineGet>,
+    >,
 }

 pub struct WalReceiverInfo {
@@ -615,6 +623,8 @@ impl Timeline {
            return Err(PageReconstructError::Other(anyhow::anyhow!("Invalid LSN")));
        }

+        self.timeline_get_throttle.throttle(ctx, 1).await;
+
        // This check is debug-only because of the cost of hashing, and because it's a double-check: we
        // already checked the key against the shard_identity when looking up the Timeline from
        // page_service.
@@ -714,6 +724,10 @@ impl Timeline {
            return Err(GetVectoredError::Oversized(key_count));
        }

+        self.timeline_get_throttle
+            .throttle(ctx, key_count as usize)
+            .await;
+
        let _timer = crate::metrics::GET_VECTORED_LATENCY
            .for_task_kind(ctx.task_kind())
            .map(|t| t.start_timer());
@@ -1335,49 +1349,49 @@ const REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE: u64 = 10;
 // Private functions
 impl Timeline {
    pub(crate) fn get_lazy_slru_download(&self) -> bool {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .lazy_slru_download
            .unwrap_or(self.conf.default_tenant_conf.lazy_slru_download)
    }

    fn get_checkpoint_distance(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .checkpoint_distance
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_distance)
    }

    fn get_checkpoint_timeout(&self) -> Duration {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .checkpoint_timeout
            .unwrap_or(self.conf.default_tenant_conf.checkpoint_timeout)
    }

    fn get_compaction_target_size(&self) -> u64 {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .compaction_target_size
            .unwrap_or(self.conf.default_tenant_conf.compaction_target_size)
    }

    fn get_compaction_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .compaction_threshold
            .unwrap_or(self.conf.default_tenant_conf.compaction_threshold)
    }

    fn get_image_creation_threshold(&self) -> usize {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .image_creation_threshold
            .unwrap_or(self.conf.default_tenant_conf.image_creation_threshold)
    }

    fn get_eviction_policy(&self) -> EvictionPolicy {
-        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .eviction_policy
            .unwrap_or(self.conf.default_tenant_conf.eviction_policy)
@@ -1393,7 +1407,7 @@ impl Timeline {
    }

    fn get_gc_feedback(&self) -> bool {
-        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf;
+        let tenant_conf = &self.tenant_conf.read().unwrap().tenant_conf.clone();
        tenant_conf
            .gc_feedback
            .unwrap_or(self.conf.default_tenant_conf.gc_feedback)
@@ -1555,6 +1569,8 @@ impl Timeline {

                compaction_lock: tokio::sync::Mutex::default(),
                gc_lock: tokio::sync::Mutex::default(),
+
+                timeline_get_throttle: resources.timeline_get_throttle,
            };
            result.repartition_threshold =
                result.get_checkpoint_distance() / REPARTITION_FREQ_IN_CHECKPOINT_DISTANCE;
@@ -3274,90 +3290,107 @@ impl Timeline {

        for partition in partitioning.parts.iter() {
            let img_range = start..partition.ranges.last().unwrap().end;
-            start = img_range.end;
-            if force || self.time_for_new_image_layer(partition, lsn).await {
-                let mut image_layer_writer = ImageLayerWriter::new(
-                    self.conf,
-                    self.timeline_id,
-                    self.tenant_shard_id,
-                    &img_range,
-                    lsn,
-                )
-                .await?;
+            if !force && !self.time_for_new_image_layer(partition, lsn).await {
+                start = img_range.end;
+                continue;
+            }

-                fail_point!("image-layer-writer-fail-before-finish", |_| {
-                    Err(CreateImageLayersError::Other(anyhow::anyhow!(
-                        "failpoint image-layer-writer-fail-before-finish"
-                    )))
-                });
+            let mut image_layer_writer = ImageLayerWriter::new(
+                self.conf,
+                self.timeline_id,
+                self.tenant_shard_id,
+                &img_range,
+                lsn,
+            )
+            .await?;

-                let mut key_request_accum = KeySpaceAccum::new();
-                for range in &partition.ranges {
-                    let mut key = range.start;
-                    while key < range.end {
-                        if self.shard_identity.is_key_disposable(&key) {
-                            debug!(
-                                "Dropping key {} during compaction (it belongs on shard {:?})",
-                                key,
-                                self.shard_identity.get_shard_number(&key)
-                            );
-                            key = key.next();
-                            continue;
-                        }
+            fail_point!("image-layer-writer-fail-before-finish", |_| {
+                Err(CreateImageLayersError::Other(anyhow::anyhow!(
+                    "failpoint image-layer-writer-fail-before-finish"
+                )))
+            });

+            let mut wrote_keys = false;
+
+            let mut key_request_accum = KeySpaceAccum::new();
+            for range in &partition.ranges {
+                let mut key = range.start;
+                while key < range.end {
+                    // Decide whether to retain this key: usually we do, but sharded tenants may
+                    // need to drop keys that don't belong to them.  If we retain the key, add it
+                    // to `key_request_accum` for later issuing a vectored get
+                    if self.shard_identity.is_key_disposable(&key) {
+                        debug!(
+                            "Dropping key {} during compaction (it belongs on shard {:?})",
+                            key,
+                            self.shard_identity.get_shard_number(&key)
+                        );
+                    } else {
                        key_request_accum.add_key(key);
-                        if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
-                            || key.next() == range.end
-                        {
-                            let results = self
-                                .get_vectored(
-                                    &key_request_accum.consume_keyspace().ranges,
-                                    lsn,
-                                    ctx,
-                                )
-                                .await?;
+                    }

-                            for (img_key, img) in results {
-                                let img = match img {
-                                    Ok(img) => img,
-                                    Err(err) => {
-                                        // If we fail to reconstruct a VM or FSM page, we can zero the
-                                        // page without losing any actual user data. That seems better
-                                        // than failing repeatedly and getting stuck.
-                                        //
-                                        // We had a bug at one point, where we truncated the FSM and VM
-                                        // in the pageserver, but the Postgres didn't know about that
-                                        // and continued to generate incremental WAL records for pages
-                                        // that didn't exist in the pageserver. Trying to replay those
-                                        // WAL records failed to find the previous image of the page.
-                                        // This special case allows us to recover from that situation.
-                                        // See https://github.com/neondatabase/neon/issues/2601.
-                                        //
-                                        // Unfortunately we cannot do this for the main fork, or for
-                                        // any metadata keys, keys, as that would lead to actual data
-                                        // loss.
-                                        if is_rel_fsm_block_key(img_key)
-                                            || is_rel_vm_block_key(img_key)
-                                        {
-                                            warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
-                                            ZERO_PAGE.clone()
-                                        } else {
-                                            return Err(
-                                                CreateImageLayersError::PageReconstructError(err),
-                                            );
-                                        }
+                    let last_key_in_range = key.next() == range.end;
+                    key = key.next();
+
+                    // Maybe flush `key_rest_accum`
+                    if key_request_accum.size() >= Timeline::MAX_GET_VECTORED_KEYS
+                        || last_key_in_range
+                    {
+                        let results = self
+                            .get_vectored(&key_request_accum.consume_keyspace().ranges, lsn, ctx)
+                            .await?;
+
+                        for (img_key, img) in results {
+                            let img = match img {
+                                Ok(img) => img,
+                                Err(err) => {
+                                    // If we fail to reconstruct a VM or FSM page, we can zero the
+                                    // page without losing any actual user data. That seems better
+                                    // than failing repeatedly and getting stuck.
+                                    //
+                                    // We had a bug at one point, where we truncated the FSM and VM
+                                    // in the pageserver, but the Postgres didn't know about that
+                                    // and continued to generate incremental WAL records for pages
+                                    // that didn't exist in the pageserver. Trying to replay those
+                                    // WAL records failed to find the previous image of the page.
+                                    // This special case allows us to recover from that situation.
+                                    // See https://github.com/neondatabase/neon/issues/2601.
+                                    //
+                                    // Unfortunately we cannot do this for the main fork, or for
+                                    // any metadata keys, keys, as that would lead to actual data
+                                    // loss.
+                                    if is_rel_fsm_block_key(img_key) || is_rel_vm_block_key(img_key)
+                                    {
+                                        warn!("could not reconstruct FSM or VM key {img_key}, filling with zeros: {err:?}");
+                                        ZERO_PAGE.clone()
+                                    } else {
+                                        return Err(CreateImageLayersError::PageReconstructError(
+                                            err,
+                                        ));
                                    }
-                                };
+                                }
+                            };

-                                image_layer_writer.put_image(img_key, img).await?;
-                            }
+                            // Write all the keys we just read into our new image layer.
+                            image_layer_writer.put_image(img_key, img).await?;
+                            wrote_keys = true;
                        }
-
-                        key = key.next();
                    }
                }
+            }
+
+            if wrote_keys {
+                // Normal path: we have written some data into the new image layer for this
+                // partition, so flush it to disk.
+                start = img_range.end;
                let image_layer = image_layer_writer.finish(self).await?;
                image_layers.push(image_layer);
+            } else {
+                // Special case: the image layer may be empty if this is a sharded tenant and the
+                // partition does not cover any keys owned by this shard.  In this case, to ensure
+                // we don't leave gaps between image layers, leave `start` where it is, so that the next
+                // layer we write will cover the key range that we just scanned.
+                tracing::debug!("no data in range {}-{}", img_range.start, img_range.end);
            }
        }
        // All layers that the GC wanted us to create have now been created.
@@ -4849,7 +4882,7 @@ mod tests {
            TenantHarness::create("two_layer_eviction_attempts_at_the_same_time").unwrap();

        let ctx = any_context();
-        let tenant = harness.try_load(&ctx).await.unwrap();
+        let tenant = harness.do_try_load(&ctx).await.unwrap();
        let timeline = tenant
            .create_test_timeline(TimelineId::generate(), Lsn(0x10), 14, &ctx)
            .await
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -419,6 +419,7 @@ impl DeleteTimelineFlow {
                TimelineResources {
                    remote_client,
                    deletion_queue_client,
+                    timeline_get_throttle: tenant.timeline_get_throttle.clone(),
                },
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -196,13 +196,13 @@ impl Timeline {
            ControlFlow::Continue(()) => (),
        }

-        #[allow(dead_code)]
        #[derive(Debug, Default)]
        struct EvictionStats {
            candidates: usize,
            evicted: usize,
            errors: usize,
            not_evictable: usize,
+            #[allow(dead_code)]
            skipped_for_shutdown: usize,
        }

--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -582,24 +582,37 @@ impl VirtualFile {
    }

    // Copied from https://doc.rust-lang.org/1.72.0/src/std/os/unix/fs.rs.html#219-235
-    pub async fn write_all_at(&self, mut buf: &[u8], mut offset: u64) -> Result<(), Error> {
+    pub async fn write_all_at<B: BoundedBuf>(
+        &self,
+        buf: B,
+        mut offset: u64,
+    ) -> (B::Buf, Result<(), Error>) {
+        let buf_len = buf.bytes_init();
+        if buf_len == 0 {
+            return (Slice::into_inner(buf.slice_full()), Ok(()));
+        }
+        let mut buf = buf.slice(0..buf_len);
        while !buf.is_empty() {
-            match self.write_at(buf, offset).await {
+            // TODO: push `buf` further down
+            match self.write_at(&buf, offset).await {
                Ok(0) => {
-                    return Err(Error::new(
-                        std::io::ErrorKind::WriteZero,
-                        "failed to write whole buffer",
-                    ));
+                    return (
+                        Slice::into_inner(buf),
+                        Err(Error::new(
+                            std::io::ErrorKind::WriteZero,
+                            "failed to write whole buffer",
+                        )),
+                    );
                }
                Ok(n) => {
-                    buf = &buf[n..];
+                    buf = buf.slice(n..);
                    offset += n as u64;
                }
                Err(ref e) if e.kind() == std::io::ErrorKind::Interrupted => {}
-                Err(e) => return Err(e),
+                Err(e) => return (Slice::into_inner(buf), Err(e)),
            }
        }
-        Ok(())
+        (Slice::into_inner(buf), Ok(()))
    }

    /// Writes `buf.slice(0..buf.bytes_init())`.
@@ -1064,10 +1077,19 @@ mod tests {
                MaybeVirtualFile::File(file) => file.read_exact_at(&mut buf, offset).map(|()| buf),
            }
        }
-        async fn write_all_at(&self, buf: &[u8], offset: u64) -> Result<(), Error> {
+        async fn write_all_at<B: BoundedBuf>(&self, buf: B, offset: u64) -> Result<(), Error> {
            match self {
-                MaybeVirtualFile::VirtualFile(file) => file.write_all_at(buf, offset).await,
-                MaybeVirtualFile::File(file) => file.write_all_at(buf, offset),
+                MaybeVirtualFile::VirtualFile(file) => {
+                    let (_buf, res) = file.write_all_at(buf, offset).await;
+                    res
+                }
+                MaybeVirtualFile::File(file) => {
+                    let buf_len = buf.bytes_init();
+                    if buf_len == 0 {
+                        return Ok(());
+                    }
+                    file.write_all_at(&buf.slice(0..buf_len), offset)
+                }
            }
        }
        async fn seek(&mut self, pos: SeekFrom) -> Result<u64, Error> {
@@ -1214,8 +1236,8 @@ mod tests {
                .to_owned(),
        )
        .await?;
-        file_b.write_all_at(b"BAR", 3).await?;
-        file_b.write_all_at(b"FOO", 0).await?;
+        file_b.write_all_at(b"BAR".to_vec(), 3).await?;
+        file_b.write_all_at(b"FOO".to_vec(), 0).await?;

        assert_eq!(file_b.read_string_at(2, 3).await?, "OBA");

--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1695,22 +1695,22 @@ mod tests {
        let mut m = tline.begin_modification(Lsn(0x20));
        walingest.put_rel_creation(&mut m, TESTREL_A, &ctx).await?;
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x30));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 3"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 3"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1 at 4"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1 at 4"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        let mut m = tline.begin_modification(Lsn(0x50));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 2, TEST_IMG("foo blk 2 at 5"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 2, test_img("foo blk 2 at 5"), &ctx)
            .await?;
        m.commit(&ctx).await?;

@@ -1751,46 +1751,46 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x20)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 2")
+            test_img("foo blk 0 at 2")
        );

        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x30)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
        );

        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x40)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x40)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1 at 4")
+            test_img("foo blk 1 at 4")
        );

        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1 at 4")
+            test_img("foo blk 1 at 4")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 2 at 5")
+            test_img("foo blk 2 at 5")
        );

        // Truncate last block
@@ -1812,13 +1812,13 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 0, Version::Lsn(Lsn(0x60)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 0 at 3")
+            test_img("foo blk 0 at 3")
        );
        assert_eq!(
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x60)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1 at 4")
+            test_img("foo blk 1 at 4")
        );

        // should still see the truncated block with older LSN
@@ -1832,7 +1832,7 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 2, Version::Lsn(Lsn(0x50)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 2 at 5")
+            test_img("foo blk 2 at 5")
        );

        // Truncate to zero length
@@ -1851,7 +1851,7 @@ mod tests {
        // Extend from 0 to 2 blocks, leaving a gap
        let mut m = tline.begin_modification(Lsn(0x70));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 1, TEST_IMG("foo blk 1"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 1, test_img("foo blk 1"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
@@ -1870,13 +1870,13 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1, Version::Lsn(Lsn(0x70)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1")
+            test_img("foo blk 1")
        );

        // Extend a lot more, leaving a big gap that spans across segments
        let mut m = tline.begin_modification(Lsn(0x80));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 1500, TEST_IMG("foo blk 1500"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 1500, test_img("foo blk 1500"), &ctx)
            .await?;
        m.commit(&ctx).await?;
        assert_eq!(
@@ -1897,7 +1897,7 @@ mod tests {
            tline
                .get_rel_page_at_lsn(TESTREL_A, 1500, Version::Lsn(Lsn(0x80)), false, &ctx)
                .await?,
-            TEST_IMG("foo blk 1500")
+            test_img("foo blk 1500")
        );

        Ok(())
@@ -1915,7 +1915,7 @@ mod tests {

        let mut m = tline.begin_modification(Lsn(0x20));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 2"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 2"), &ctx)
            .await?;
        m.commit(&ctx).await?;

@@ -1952,7 +1952,7 @@ mod tests {
        // Re-create it
        let mut m = tline.begin_modification(Lsn(0x40));
        walingest
-            .put_rel_page_image(&mut m, TESTREL_A, 0, TEST_IMG("foo blk 0 at 4"), &ctx)
+            .put_rel_page_image(&mut m, TESTREL_A, 0, test_img("foo blk 0 at 4"), &ctx)
            .await?;
        m.commit(&ctx).await?;

@@ -1990,7 +1990,7 @@ mod tests {
        for blkno in 0..relsize {
            let data = format!("foo blk {} at {}", blkno, Lsn(0x20));
            walingest
-                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
                .await?;
        }
        m.commit(&ctx).await?;
@@ -2028,7 +2028,7 @@ mod tests {
                tline
                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(lsn), false, &ctx)
                    .await?,
-                TEST_IMG(&data)
+                test_img(&data)
            );
        }

@@ -2055,7 +2055,7 @@ mod tests {
                tline
                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x60)), false, &ctx)
                    .await?,
-                TEST_IMG(&data)
+                test_img(&data)
            );
        }

@@ -2073,7 +2073,7 @@ mod tests {
                tline
                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x50)), false, &ctx)
                    .await?,
-                TEST_IMG(&data)
+                test_img(&data)
            );
        }

@@ -2084,7 +2084,7 @@ mod tests {
        for blkno in 0..relsize {
            let data = format!("foo blk {} at {}", blkno, lsn);
            walingest
-                .put_rel_page_image(&mut m, TESTREL_A, blkno, TEST_IMG(&data), &ctx)
+                .put_rel_page_image(&mut m, TESTREL_A, blkno, test_img(&data), &ctx)
                .await?;
        }
        m.commit(&ctx).await?;
@@ -2109,7 +2109,7 @@ mod tests {
                tline
                    .get_rel_page_at_lsn(TESTREL_A, blkno, Version::Lsn(Lsn(0x80)), false, &ctx)
                    .await?,
-                TEST_IMG(&data)
+                test_img(&data)
            );
        }

@@ -2130,7 +2130,7 @@ mod tests {
        for blknum in 0..RELSEG_SIZE + 1 {
            lsn += 0x10;
            let mut m = tline.begin_modification(Lsn(lsn));
-            let img = TEST_IMG(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
+            let img = test_img(&format!("foo blk {} at {}", blknum, Lsn(lsn)));
            walingest
                .put_rel_page_image(&mut m, TESTREL_A, blknum as BlockNumber, img, &ctx)
                .await?;
--- a/pageserver/src/walrecord.rs
+++ b/pageserver/src/walrecord.rs
@@ -44,6 +44,11 @@ pub enum NeonWalRecord {
        moff: MultiXactOffset,
        members: Vec<MultiXactMember>,
    },
+    /// Update the map of AUX files, either writing or dropping an entry
+    AuxFile {
+        file_path: String,
+        content: Option<Bytes>,
+    },
 }

 impl NeonWalRecord {
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -22,7 +22,7 @@
 mod process;

 /// Code to apply [`NeonWalRecord`]s.
-mod apply_neon;
+pub(crate) mod apply_neon;

 use crate::config::PageServerConf;
 use crate::metrics::{
--- a/pageserver/src/walredo/apply_neon.rs
+++ b/pageserver/src/walredo/apply_neon.rs
@@ -1,7 +1,8 @@
+use crate::pgdatadir_mapping::AuxFilesDirectory;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use byteorder::{ByteOrder, LittleEndian};
-use bytes::BytesMut;
+use bytes::{BufMut, BytesMut};
 use pageserver_api::key::{key_to_rel_block, key_to_slru_block, Key};
 use pageserver_api::reltag::SlruKind;
 use postgres_ffi::pg_constants;
@@ -12,6 +13,7 @@ use postgres_ffi::v14::nonrelfile_utils::{
 };
 use postgres_ffi::BLCKSZ;
 use tracing::*;
+use utils::bin_ser::BeSer;

 /// Can this request be served by neon redo functions
 /// or we need to pass it to wal-redo postgres process?
@@ -230,6 +232,72 @@ pub(crate) fn apply_in_neon(
                LittleEndian::write_u32(&mut page[memberoff..memberoff + 4], member.xid);
            }
        }
+        NeonWalRecord::AuxFile { file_path, content } => {
+            let mut dir = AuxFilesDirectory::des(page)?;
+            dir.upsert(file_path.clone(), content.clone());
+
+            page.clear();
+            let mut writer = page.writer();
+            dir.ser_into(&mut writer)?;
+        }
    }
    Ok(())
 }
+
+#[cfg(test)]
+mod test {
+    use bytes::Bytes;
+    use pageserver_api::key::AUX_FILES_KEY;
+
+    use super::*;
+    use std::collections::HashMap;
+
+    use crate::{pgdatadir_mapping::AuxFilesDirectory, walrecord::NeonWalRecord};
+
+    /// Test [`apply_in_neon`]'s handling of NeonWalRecord::AuxFile
+    #[test]
+    fn apply_aux_file_deltas() -> anyhow::Result<()> {
+        let base_dir = AuxFilesDirectory {
+            files: HashMap::from([
+                ("two".to_string(), Bytes::from_static(b"content0")),
+                ("three".to_string(), Bytes::from_static(b"contentX")),
+            ]),
+        };
+        let base_image = AuxFilesDirectory::ser(&base_dir)?;
+
+        let deltas = vec![
+            // Insert
+            NeonWalRecord::AuxFile {
+                file_path: "one".to_string(),
+                content: Some(Bytes::from_static(b"content1")),
+            },
+            // Update
+            NeonWalRecord::AuxFile {
+                file_path: "two".to_string(),
+                content: Some(Bytes::from_static(b"content99")),
+            },
+            // Delete
+            NeonWalRecord::AuxFile {
+                file_path: "three".to_string(),
+                content: None,
+            },
+        ];
+
+        let file_path = AUX_FILES_KEY;
+        let mut page = BytesMut::from_iter(base_image);
+
+        for record in deltas {
+            apply_in_neon(&record, file_path, &mut page)?;
+        }
+
+        let reconstructed = AuxFilesDirectory::des(&page)?;
+        let expect = HashMap::from([
+            ("one".to_string(), Bytes::from_static(b"content1")),
+            ("two".to_string(), Bytes::from_static(b"content99")),
+        ]);
+
+        assert_eq!(reconstructed.files, expect);
+
+        Ok(())
+    }
+}
--- a/pgxn/neon_test_utils/neon_test_utils--1.0.sql
+++ b/pgxn/neon_test_utils/neon_test_utils--1.0.sql
@@ -7,6 +7,24 @@ AS 'MODULE_PATHNAME', 'test_consume_xids'
 LANGUAGE C STRICT
 PARALLEL UNSAFE;

+CREATE FUNCTION test_consume_cpu(seconds int)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'test_consume_cpu'
+LANGUAGE C STRICT
+PARALLEL UNSAFE;
+
+CREATE FUNCTION test_consume_memory(megabytes int)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'test_consume_memory'
+LANGUAGE C STRICT
+PARALLEL UNSAFE;
+
+CREATE FUNCTION test_release_memory(megabytes int DEFAULT NULL)
+RETURNS VOID
+AS 'MODULE_PATHNAME', 'test_release_memory'
+LANGUAGE C
+PARALLEL UNSAFE;
+
 CREATE FUNCTION clear_buffer_cache()
 RETURNS VOID
 AS 'MODULE_PATHNAME', 'clear_buffer_cache'
--- a/pgxn/neon_test_utils/neon_test_utils.control
+++ b/pgxn/neon_test_utils/neon_test_utils.control
@@ -3,3 +3,4 @@ comment = 'helpers for neon testing and debugging'
 default_version = '1.0'
 module_pathname = '$libdir/neon_test_utils'
 relocatable = true
+trusted = true
--- a/pgxn/neon_test_utils/neontest.c
+++ b/pgxn/neon_test_utils/neontest.c
@@ -21,10 +21,12 @@
 #include "miscadmin.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
+#include "storage/fd.h"
 #include "utils/builtins.h"
 #include "utils/pg_lsn.h"
 #include "utils/rel.h"
 #include "utils/varlena.h"
+#include "utils/wait_event.h"
 #include "../neon/pagestore_client.h"

 PG_MODULE_MAGIC;
@@ -32,6 +34,9 @@ PG_MODULE_MAGIC;
 extern void _PG_init(void);

 PG_FUNCTION_INFO_V1(test_consume_xids);
+PG_FUNCTION_INFO_V1(test_consume_cpu);
+PG_FUNCTION_INFO_V1(test_consume_memory);
+PG_FUNCTION_INFO_V1(test_release_memory);
 PG_FUNCTION_INFO_V1(clear_buffer_cache);
 PG_FUNCTION_INFO_V1(get_raw_page_at_lsn);
 PG_FUNCTION_INFO_V1(get_raw_page_at_lsn_ex);
@@ -97,6 +102,119 @@ test_consume_xids(PG_FUNCTION_ARGS)
 	PG_RETURN_VOID();
 }

+
+/*
+ * test_consume_cpu(seconds int). Keeps one CPU busy for the given number of seconds.
+ */
+Datum
+test_consume_cpu(PG_FUNCTION_ARGS)
+{
+	int32		seconds = PG_GETARG_INT32(0);
+	TimestampTz start;
+	uint64		total_iterations = 0;
+
+	start = GetCurrentTimestamp();
+
+	for (;;)
+	{
+		TimestampTz elapsed;
+
+		elapsed = GetCurrentTimestamp() - start;
+		if (elapsed > (TimestampTz) seconds * USECS_PER_SEC)
+			break;
+
+		/* keep spinning */
+		for (int i = 0; i < 1000000; i++)
+			total_iterations++;
+		elog(DEBUG2, "test_consume_cpu(): %lu iterations in total", total_iterations);
+
+		CHECK_FOR_INTERRUPTS();
+	}
+
+	PG_RETURN_VOID();
+}
+
+static MemoryContext consume_cxt = NULL;
+static slist_head consumed_memory_chunks;
+static int64 num_memory_chunks;
+
+/*
+ * test_consume_memory(megabytes int).
+ *
+ * Consume given amount of memory. The allocation is made in TopMemoryContext,
+ * so it outlives the function, until you call test_release_memory to
+ * explicitly release it, or close the session.
+ */
+Datum
+test_consume_memory(PG_FUNCTION_ARGS)
+{
+	int32		megabytes = PG_GETARG_INT32(0);
+
+	/*
+	 * Consume the memory in a new memory context, so that it's convenient to
+	 * release and to display it separately in a possible memory context dump.
+	 */
+	if (consume_cxt == NULL)
+		consume_cxt = AllocSetContextCreate(TopMemoryContext,
+											"test_consume_memory",
+											ALLOCSET_DEFAULT_SIZES);
+
+	for (int32 i = 0; i < megabytes; i++)
+	{
+		char	   *p;
+
+		p = MemoryContextAllocZero(consume_cxt, 1024 * 1024);
+
+		/* touch the memory, so that it's really allocated by the kernel */
+		for (int j = 0; j < 1024 * 1024; j += 1024)
+			p[j] = j % 0xFF;
+
+		slist_push_head(&consumed_memory_chunks, (slist_node *) p);
+		num_memory_chunks++;
+	}
+
+	PG_RETURN_VOID();
+}
+
+/*
+ * test_release_memory(megabytes int). NULL releases all
+ */
+Datum
+test_release_memory(PG_FUNCTION_ARGS)
+{
+	TimestampTz start;
+
+	if (PG_ARGISNULL(0))
+	{
+		if (consume_cxt)
+		{
+			MemoryContextDelete(consume_cxt);
+			consume_cxt = NULL;
+			num_memory_chunks = 0;
+		}
+	}
+	else
+	{
+		int32		chunks_to_release = PG_GETARG_INT32(0);
+
+		if (chunks_to_release > num_memory_chunks)
+		{
+			elog(WARNING, "only %lu MB is consumed, releasing it all", num_memory_chunks);
+			chunks_to_release = num_memory_chunks;
+		}
+
+		for (int32 i = 0; i < chunks_to_release; i++)
+		{
+			slist_node *chunk = slist_pop_head_node(&consumed_memory_chunks);
+
+			pfree(chunk);
+			num_memory_chunks--;
+		}
+	}
+
+	PG_RETURN_VOID();
+}
+
 /*
 * Flush the buffer cache, evicting all pages that are not currently pinned.
 */
--- a/poetry.lock
+++ b/poetry.lock
@@ -836,43 +836,43 @@ files = [

 [[package]]
 name = "cryptography"
-version = "42.0.0"
+version = "42.0.2"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:c640b0ef54138fde761ec99a6c7dc4ce05e80420262c20fa239e694ca371d434"},
-    {file = "cryptography-42.0.0-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:678cfa0d1e72ef41d48993a7be75a76b0725d29b820ff3cfd606a5b2b33fda01"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:146e971e92a6dd042214b537a726c9750496128453146ab0ee8971a0299dc9bd"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:87086eae86a700307b544625e3ba11cc600c3c0ef8ab97b0fda0705d6db3d4e3"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:0a68bfcf57a6887818307600c3c0ebc3f62fbb6ccad2240aa21887cda1f8df1b"},
-    {file = "cryptography-42.0.0-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:5a217bca51f3b91971400890905a9323ad805838ca3fa1e202a01844f485ee87"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:ca20550bb590db16223eb9ccc5852335b48b8f597e2f6f0878bbfd9e7314eb17"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:33588310b5c886dfb87dba5f013b8d27df7ffd31dc753775342a1e5ab139e59d"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:9515ea7f596c8092fdc9902627e51b23a75daa2c7815ed5aa8cf4f07469212ec"},
-    {file = "cryptography-42.0.0-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:35cf6ed4c38f054478a9df14f03c1169bb14bd98f0b1705751079b25e1cb58bc"},
-    {file = "cryptography-42.0.0-cp37-abi3-win32.whl", hash = "sha256:8814722cffcfd1fbd91edd9f3451b88a8f26a5fd41b28c1c9193949d1c689dc4"},
-    {file = "cryptography-42.0.0-cp37-abi3-win_amd64.whl", hash = "sha256:a2a8d873667e4fd2f34aedab02ba500b824692c6542e017075a2efc38f60a4c0"},
-    {file = "cryptography-42.0.0-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:8fedec73d590fd30c4e3f0d0f4bc961aeca8390c72f3eaa1a0874d180e868ddf"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:be41b0c7366e5549265adf2145135dca107718fa44b6e418dc7499cfff6b4689"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3ca482ea80626048975360c8e62be3ceb0f11803180b73163acd24bf014133a0"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:c58115384bdcfe9c7f644c72f10f6f42bed7cf59f7b52fe1bf7ae0a622b3a139"},
-    {file = "cryptography-42.0.0-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:56ce0c106d5c3fec1038c3cca3d55ac320a5be1b44bf15116732d0bc716979a2"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:324721d93b998cb7367f1e6897370644751e5580ff9b370c0a50dc60a2003513"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:d97aae66b7de41cdf5b12087b5509e4e9805ed6f562406dfcf60e8481a9a28f8"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:85f759ed59ffd1d0baad296e72780aa62ff8a71f94dc1ab340386a1207d0ea81"},
-    {file = "cryptography-42.0.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:206aaf42e031b93f86ad60f9f5d9da1b09164f25488238ac1dc488334eb5e221"},
-    {file = "cryptography-42.0.0-cp39-abi3-win32.whl", hash = "sha256:74f18a4c8ca04134d2052a140322002fef535c99cdbc2a6afc18a8024d5c9d5b"},
-    {file = "cryptography-42.0.0-cp39-abi3-win_amd64.whl", hash = "sha256:14e4b909373bc5bf1095311fa0f7fcabf2d1a160ca13f1e9e467be1ac4cbdf94"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:3005166a39b70c8b94455fdbe78d87a444da31ff70de3331cdec2c568cf25b7e"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:be14b31eb3a293fc6e6aa2807c8a3224c71426f7c4e3639ccf1a2f3ffd6df8c3"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:bd7cf7a8d9f34cc67220f1195884151426ce616fdc8285df9054bfa10135925f"},
-    {file = "cryptography-42.0.0-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:c310767268d88803b653fffe6d6f2f17bb9d49ffceb8d70aed50ad45ea49ab08"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:bdce70e562c69bb089523e75ef1d9625b7417c6297a76ac27b1b8b1eb51b7d0f"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:e9326ca78111e4c645f7e49cbce4ed2f3f85e17b61a563328c85a5208cf34440"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:69fd009a325cad6fbfd5b04c711a4da563c6c4854fc4c9544bff3088387c77c0"},
-    {file = "cryptography-42.0.0-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:988b738f56c665366b1e4bfd9045c3efae89ee366ca3839cd5af53eaa1401bce"},
-    {file = "cryptography-42.0.0.tar.gz", hash = "sha256:6cf9b76d6e93c62114bd19485e5cb003115c134cf9ce91f8ac924c44f8c8c3f4"},
+    {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:701171f825dcab90969596ce2af253143b93b08f1a716d4b2a9d2db5084ef7be"},
+    {file = "cryptography-42.0.2-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:61321672b3ac7aade25c40449ccedbc6db72c7f5f0fdf34def5e2f8b51ca530d"},
+    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ea2c3ffb662fec8bbbfce5602e2c159ff097a4631d96235fcf0fb00e59e3ece4"},
+    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:3b15c678f27d66d247132cbf13df2f75255627bcc9b6a570f7d2fd08e8c081d2"},
+    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:8e88bb9eafbf6a4014d55fb222e7360eef53e613215085e65a13290577394529"},
+    {file = "cryptography-42.0.2-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:a047682d324ba56e61b7ea7c7299d51e61fd3bca7dad2ccc39b72bd0118d60a1"},
+    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:36d4b7c4be6411f58f60d9ce555a73df8406d484ba12a63549c88bd64f7967f1"},
+    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:a00aee5d1b6c20620161984f8ab2ab69134466c51f58c052c11b076715e72929"},
+    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:b97fe7d7991c25e6a31e5d5e795986b18fbbb3107b873d5f3ae6dc9a103278e9"},
+    {file = "cryptography-42.0.2-cp37-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:5fa82a26f92871eca593b53359c12ad7949772462f887c35edaf36f87953c0e2"},
+    {file = "cryptography-42.0.2-cp37-abi3-win32.whl", hash = "sha256:4b063d3413f853e056161eb0c7724822a9740ad3caa24b8424d776cebf98e7ee"},
+    {file = "cryptography-42.0.2-cp37-abi3-win_amd64.whl", hash = "sha256:841ec8af7a8491ac76ec5a9522226e287187a3107e12b7d686ad354bb78facee"},
+    {file = "cryptography-42.0.2-cp39-abi3-macosx_10_12_universal2.whl", hash = "sha256:55d1580e2d7e17f45d19d3b12098e352f3a37fe86d380bf45846ef257054b242"},
+    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28cb2c41f131a5758d6ba6a0504150d644054fd9f3203a1e8e8d7ac3aea7f73a"},
+    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b9097a208875fc7bbeb1286d0125d90bdfed961f61f214d3f5be62cd4ed8a446"},
+    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:44c95c0e96b3cb628e8452ec060413a49002a247b2b9938989e23a2c8291fc90"},
+    {file = "cryptography-42.0.2-cp39-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2f9f14185962e6a04ab32d1abe34eae8a9001569ee4edb64d2304bf0d65c53f3"},
+    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:09a77e5b2e8ca732a19a90c5bca2d124621a1edb5438c5daa2d2738bfeb02589"},
+    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:ad28cff53f60d99a928dfcf1e861e0b2ceb2bc1f08a074fdd601b314e1cc9e0a"},
+    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:130c0f77022b2b9c99d8cebcdd834d81705f61c68e91ddd614ce74c657f8b3ea"},
+    {file = "cryptography-42.0.2-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:fa3dec4ba8fb6e662770b74f62f1a0c7d4e37e25b58b2bf2c1be4c95372b4a33"},
+    {file = "cryptography-42.0.2-cp39-abi3-win32.whl", hash = "sha256:3dbd37e14ce795b4af61b89b037d4bc157f2cb23e676fa16932185a04dfbf635"},
+    {file = "cryptography-42.0.2-cp39-abi3-win_amd64.whl", hash = "sha256:8a06641fb07d4e8f6c7dda4fc3f8871d327803ab6542e33831c7ccfdcb4d0ad6"},
+    {file = "cryptography-42.0.2-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:087887e55e0b9c8724cf05361357875adb5c20dec27e5816b653492980d20380"},
+    {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:a7ef8dd0bf2e1d0a27042b231a3baac6883cdd5557036f5e8df7139255feaac6"},
+    {file = "cryptography-42.0.2-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4383b47f45b14459cab66048d384614019965ba6c1a1a141f11b5a551cace1b2"},
+    {file = "cryptography-42.0.2-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:fbeb725c9dc799a574518109336acccaf1303c30d45c075c665c0793c2f79a7f"},
+    {file = "cryptography-42.0.2-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:320948ab49883557a256eab46149df79435a22d2fefd6a66fe6946f1b9d9d008"},
+    {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5ef9bc3d046ce83c4bbf4c25e1e0547b9c441c01d30922d812e887dc5f125c12"},
+    {file = "cryptography-42.0.2-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:52ed9ebf8ac602385126c9a2fe951db36f2cb0c2538d22971487f89d0de4065a"},
+    {file = "cryptography-42.0.2-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:141e2aa5ba100d3788c0ad7919b288f89d1fe015878b9659b307c9ef867d3a65"},
+    {file = "cryptography-42.0.2.tar.gz", hash = "sha256:e0ec52ba3c7f1b7d813cd52649a5b3ef1fc0d433219dc8c93827c57eab6cf888"},
 ]

 [package.dependencies]
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -234,9 +234,11 @@ async fn main() -> anyhow::Result<()> {
    let endpoint_rate_limiter = Arc::new(EndpointRateLimiter::new(&config.endpoint_rps_limit));
    let cancel_map = CancelMap::default();
    let redis_publisher = match &args.redis_notifications {
-        Some(url) => Some(Arc::new(Mutex::new(
-            RedisPublisherClient::new(url, args.region.clone(), &config.redis_rps_limit).await?,
-        ))),
+        Some(url) => Some(Arc::new(Mutex::new(RedisPublisherClient::new(
+            url,
+            args.region.clone(),
+            &config.redis_rps_limit,
+        )?))),
        None => None,
    };
    let cancellation_handler = Arc::new(CancellationHandler::new(
--- a/proxy/src/context/parquet.rs
+++ b/proxy/src/context/parquet.rs
@@ -13,7 +13,7 @@ use parquet::{
    },
    record::RecordWriter,
 };
-use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig};
+use remote_storage::{GenericRemoteStorage, RemotePath, RemoteStorageConfig, TimeoutOrCancel};
 use tokio::{sync::mpsc, time};
 use tokio_util::sync::CancellationToken;
 use tracing::{debug, info, Span};
@@ -314,20 +314,23 @@ async fn upload_parquet(
    let path = RemotePath::from_string(&format!(
        "{year:04}/{month:02}/{day:02}/{hour:02}/requests_{id}.parquet"
    ))?;
+    let cancel = CancellationToken::new();
    backoff::retry(
        || async {
            let stream = futures::stream::once(futures::future::ready(Ok(data.clone())));
-            storage.upload(stream, data.len(), &path, None).await
+            storage
+                .upload(stream, data.len(), &path, None, &cancel)
+                .await
        },
-        |_e| false,
+        TimeoutOrCancel::caused_by_cancel,
        FAILED_UPLOAD_WARN_THRESHOLD,
        FAILED_UPLOAD_MAX_RETRIES,
        "request_data_upload",
        // we don't want cancellation to interrupt here, so we make a dummy cancel token
-        &CancellationToken::new(),
+        &cancel,
    )
    .await
-    .ok_or_else(|| anyhow::anyhow!("Cancelled"))
+    .ok_or_else(|| anyhow::Error::new(TimeoutOrCancel::Cancel))
    .and_then(|x| x)
    .context("request_data_upload")?;

@@ -413,7 +416,8 @@ mod tests {
                    )
                    .unwrap(),
                    max_keys_per_list_response: DEFAULT_MAX_KEYS_PER_LIST_RESPONSE,
-                })
+                }),
+                timeout: RemoteStorageConfig::DEFAULT_TIMEOUT,
            })
        );
        assert_eq!(parquet_upload.parquet_upload_row_group_size, 100);
@@ -466,6 +470,7 @@ mod tests {
    ) -> Vec<(u64, usize, i64)> {
        let remote_storage_config = RemoteStorageConfig {
            storage: RemoteStorageKind::LocalFs(tmpdir.to_path_buf()),
+            timeout: std::time::Duration::from_secs(120),
        };
        let storage = GenericRemoteStorage::from_config(&remote_storage_config).unwrap();

--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -331,6 +331,7 @@ pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
        compute: node,
        req: _request_gauge,
        conn: _client_gauge,
+        cancel: session,
    }))
 }

--- a/proxy/src/proxy/connect_compute.rs
+++ b/proxy/src/proxy/connect_compute.rs
@@ -122,25 +122,24 @@ where

    error!(error = ?err, "could not connect to compute node");

-    let node_info =
-        if err.get_error_kind() == crate::error::ErrorKind::Postgres || !node_info.cached() {
-            // If the error is Postgres, that means that we managed to connect to the compute node, but there was an error.
-            // Do not need to retrieve a new node_info, just return the old one.
-            if !err.should_retry(num_retries) {
-                return Err(err.into());
-            }
-            node_info
-        } else {
-            // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-            info!("compute node's state has likely changed; requesting a wake-up");
-            ctx.latency_timer.cache_miss();
-            let old_node_info = invalidate_cache(node_info);
-            let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
-            node_info.reuse_settings(old_node_info);
+    let node_info = if !node_info.cached() {
+        // If we just recieved this from cplane and dodn't get it from cache, we shouldn't retry.
+        // Do not need to retrieve a new node_info, just return the old one.
+        if !err.should_retry(num_retries) {
+            return Err(err.into());
+        }
+        node_info
+    } else {
+        // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
+        info!("compute node's state has likely changed; requesting a wake-up");
+        ctx.latency_timer.cache_miss();
+        let old_node_info = invalidate_cache(node_info);
+        let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
+        node_info.reuse_settings(old_node_info);

-            mechanism.update_connect_config(&mut node_info.config);
-            node_info
-        };
+        mechanism.update_connect_config(&mut node_info.config);
+        node_info
+    };

    // now that we have a new node, try connect to it repeatedly.
    // this can error for a few reasons, for instance:
--- a/proxy/src/proxy/passthrough.rs
+++ b/proxy/src/proxy/passthrough.rs
@@ -1,4 +1,5 @@
 use crate::{
+    cancellation,
    compute::PostgresConnection,
    console::messages::MetricsAuxInfo,
    metrics::NUM_BYTES_PROXIED_COUNTER,
@@ -57,6 +58,7 @@ pub struct ProxyPassthrough<S> {

    pub req: IntCounterPairGuard,
    pub conn: IntCounterPairGuard,
+    pub cancel: cancellation::Session,
 }

 impl<S: AsyncRead + AsyncWrite + Unpin> ProxyPassthrough<S> {
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -375,8 +375,6 @@ enum ConnectAction {
    Connect,
    Retry,
    Fail,
-    RetryPg,
-    FailPg,
 }

 #[derive(Clone)]
@@ -466,14 +464,6 @@ impl ConnectMechanism for TestConnectMechanism {
                retryable: false,
                kind: ErrorKind::Compute,
            }),
-            ConnectAction::FailPg => Err(TestConnectError {
-                retryable: false,
-                kind: ErrorKind::Postgres,
-            }),
-            ConnectAction::RetryPg => Err(TestConnectError {
-                retryable: true,
-                kind: ErrorKind::Postgres,
-            }),
            x => panic!("expecting action {:?}, connect is called instead", x),
        }
    }
@@ -572,32 +562,6 @@ async fn connect_to_compute_retry() {
    mechanism.verify();
 }

-#[tokio::test]
-async fn connect_to_compute_retry_pg() {
-    let _ = env_logger::try_init();
-    use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Wake, RetryPg, Connect]);
-    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
-        .await
-        .unwrap();
-    mechanism.verify();
-}
-
-#[tokio::test]
-async fn connect_to_compute_fail_pg() {
-    let _ = env_logger::try_init();
-    use ConnectAction::*;
-    let mut ctx = RequestMonitoring::test();
-    let mechanism = TestConnectMechanism::new(vec![Wake, FailPg]);
-    let user_info = helper_create_connect_info(&mechanism);
-    connect_to_compute(&mut ctx, &mechanism, &user_info, false)
-        .await
-        .unwrap_err();
-    mechanism.verify();
-}
-
 /// Test that we don't retry if the error is not retryable.
 #[tokio::test]
 async fn connect_to_compute_non_retry_1() {
--- a/proxy/src/redis/publisher.rs
+++ b/proxy/src/redis/publisher.rs
@@ -1,12 +1,5 @@
-use std::hash::BuildHasherDefault;
-
-use lasso::{Spur, ThreadedRodeo};
 use pq_proto::CancelKeyData;
-use redis::{
-    streams::{StreamReadOptions, StreamReadReply},
-    AsyncCommands, FromRedisValue,
-};
-use rustc_hash::FxHasher;
+use redis::AsyncCommands;
 use uuid::Uuid;

 use crate::rate_limiter::{RateBucketInfo, RedisRateLimiter};
@@ -21,74 +14,15 @@ pub struct RedisPublisherClient {
 }

 impl RedisPublisherClient {
-    pub async fn new(
+    pub fn new(
        url: &str,
        region_id: String,
        info: &'static [RateBucketInfo],
    ) -> anyhow::Result<Self> {
        let client = redis::Client::open(url)?;
-
-        let mut conn = client.get_async_connection().await.unwrap();
-
-        let len: u64 = conn.xlen("neon:endpoints:testing").await.unwrap();
-        tracing::info!("starting with {len} endpoints in redis");
-
-        for i in len..300000u64 {
-            let _key: String = conn
-                .xadd(
-                    "neon:endpoints:testing",
-                    "*",
-                    &[("endpoint_id", format!("ep-hello-world-{i}"))],
-                )
-                .await
-                .unwrap();
-            if i.is_power_of_two() {
-                tracing::debug!("endpoints written {i}");
-            }
-            // client.
-        }
-        tracing::info!("written endpoints to redis");
-
-        // start reading
-
-        let s = ThreadedRodeo::<Spur, BuildHasherDefault<FxHasher>>::with_hasher(
-            BuildHasherDefault::default(),
-        );
-        let opts = StreamReadOptions::default().count(1000);
-        let mut id = "0-0".to_string();
-        loop {
-            let mut res: StreamReadReply = conn
-                .xread_options(&["neon:endpoints:testing"], &[&id], &opts)
-                .await
-                .unwrap();
-            if res.keys.is_empty() {
-                break;
-            }
-
-            assert_eq!(res.keys.len(), 1);
-            let res = res.keys.pop().unwrap();
-            assert_eq!(res.key, "neon:endpoints:testing");
-
-            if res.ids.is_empty() {
-                break;
-            }
-            for x in res.ids {
-                id = x.id;
-                if let Some(ep) = x.map.get("endpoint_id") {
-                    let ep = String::from_redis_value(ep).unwrap();
-                    s.get_or_intern(ep);
-
-                    if s.len().is_power_of_two() {
-                        tracing::debug!("endpoints read {}", s.len());
-                    }
-                }
-            }
-        }
-        tracing::info!("read {} endpoints from redis", s.len());
-
        Ok(Self {
            client,
-            publisher: Some(conn),
+            publisher: None,
            region_id,
            limiter: RedisRateLimiter::new(info),
        })
--- a/s3_scrubber/src/cloud_admin_api.rs
+++ b/s3_scrubber/src/cloud_admin_api.rs
@@ -1,11 +1,7 @@
-#![allow(unused)]
-
-use std::str::FromStr;
 use std::time::Duration;

 use chrono::{DateTime, Utc};
 use hex::FromHex;
-use pageserver::tenant::Tenant;
 use reqwest::{header, Client, StatusCode, Url};
 use serde::Deserialize;
 use tokio::sync::Semaphore;
@@ -290,7 +286,7 @@ impl CloudAdminApiClient {
                    tokio::time::sleep(Duration::from_millis(500)).await;
                    continue;
                }
-                status => {
+                _status => {
                    return Err(Error::new(
                        "List active projects".to_string(),
                        ErrorKind::ResponseStatus(response.status()),
--- a/safekeeper/src/auth.rs
+++ b/safekeeper/src/auth.rs
@@ -12,8 +12,12 @@ pub fn check_permission(claims: &Claims, tenant_id: Option<TenantId>) -> Result<
            }
            Ok(())
        }
-        (Scope::PageServerApi, _) => Err(AuthError(
-            "PageServerApi scope makes no sense for Safekeeper".into(),
+        (Scope::PageServerApi | Scope::GenerationsApi, _) => Err(AuthError(
+            format!(
+                "JWT scope '{:?}' is ineligible for Safekeeper auth",
+                claims.scope
+            )
+            .into(),
        )),
        (Scope::SafekeeperData, _) => Ok(()),
    }
--- a/safekeeper/src/wal_backup.rs
+++ b/safekeeper/src/wal_backup.rs
@@ -511,7 +511,11 @@ async fn backup_object(

    let file = tokio_util::io::ReaderStream::with_capacity(file, BUFFER_SIZE);

-    storage.upload_storage_object(file, size, target_file).await
+    let cancel = CancellationToken::new();
+
+    storage
+        .upload_storage_object(file, size, target_file, &cancel)
+        .await
 }

 pub async fn read_object(
@@ -526,8 +530,10 @@ pub async fn read_object(

    info!("segment download about to start from remote path {file_path:?} at offset {offset}");

+    let cancel = CancellationToken::new();
+
    let download = storage
-        .download_storage_object(Some((offset, None)), file_path)
+        .download_storage_object(Some((offset, None)), file_path, &cancel)
        .await
        .with_context(|| {
            format!("Failed to open WAL segment download stream for remote path {file_path:?}")
@@ -559,7 +565,8 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
    // Note: listing segments might take a long time if there are many of them.
    // We don't currently have http requests timeout cancellation, but if/once
    // we have listing should get streaming interface to make progress.
-    let token = CancellationToken::new(); // not really used
+
+    let cancel = CancellationToken::new(); // not really used
    backoff::retry(
        || async {
            // Do list-delete in batch_size batches to make progress even if there a lot of files.
@@ -567,7 +574,7 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
            // I'm not sure deleting while iterating is expected in s3.
            loop {
                let files = storage
-                    .list_files(Some(&remote_path), Some(batch_size))
+                    .list_files(Some(&remote_path), Some(batch_size), &cancel)
                    .await?;
                if files.is_empty() {
                    return Ok(()); // done
@@ -580,14 +587,15 @@ pub async fn delete_timeline(ttid: &TenantTimelineId) -> Result<()> {
                    files.first().unwrap().object_name().unwrap_or(""),
                    files.last().unwrap().object_name().unwrap_or("")
                );
-                storage.delete_objects(&files).await?;
+                storage.delete_objects(&files, &cancel).await?;
            }
        },
+        // consider TimeoutOrCancel::caused_by_cancel when using cancellation
        |_| false,
        3,
        10,
        "executing WAL segments deletion batch",
-        &token,
+        &cancel,
    )
    .await
    .ok_or_else(|| anyhow::anyhow!("canceled"))
@@ -617,7 +625,12 @@ pub async fn copy_s3_segments(

    let remote_path = RemotePath::new(&relative_dst_path)?;

-    let files = storage.list_files(Some(&remote_path), None).await?;
+    let cancel = CancellationToken::new();
+
+    let files = storage
+        .list_files(Some(&remote_path), None, &cancel)
+        .await?;
+
    let uploaded_segments = &files
        .iter()
        .filter_map(|file| file.object_name().map(ToOwned::to_owned))
@@ -645,7 +658,7 @@ pub async fn copy_s3_segments(
        let from = RemotePath::new(&relative_src_path.join(&segment_name))?;
        let to = RemotePath::new(&relative_dst_path.join(&segment_name))?;

-        storage.copy_object(&from, &to).await?;
+        storage.copy_object(&from, &to, &cancel).await?;
    }

    info!(
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3967,27 +3967,24 @@ def list_files_to_compare(pgdata_dir: Path) -> List[str]:

 # pg is the existing and running compute node, that we want to compare with a basebackup
 def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint: Endpoint):
-    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
-
    # Get the timeline ID. We need it for the 'basebackup' command
    timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])

+    # many tests already checkpoint, but do it just in case
+    with closing(endpoint.connect()) as conn:
+        with conn.cursor() as cur:
+            cur.execute("CHECKPOINT")
+
+    # wait for pageserver to catch up
+    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
    # stop postgres to ensure that files won't change
    endpoint.stop()

-    # Read the shutdown checkpoint's LSN
-    pg_controldata_path = os.path.join(pg_bin.pg_bin_path, "pg_controldata")
-    cmd = f"{pg_controldata_path} -D {endpoint.pgdata_dir}"
-    result = subprocess.run(cmd, capture_output=True, text=True, shell=True)
-    checkpoint_lsn = re.findall(
-        "Latest checkpoint location:\\s+([0-9A-F]+/[0-9A-F]+)", result.stdout
-    )[0]
-    log.debug(f"last checkpoint at {checkpoint_lsn}")
-
    # Take a basebackup from pageserver
    restored_dir_path = env.repo_dir / f"{endpoint.endpoint_id}_restored_datadir"
    restored_dir_path.mkdir(exist_ok=True)

+    pg_bin = PgBin(test_output_dir, env.pg_distrib_dir, env.pg_version)
    psql_path = os.path.join(pg_bin.pg_bin_path, "psql")

    pageserver_id = env.attachment_service.locate(endpoint.tenant_id)[0]["node_id"]
@@ -3995,7 +3992,7 @@ def check_restored_datadir_content(test_output_dir: Path, env: NeonEnv, endpoint
        {psql_path}                                    \
            --no-psqlrc                                \
            postgres://localhost:{env.get_pageserver(pageserver_id).service_port.pg}  \
-            -c 'basebackup {endpoint.tenant_id} {timeline_id} {checkpoint_lsn}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline_id}'  \
         | tar -x -C {restored_dir_path}
    """

--- a/test_runner/fixtures/parametrize.py
+++ b/test_runner/fixtures/parametrize.py
@@ -2,57 +2,58 @@ import os
 from typing import Optional

 import pytest
-from _pytest.fixtures import FixtureRequest
 from _pytest.python import Metafunc

 from fixtures.pg_version import PgVersion

 """
-Dynamically parametrize tests by Postgres version, build type (debug/release/remote), and possibly by other parameters
+Dynamically parametrize tests by different parameters
 """


@pytest.fixture(scope="function", autouse=True)
-def pg_version(request: FixtureRequest) -> Optional[PgVersion]:
-    # Do not parametrize performance tests yet, we need to prepare grafana charts first
-    if "test_runner/performance" in str(request.node.path):
-        v = os.environ.get("DEFAULT_PG_VERSION")
-        return PgVersion(v)
-
+def pg_version() -> Optional[PgVersion]:
    return None


@pytest.fixture(scope="function", autouse=True)
-def build_type(request: FixtureRequest) -> Optional[str]:
-    # Do not parametrize performance tests yet, we need to prepare grafana charts first
-    if "test_runner/performance" in str(request.node.path):
-        return os.environ.get("BUILD_TYPE", "").lower()
-
+def build_type() -> Optional[str]:
    return None


@pytest.fixture(scope="function", autouse=True)
-def pageserver_virtual_file_io_engine(request: FixtureRequest) -> Optional[str]:
+def platform() -> Optional[str]:
+    return None
+
+
+@pytest.fixture(scope="function", autouse=True)
+def pageserver_virtual_file_io_engine() -> Optional[str]:
    return None


 def pytest_generate_tests(metafunc: Metafunc):
-    if (v := os.environ.get("DEFAULT_PG_VERSION")) is None:
-        pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET]
-    else:
-        pg_versions = [PgVersion(v)]
-
-    if (bt := os.environ.get("BUILD_TYPE")) is None:
+    if (bt := os.getenv("BUILD_TYPE")) is None:
        build_types = ["debug", "release"]
    else:
        build_types = [bt.lower()]

-    # Do not parametrize performance tests yet by Postgres version or build type, we need to prepare grafana charts first
-    if "test_runner/performance" not in metafunc.definition._nodeid:
-        metafunc.parametrize("build_type", build_types)
-        metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))
+    metafunc.parametrize("build_type", build_types)
+
+    if (v := os.getenv("DEFAULT_PG_VERSION")) is None:
+        pg_versions = [version for version in PgVersion if version != PgVersion.NOT_SET]
+    else:
+        pg_versions = [PgVersion(v)]
+
+    metafunc.parametrize("pg_version", pg_versions, ids=map(lambda v: f"pg{v}", pg_versions))

    # A hacky way to parametrize tests only for `pageserver_virtual_file_io_engine=tokio-epoll-uring`
    # And do not change test name for default `pageserver_virtual_file_io_engine=std-fs` to keep tests statistics
-    if (io_engine := os.environ.get("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
+    if (io_engine := os.getenv("PAGESERVER_VIRTUAL_FILE_IO_ENGINE", "")) not in ("", "std-fs"):
        metafunc.parametrize("pageserver_virtual_file_io_engine", [io_engine])
+
+    # For performance tests, parametrize also by platform
+    if (
+        "test_runner/performance" in metafunc.definition._nodeid
+        and (platform := os.getenv("PLATFORM")) is not None
+    ):
+        metafunc.parametrize("platform", [platform.lower()])
--- a/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
+++ b/test_runner/performance/pageserver/pagebench/test_pageserver_max_throughput_getpage_at_latest_lsn.py
@@ -1,4 +1,5 @@
 import json
+import os
 from pathlib import Path
 from typing import Any, Dict, Tuple

@@ -33,6 +34,10 @@ from performance.pageserver.util import ensure_pageserver_ready_for_benchmarking
@pytest.mark.timeout(
    10000
 )  # TODO: this value is just "a really high number"; have this per instance type
+@pytest.mark.skipif(
+    os.getenv("CI", "false") == "true",
+    reason="The test if flaky on CI: https://github.com/neondatabase/neon/issues/6724",
+)
 def test_pageserver_max_throughput_getpage_at_latest_lsn(
    neon_env_builder: NeonEnvBuilder,
    zenbenchmark: NeonBenchmarker,
--- a/test_runner/regress/test_attach_tenant_config.py
+++ b/test_runner/regress/test_attach_tenant_config.py
@@ -176,6 +176,14 @@ def test_fully_custom_config(positive_env: NeonEnv):
        "lazy_slru_download": True,
        "max_lsn_wal_lag": 230000,
        "min_resident_size_override": 23,
+        "timeline_get_throttle": {
+            "task_kinds": ["PageRequestHandler"],
+            "fair": True,
+            "initial": 0,
+            "refill_interval": "1s",
+            "refill_amount": 1000,
+            "max": 1000,
+        },
        "trace_read_requests": True,
        "walreceiver_connect_timeout": "13m",
    }
--- a/test_runner/regress/test_auth.py
+++ b/test_runner/regress/test_auth.py
@@ -225,9 +225,7 @@ def test_auth_failures(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):

        check_pageserver(True, password=pageserver_token)

-        env.pageserver.allowed_errors.append(
-            ".*SafekeeperData scope makes no sense for Pageserver.*"
-        )
+        env.pageserver.allowed_errors.append(".*JWT scope '.+' is ineligible for Pageserver auth.*")
        check_pageserver(False, password=safekeeper_token)

    def check_safekeeper(expect_success: bool, **conn_kwargs):
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -141,7 +141,12 @@ def test_create_snapshot(
    )
    if compatibility_snapshot_dir.exists():
        shutil.rmtree(compatibility_snapshot_dir)
-    shutil.copytree(test_output_dir, compatibility_snapshot_dir)
+
+    shutil.copytree(
+        test_output_dir,
+        compatibility_snapshot_dir,
+        ignore=shutil.ignore_patterns("pg_dynshmem"),
+    )


@check_ondisk_data_compatibility_if_enabled
--- a/Show More
+++ b/Show More