Mirror of https://github.com/neondatabase/neon.git, synced 2026-02-05 03:30:36 +00:00

Compare commits (170 commits): hack/compu ... task_hiera
Commit SHA1s in this comparison:

d87549696b, 960a29a6fe, d6f6e9a87b, ddae6e2b0a, e021298dec, 9790a7c2e8, 9660282c69, 894cd3ddf7, 735c9b3b70, e76b24ccc5,
6ff2c07cc8, efd46e478a, c2528ae671, a919b863d1, 2d22661061, e3778381a8, c8316b7a3f, 8460654f61, 7c2c87a5ab, 5820faaa87,
dfb0a6fdaf, 6acbee2368, aec1acdbac, 8bb4a13192, 9e071e4458, fead836f26, 20e9cf7d31, 3b04f3a749, c49fd69bd6, 5ab9592a2d,
036558c956, 6a922b1a75, f1fc1fd639, 66a7a226f8, f0d15cee6f, 0ba4cae491, df1f8e13c4, e640bc7dba, cf024de202, e1a564ace2,
f5b9af6ac7, 5e98855d80, 699049b8f3, 2c544343e0, 193e60e2b8, 1bbd6cae24, 65f48c7002, d9d8e9afc7, 7914eaf1e6, 37fdbc3aaa,
7aa1e58301, f2892d3798, b492cedf51, 880663f6bc, e89e41f8ba, f9401fdd31, b7ffe24426, 52718bb8ff, 10c77cb410, 31be301ef3,
a3c7d400b4, 7501ca6efb, 987c9aaea0, 7fab731f65, 483caa22c6, da5e03b0d8, be885370f6, bc1020f965, 61fe9d360d, f60e49fe8e,
c48918d329, bad686bb71, 85d08581ed, c7f1143e57, 7403d55013, 12f02523a4, 207c527270, eae49ff598, e6b2f89fec, 1d81e70d60,
e3512340c1, e43cde7aba, c1295bfb3a, 711425cc47, fd81945a60, e49c21a3cd, 92e7cd40e8, 7eabfc40ee, ce1652990d, 8cd28e1718,
1c88824ed0, 1ce1c82d78, f784e59b12, b71b8ecfc2, 3842773546, f39fca0049, b451e75dc6, 3657a3c76e, eba3bfc57e, 57ae9cd07f,
3bb1030f5d, 5d3c3636fc, 0c87d1866b, 8ec6033ed8, e12e2681e9, 1e57ddaabc, 3e094e90d7, 292281c9df, 50d959fddc, fc77c42c57,
f05d1b598a, ca597206b8, 46f20faa0d, 9e55ad4796, 70b5646fba, 64890594a5, 78e73b20e1, c48cc020bd, a15969714c, 8c195d8214,
0d16874960, fd440e7d79, 65160650da, 12dd6b61df, 5345c1c21b, 105edc265c, 8625466144, 1ab0cfc8cb, ca469be1cf, 286f34dfce,
f290b27378, 4cd18fcebd, 4c29e0594e, 3c56a4dd18, 316309c85b, e09bb9974c, 5289f341ce, 683ec2417c, a76a503b8b, 92bc2bb132,
b80b9e1c4c, 87b8ac3ec3, 6b1c4cc983, 831fad46d5, 53851ea8ec, 044375732a, ea63b43009, a56fd45f56, 582a42762b, f5dfa6f140,
f8d9bd8d14, 04e6c09f14, 54327bbeec, 35f243e787, b7a988ba46, a0e61145c8, 6afbadc90e, 2a12e9c46b, 9e3c07611c, d353fa1998,
0d10992e46, 3e131bb3d7, 81b2cefe10, d2ca410919, d98ac04136, ac08072d2e, d22dce2e31, 3b3f040be3, cad0dca4b8, 5d13a2e426
@@ -1,17 +1,3 @@
-# The binaries are really slow, if you compile them in 'dev' mode with the defaults.
-# Enable some optimizations even in 'dev' mode, to make tests faster. The basic
-# optimizations enabled by "opt-level=1" don't affect debuggability too much.
-#
-# See https://www.reddit.com/r/rust/comments/gvrgca/this_is_a_neat_trick_for_getting_good_runtime/
-#
-[profile.dev.package."*"]
-# Set the default for dependencies in Development mode.
-opt-level = 3
-
-[profile.dev]
-# Turn on a small amount of optimization in Development mode.
-opt-level = 1
-
 [build]
 # This is only present for local builds, as it will be overridden
 # by the RUSTDOCFLAGS env var in CI.
.github/workflows/build_and_test.yml (11 changed lines)
@@ -199,6 +199,10 @@ jobs:
 #
 git config --global --add safe.directory ${{ github.workspace }}
 git config --global --add safe.directory ${GITHUB_WORKSPACE}
+for r in 14 15 16; do
+git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+done
 
 - name: Checkout
 uses: actions/checkout@v3
@@ -404,7 +408,7 @@ jobs:
 uses: ./.github/actions/save-coverage-data
 
 regress-tests:
-needs: [ check-permissions, build-neon ]
+needs: [ check-permissions, build-neon, tag ]
 runs-on: [ self-hosted, gen3, large ]
 container:
 image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -436,6 +440,7 @@ jobs:
 env:
 TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
 CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
+BUILD_TAG: ${{ needs.tag.outputs.build-tag }}
 
 - name: Merge and upload coverage data
 if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
@@ -1096,6 +1101,10 @@ jobs:
 #
 git config --global --add safe.directory ${{ github.workspace }}
 git config --global --add safe.directory ${GITHUB_WORKSPACE}
+for r in 14 15 16; do
+git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+done
 
 - name: Checkout
 uses: actions/checkout@v3
.github/workflows/neon_extra_builds.yml (18 changed lines)
@@ -142,6 +142,10 @@ jobs:
 #
 git config --global --add safe.directory ${{ github.workspace }}
 git config --global --add safe.directory ${GITHUB_WORKSPACE}
+for r in 14 15 16; do
+git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+done
 
 - name: Checkout
 uses: actions/checkout@v4
@@ -238,6 +242,20 @@
 options: --init
 
 steps:
+- name: Fix git ownership
+run: |
+# Workaround for `fatal: detected dubious ownership in repository at ...`
+#
+# Use both ${{ github.workspace }} and ${GITHUB_WORKSPACE} because they're different on host and in containers
+# Ref https://github.com/actions/checkout/issues/785
+#
+git config --global --add safe.directory ${{ github.workspace }}
+git config --global --add safe.directory ${GITHUB_WORKSPACE}
+for r in 14 15 16; do
+git config --global --add safe.directory "${{ github.workspace }}/vendor/postgres-v$r"
+git config --global --add safe.directory "${GITHUB_WORKSPACE}/vendor/postgres-v$r"
+done
+
 - name: Checkout
 uses: actions/checkout@v4
 with:
.github/workflows/release.yml (2 changed lines)
@@ -2,7 +2,7 @@ name: Create Release Branch
 
 on:
 schedule:
-- cron: '0 7 * * 5'
+- cron: '0 6 * * 1'
 workflow_dispatch:
 
 jobs:
.gitignore (3 changed lines)
@@ -18,3 +18,6 @@ test_output/
 *.o
 *.so
 *.Po
+
+# pgindent typedef lists
+*.list
@@ -9,6 +9,24 @@ refactoring, additional comments, and so forth. Let's try to raise the
 bar, and clean things up as we go. Try to leave code in a better shape
 than it was before.
 
+## Pre-commit hook
+
+We have a sample pre-commit hook in `pre-commit.py`.
+To set it up, run:
+
+```bash
+ln -s ../../pre-commit.py .git/hooks/pre-commit
+```
+
+This will run following checks on staged files before each commit:
+- `rustfmt`
+- checks for python files, see [obligatory checks](/docs/sourcetree.md#obligatory-checks).
+
+There is also a separate script `./run_clippy.sh` that runs `cargo clippy` on the whole project
+and `./scripts/reformat` that runs all formatting tools to ensure the project is up to date.
+
+If you want to skip the hook, run `git commit` with `--no-verify` option.
+
 ## Submitting changes
 
 1. Get at least one +1 on your PR before you push.
Cargo.lock (generated, 697 changed lines): file diff suppressed because it is too large.
Cargo.toml (41 changed lines)
@@ -37,20 +37,19 @@ license = "Apache-2.0"
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
-async-compression = { version = "0.4.0", features = ["tokio", "gzip"] }
+async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
-azure_core = "0.16"
-azure_identity = "0.16"
-azure_storage = "0.16"
-azure_storage_blobs = "0.16"
+azure_core = "0.18"
+azure_identity = "0.18"
+azure_storage = "0.18"
+azure_storage_blobs = "0.18"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "0.56", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.29"
-aws-smithy-http = "0.56"
-aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
-aws-credential-types = "0.56"
-aws-types = "0.56"
+aws-config = { version = "1.0", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.0"
+aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
+aws-smithy-types = "1.0"
+aws-credential-types = "1.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -89,6 +88,7 @@ humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.11"
 inotify = "0.10.2"
+ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -109,7 +109,7 @@ pin-project-lite = "0.2"
 prometheus = {version = "0.13", default_features=false, features = ["process"]} # removes protobuf dependency
 prost = "0.11"
 rand = "0.8"
-regex = "1.4"
+regex = "1.10.2"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
 reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_19"] }
 reqwest-middleware = "0.2.0"
@@ -122,14 +122,17 @@ rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
+sd-notify = "0.4.1"
 sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
+serde_path_to_error = "0.1"
 serde_with = "2.0"
 serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
+smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
@@ -146,7 +149,7 @@ tokio-postgres-rustls = "0.10.0"
 tokio-rustls = "0.24"
 tokio-stream = "0.1"
 tokio-tar = "0.3"
-tokio-util = { version = "0.7", features = ["io"] }
+tokio-util = { version = "0.7.10", features = ["io", "rt"] }
 toml = "0.7"
 toml_edit = "0.19"
 tonic = {version = "0.9", features = ["tls", "tls-roots"]}
@@ -165,11 +168,11 @@ env_logger = "0.10"
 log = "0.4"
 
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
 
 ## Other git libraries
 heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending
|
|||||||
|
|
||||||
# This is only needed for proxy's tests.
|
# This is only needed for proxy's tests.
|
||||||
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
# TODO: we should probably fork `tokio-postgres-rustls` instead.
|
||||||
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="6ce32f791526e27533cab0232a6bb243b2c32584" }
|
tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", branch="neon" }
|
||||||
|
|
||||||
################# Binary contents sections
|
################# Binary contents sections
|
||||||
|
|
||||||
|
|||||||
@@ -393,7 +393,9 @@ RUN case "${PG_VERSION}" in \
 export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
 ;; \
 *) \
-echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
+export TIMESCALEDB_VERSION=2.13.0 \
+export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
+;; \
 esac && \
 apt-get update && \
 apt-get install -y cmake && \
@@ -714,6 +716,23 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -
 cargo pgrx install --release && \
 echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
 
+#########################################################################################
+#
+# Layer "wal2json-build"
+# Compile "wal2json" extension
+#
+#########################################################################################
+
+FROM build-deps AS wal2json-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
+echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
+mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
+make -j $(getconf _NPROCESSORS_ONLN) && \
+make -j $(getconf _NPROCESSORS_ONLN) install
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -750,6 +769,7 @@ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY pgxn/ pgxn/
 
 RUN make -j $(getconf _NPROCESSORS_ONLN) \
Makefile (38 changed lines)
@@ -260,6 +260,44 @@ distclean:
 fmt:
 ./pre-commit.py --fix-inplace
 
+postgres-%-pg-bsd-indent: postgres-%
++@echo "Compiling pg_bsd_indent"
+$(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/
+
+# Create typedef list for the core. Note that generally it should be combined with
+# buildfarm one to cover platform specific stuff.
+# https://wiki.postgresql.org/wiki/Running_pgindent_on_non-core_code_or_development_code
+postgres-%-typedefs.list: postgres-%
+$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/find_typedef $(POSTGRES_INSTALL_DIR)/$*/bin > $@
+
+# Indent postgres. See src/tools/pgindent/README for details.
+.PHONY: postgres-%-pgindent
+postgres-%-pgindent: postgres-%-pg-bsd-indent postgres-%-typedefs.list
++@echo merge with buildfarm typedef to cover all platforms
++@echo note: I first tried to download from pgbuildfarm.org, but for unclear reason e.g. \
+REL_16_STABLE list misses PGSemaphoreData
+# wget -q -O - "http://www.pgbuildfarm.org/cgi-bin/typedefs.pl?branch=REL_16_STABLE" |\
+# cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
+cat $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/typedefs.list |\
+cat - postgres-$*-typedefs.list | sort | uniq > postgres-$*-typedefs-full.list
++@echo note: you might want to run it on selected files/dirs instead.
+INDENT=$(POSTGRES_INSTALL_DIR)/build/$*/src/tools/pg_bsd_indent/pg_bsd_indent \
+$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/pgindent --typedefs postgres-$*-typedefs-full.list \
+$(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/ \
+--excludes $(ROOT_PROJECT_DIR)/vendor/postgres-$*/src/tools/pgindent/exclude_file_patterns
+rm -f pg*.BAK
+
+# Indent pxgn/neon.
+.PHONY: pgindent
+neon-pgindent: postgres-v16-pg-bsd-indent neon-pg-ext-v16
+$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/v16/bin/pg_config CFLAGS='$(PG_CFLAGS) $(COPT)' \
+FIND_TYPEDEF=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/find_typedef \
+INDENT=$(POSTGRES_INSTALL_DIR)/build/v16/src/tools/pg_bsd_indent/pg_bsd_indent \
+PGINDENT_SCRIPT=$(ROOT_PROJECT_DIR)/vendor/postgres-v16/src/tools/pgindent/pgindent \
+-C $(POSTGRES_INSTALL_DIR)/build/neon-v16 \
+-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile pgindent
+
+
 .PHONY: setup-pre-commit-hook
 setup-pre-commit-hook:
 ln -s -f $(ROOT_PROJECT_DIR)/pre-commit.py .git/hooks/pre-commit
@@ -149,6 +149,9 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
 Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one
 
+# create postgres compute node
+> cargo neon endpoint create main
+
 # start postgres compute node
 > cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
@@ -185,8 +188,11 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 (L) main [de200bd42b49cc1814412c7e592dd6e9]
 (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]
 
+# create postgres on that branch
+> cargo neon endpoint create migration_check --branch-name migration_check
+
 # start postgres on that branch
-> cargo neon endpoint start migration_check --branch-name migration_check
+> cargo neon endpoint start migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
 Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'
 
@@ -38,3 +38,4 @@ toml_edit.workspace = true
 remote_storage = { version = "0.1", path = "../libs/remote_storage/" }
 vm_monitor = { version = "0.1", path = "../libs/vm_monitor/" }
 zstd = "0.12.4"
+bytes = "1.0"
@@ -31,7 +31,7 @@
 //! -C 'postgresql://cloud_admin@localhost/postgres' \
 //! -S /var/db/postgres/specs/current.json \
 //! -b /usr/local/bin/postgres \
-//! -r {"bucket": "neon-dev-extensions-eu-central-1", "region": "eu-central-1"}
+//! -r http://pg-ext-s3-gateway
 //! ```
 //!
 use std::collections::HashMap;
@@ -51,7 +51,7 @@ use compute_api::responses::ComputeStatus;
 
 use compute_tools::compute::{ComputeNode, ComputeState, ParsedSpec};
 use compute_tools::configurator::launch_configurator;
-use compute_tools::extension_server::{get_pg_version, init_remote_storage};
+use compute_tools::extension_server::get_pg_version;
 use compute_tools::http::api::launch_http_server;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
@@ -60,7 +60,7 @@ use compute_tools::spec::*;
 
 // this is an arbitrary build tag. Fine as a default / for testing purposes
 // in-case of not-set environment var
-const BUILD_TAG_DEFAULT: &str = "5670669815";
+const BUILD_TAG_DEFAULT: &str = "latest";
 
 fn main() -> Result<()> {
 init_tracing_and_logging(DEFAULT_LOG_LEVEL)?;
@@ -74,10 +74,18 @@ fn main() -> Result<()> {
 let pgbin_default = String::from("postgres");
 let pgbin = matches.get_one::<String>("pgbin").unwrap_or(&pgbin_default);
 
-let remote_ext_config = matches.get_one::<String>("remote-ext-config");
-let ext_remote_storage = remote_ext_config.map(|x| {
-init_remote_storage(x).expect("cannot initialize remote extension storage from config")
-});
+let ext_remote_storage = matches
+.get_one::<String>("remote-ext-config")
+// Compatibility hack: if the control plane specified any remote-ext-config
+// use the default value for extension storage proxy gateway.
+// Remove this once the control plane is updated to pass the gateway URL
+.map(|conf| {
+if conf.starts_with("http") {
+conf.trim_end_matches('/')
+} else {
+"http://pg-ext-s3-gateway"
+}
+});
 
 let http_port = *matches
 .get_one::<u16>("http-port")
@@ -198,7 +206,7 @@ fn main() -> Result<()> {
 live_config_allowed,
 state: Mutex::new(new_state),
 state_changed: Condvar::new(),
-ext_remote_storage,
+ext_remote_storage: ext_remote_storage.map(|s| s.to_string()),
 ext_download_progress: RwLock::new(HashMap::new()),
 build_tag,
 };
@@ -266,7 +274,13 @@ fn main() -> Result<()> {
 let mut state = compute.state.lock().unwrap();
 state.error = Some(format!("{:?}", err));
 state.status = ComputeStatus::Failed;
-drop(state);
+// Notify others that Postgres failed to start. In case of configuring the
+// empty compute, it's likely that API handler is still waiting for compute
+// state change. With this we will notify it that compute is in Failed state,
+// so control plane will know about it earlier and record proper error instead
+// of timeout.
+compute.state_changed.notify_all();
+drop(state); // unlock
 delay_exit = true;
 None
 }
@@ -479,13 +493,6 @@ fn cli() -> clap::Command {
 )
 .value_name("FILECACHE_CONNSTR"),
 )
-.arg(
-// DEPRECATED, NO LONGER DOES ANYTHING.
-// See https://github.com/neondatabase/cloud/issues/7516
-Arg::new("file-cache-on-disk")
-.long("file-cache-on-disk")
-.action(clap::ArgAction::SetTrue),
-)
 }
 
 #[test]
@@ -22,10 +22,10 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;
 
 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;
 
-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
+use remote_storage::{DownloadError, RemotePath};
 
 use crate::checker::create_availability_check_data;
 use crate::pg_helpers::*;
@@ -59,8 +59,8 @@ pub struct ComputeNode {
 pub state: Mutex<ComputeState>,
 /// `Condvar` to allow notifying waiters about state changes.
 pub state_changed: Condvar,
-/// the S3 bucket that we search for extensions in
-pub ext_remote_storage: Option<GenericRemoteStorage>,
+/// the address of extension storage proxy gateway
+pub ext_remote_storage: Option<String>,
 // key: ext_archive_name, value: started download time, download_completed?
 pub ext_download_progress: RwLock<HashMap<String, (DateTime<Utc>, bool)>>,
 pub build_tag: String,
@@ -252,7 +252,7 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 IF NOT EXISTS (
 SELECT FROM pg_catalog.pg_roles WHERE rolname = 'neon_superuser')
 THEN
-CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION IN ROLE pg_read_all_data, pg_write_all_data;
+CREATE ROLE neon_superuser CREATEDB CREATEROLE NOLOGIN REPLICATION BYPASSRLS IN ROLE pg_read_all_data, pg_write_all_data;
 IF array_length(roles, 1) IS NOT NULL THEN
 EXECUTE format('GRANT neon_superuser TO %s',
 array_to_string(ARRAY(SELECT quote_ident(x) FROM unnest(roles) as x), ', '));
@@ -277,6 +277,17 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }
 
 impl ComputeNode {
+/// Check that compute node has corresponding feature enabled.
+pub fn has_feature(&self, feature: ComputeFeature) -> bool {
+let state = self.state.lock().unwrap();
+
+if let Some(s) = state.pspec.as_ref() {
+s.spec.features.contains(&feature)
+} else {
+false
+}
+}
+
 pub fn set_status(&self, status: ComputeStatus) {
 let mut state = self.state.lock().unwrap();
 state.status = status;
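The `has_feature` helper added in the hunk above gates optional compute behavior on `ComputeFeature` values carried in the spec, checked under the state lock. Below is a minimal, self-contained sketch of the same pattern, assuming simplified types: the `ComputeFeature` variant name, the `Spec`/`Node` structs, and the state shape are illustrative stand-ins rather than the real `compute_api` types.

```rust
use std::sync::Mutex;

// Illustrative stand-ins for the compute_api types; the variant name is hypothetical.
#[derive(PartialEq)]
enum ComputeFeature {
    Migrations,
}

struct Spec {
    features: Vec<ComputeFeature>,
}

struct Node {
    // In compute_ctl the parsed spec lives inside ComputeState behind a Mutex;
    // this sketch keeps only the part needed for the feature check.
    state: Mutex<Option<Spec>>,
}

impl Node {
    // Mirrors the diff: take the lock, and report false when no spec is attached yet.
    fn has_feature(&self, feature: ComputeFeature) -> bool {
        let state = self.state.lock().unwrap();
        match state.as_ref() {
            Some(spec) => spec.features.contains(&feature),
            None => false,
        }
    }
}

fn main() {
    let node = Node {
        state: Mutex::new(Some(Spec { features: vec![ComputeFeature::Migrations] })),
    };
    // Optional behavior runs only when the control plane listed the feature in the spec.
    if node.has_feature(ComputeFeature::Migrations) {
        println!("feature enabled by spec");
    }
}
```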
@@ -698,6 +709,7 @@ impl ComputeNode {
 handle_role_deletions(spec, self.connstr.as_str(), &mut client)?;
 handle_grants(spec, &mut client, self.connstr.as_str())?;
 handle_extensions(spec, &mut client)?;
+handle_extension_neon(&mut client)?;
 create_availability_check_data(&mut client)?;
 
 // 'Close' connection
@@ -727,7 +739,12 @@ impl ComputeNode {
 
 // Write new config
 let pgdata_path = Path::new(&self.pgdata);
-config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+let postgresql_conf_path = pgdata_path.join("postgresql.conf");
+config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
+// temporarily reset max_cluster_size in config
+// to avoid the possibility of hitting the limit, while we are reconfiguring:
+// creating new extensions, roles, etc...
+config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
 self.pg_reload_conf()?;
 
 let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
@@ -742,11 +759,16 @@ impl ComputeNode {
 handle_role_deletions(&spec, self.connstr.as_str(), &mut client)?;
 handle_grants(&spec, &mut client, self.connstr.as_str())?;
 handle_extensions(&spec, &mut client)?;
+handle_extension_neon(&mut client)?;
 }
 
 // 'Close' connection
 drop(client);
 
+// reset max_cluster_size in config back to original value and reload config
+config::compute_ctl_temp_override_remove(pgdata_path)?;
+self.pg_reload_conf()?;
+
 let unknown_op = "unknown".to_string();
 let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
 info!(
@@ -807,7 +829,17 @@ impl ComputeNode {
 
 let config_time = Utc::now();
 if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+let pgdata_path = Path::new(&self.pgdata);
+// temporarily reset max_cluster_size in config
+// to avoid the possibility of hitting the limit, while we are applying config:
+// creating new extensions, roles, etc...
+config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+self.pg_reload_conf()?;
+
 self.apply_config(&compute_state)?;
+
+config::compute_ctl_temp_override_remove(pgdata_path)?;
+self.pg_reload_conf()?;
 }
 
 let startup_end_time = Utc::now();
@@ -955,12 +987,12 @@ LIMIT 100",
 real_ext_name: String,
 ext_path: RemotePath,
 ) -> Result<u64, DownloadError> {
-let remote_storage = self
-.ext_remote_storage
+let ext_remote_storage =
+self.ext_remote_storage
 .as_ref()
 .ok_or(DownloadError::BadInput(anyhow::anyhow!(
 "Remote extensions storage is not configured",
 )))?;
 
 let ext_archive_name = ext_path.object_name().expect("bad path");
 
@@ -1016,7 +1048,7 @@ LIMIT 100",
 let download_size = extension_server::download_extension(
 &real_ext_name,
 &ext_path,
-remote_storage,
+ext_remote_storage,
 &self.pgbin,
 )
 .await
@@ -93,5 +93,25 @@ pub fn write_postgres_conf(
 writeln!(file, "neon.extension_server_port={}", port)?;
 }
 
+// This is essential to keep this line at the end of the file,
+// because it is intended to override any settings above.
+writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
+
+Ok(())
+}
+
+/// create file compute_ctl_temp_override.conf in pgdata_dir
+/// add provided options to this file
+pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
+let path = pgdata_path.join("compute_ctl_temp_override.conf");
+let mut file = File::create(path)?;
+write!(file, "{}", options)?;
+Ok(())
+}
+
+/// remove file compute_ctl_temp_override.conf in pgdata_dir
+pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
+let path = pgdata_path.join("compute_ctl_temp_override.conf");
+std::fs::remove_file(path)?;
 Ok(())
 }
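Taken together with the compute.rs hunks earlier, the functions above implement a temporary-override mechanism: `write_postgres_conf` ends `postgresql.conf` with an `include_if_exists` directive, and `compute_ctl_temp_override_create` / `compute_ctl_temp_override_remove` write and delete the included file around reconfiguration. The following standalone sketch only illustrates the resulting file layout; the directory path and the `neon.max_cluster_size=10240` value are made up for the example, while the file names and the `-1` override come from the diff.

```rust
use std::fs::{self, File};
use std::io::Write;
use std::path::Path;

fn main() -> std::io::Result<()> {
    // Illustrative pgdata location; compute_ctl operates on the real data directory.
    let pgdata = Path::new("/tmp/pgdata-sketch");
    fs::create_dir_all(pgdata)?;

    // Tail of the generated postgresql.conf: the include must stay last so the
    // override file wins over anything written above it.
    let mut conf = File::create(pgdata.join("postgresql.conf"))?;
    writeln!(conf, "neon.max_cluster_size=10240")?; // illustrative value
    writeln!(conf, "include_if_exists = 'compute_ctl_temp_override.conf'")?;

    // Before applying the spec, the limit is lifted via the override file...
    let override_path = pgdata.join("compute_ctl_temp_override.conf");
    let mut file = File::create(&override_path)?;
    write!(file, "neon.max_cluster_size=-1")?;
    // (compute_ctl reloads the Postgres config here and applies the changes)

    // ...and afterwards the override is removed and the config reloaded again;
    // a missing include_if_exists target is simply ignored by Postgres.
    fs::remove_file(&override_path)?;
    Ok(())
}
```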
@@ -71,18 +71,16 @@ More specifically, here is an example ext_index.json
 }
 }
 */
-use anyhow::Context;
 use anyhow::{self, Result};
+use anyhow::{bail, Context};
+use bytes::Bytes;
 use compute_api::spec::RemoteExtSpec;
 use regex::Regex;
 use remote_storage::*;
-use serde_json;
-use std::io::Read;
-use std::num::NonZeroUsize;
+use reqwest::StatusCode;
 use std::path::Path;
 use std::str;
 use tar::Archive;
-use tokio::io::AsyncReadExt;
 use tracing::info;
 use tracing::log::warn;
 use zstd::stream::read::Decoder;
@@ -138,23 +136,31 @@ fn parse_pg_version(human_version: &str) -> &str {
 pub async fn download_extension(
 ext_name: &str,
 ext_path: &RemotePath,
-remote_storage: &GenericRemoteStorage,
+ext_remote_storage: &str,
 pgbin: &str,
 ) -> Result<u64> {
 info!("Download extension {:?} from {:?}", ext_name, ext_path);
-let mut download = remote_storage.download(ext_path).await?;
-let mut download_buffer = Vec::new();
-download
-.download_stream
-.read_to_end(&mut download_buffer)
-.await?;
+
+// TODO add retry logic
+let download_buffer =
+match download_extension_tar(ext_remote_storage, &ext_path.to_string()).await {
+Ok(buffer) => buffer,
+Err(error_message) => {
+return Err(anyhow::anyhow!(
+"error downloading extension {:?}: {:?}",
+ext_name,
+error_message
+));
+}
+};
+
 let download_size = download_buffer.len() as u64;
+info!("Download size {:?}", download_size);
 // it's unclear whether it is more performant to decompress into memory or not
 // TODO: decompressing into memory can be avoided
-let mut decoder = Decoder::new(download_buffer.as_slice())?;
-let mut decompress_buffer = Vec::new();
-decoder.read_to_end(&mut decompress_buffer)?;
-let mut archive = Archive::new(decompress_buffer.as_slice());
+let decoder = Decoder::new(download_buffer.as_ref())?;
+let mut archive = Archive::new(decoder);
 let unzip_dest = pgbin
 .strip_suffix("/bin/postgres")
 .expect("bad pgbin")
@@ -222,29 +228,32 @@ pub fn create_control_files(remote_extensions: &RemoteExtSpec, pgbin: &str) {
 }
 }
 
-// This function initializes the necessary structs to use remote storage
-pub fn init_remote_storage(remote_ext_config: &str) -> anyhow::Result<GenericRemoteStorage> {
-#[derive(Debug, serde::Deserialize)]
-struct RemoteExtJson {
-bucket: String,
-region: String,
-endpoint: Option<String>,
-prefix: Option<String>,
-}
-let remote_ext_json = serde_json::from_str::<RemoteExtJson>(remote_ext_config)?;
+// Do request to extension storage proxy, i.e.
+// curl http://pg-ext-s3-gateway/latest/v15/extensions/anon.tar.zst
+// using HHTP GET
+// and return the response body as bytes
+//
+async fn download_extension_tar(ext_remote_storage: &str, ext_path: &str) -> Result<Bytes> {
+let uri = format!("{}/{}", ext_remote_storage, ext_path);
 
-let config = S3Config {
-bucket_name: remote_ext_json.bucket,
-bucket_region: remote_ext_json.region,
-prefix_in_bucket: remote_ext_json.prefix,
-endpoint: remote_ext_json.endpoint,
-concurrency_limit: NonZeroUsize::new(100).expect("100 != 0"),
-max_keys_per_list_response: None,
-};
-let config = RemoteStorageConfig {
-storage: RemoteStorageKind::AwsS3(config),
-};
-GenericRemoteStorage::from_config(&config)
+info!("Download extension {:?} from uri {:?}", ext_path, uri);
+
+let resp = reqwest::get(uri).await?;
+
+match resp.status() {
+StatusCode::OK => match resp.bytes().await {
+Ok(resp) => {
+info!("Download extension {:?} completed successfully", ext_path);
+Ok(resp)
+}
+Err(e) => bail!("could not deserialize remote extension response: {}", e),
+},
+StatusCode::SERVICE_UNAVAILABLE => bail!("remote extension is temporarily unavailable"),
+_ => bail!(
+"unexpected remote extension response status code: {}",
+resp.status()
+),
+}
 }
 
 #[cfg(test)]
@@ -123,7 +123,7 @@ async fn routes(req: Request<Body>, compute: &Arc<ComputeNode>) -> Response<Body
 }
 }
 
-// download extension files from S3 on demand
+// download extension files from remote extension storage on demand
 (&Method::POST, route) if route.starts_with("/extension_server/") => {
 info!("serving {:?} POST request", route);
 info!("req.uri {:?}", req.uri());
@@ -227,7 +227,7 @@ async fn handle_configure_request(
 
 let parsed_spec = match ParsedSpec::try_from(spec) {
 Ok(ps) => ps,
-Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
+Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
 };
 
 // XXX: wrap state update under lock in code blocks. Otherwise,
@@ -156,17 +156,17 @@ paths:
 description: Error text or 'OK' if download succeeded.
 example: "OK"
 400:
 description: Request is invalid.
 content:
 application/json:
 schema:
 $ref: "#/components/schemas/GenericError"
 500:
 description: Extension download request failed.
 content:
 application/json:
 schema:
 $ref: "#/components/schemas/GenericError"
 
 components:
 securitySchemes:
@@ -193,16 +193,11 @@ impl Escaping for PgIdent {
 /// Build a list of existing Postgres roles
 pub fn get_existing_roles(xact: &mut Transaction<'_>) -> Result<Vec<Role>> {
 let postgres_roles = xact
-.query(
-"SELECT rolname, rolpassword, rolreplication, rolbypassrls FROM pg_catalog.pg_authid",
-&[],
-)?
+.query("SELECT rolname, rolpassword FROM pg_catalog.pg_authid", &[])?
 .iter()
 .map(|row| Role {
 name: row.get("rolname"),
 encrypted_password: row.get("rolpassword"),
-replication: Some(row.get("rolreplication")),
-bypassrls: Some(row.get("rolbypassrls")),
 options: None,
 })
 .collect();
@@ -118,19 +118,6 @@ pub fn get_spec_from_control_plane(
 spec
 }
 
-/// It takes cluster specification and does the following:
-/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
-/// - Update `pg_hba.conf` to allow external connections.
-pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
-// File `postgresql.conf` is no longer included into `basebackup`, so just
-// always write all config into it creating new file.
-config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
-
-update_pg_hba(pgdata_path)?;
-
-Ok(())
-}
-
 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
 // XXX: consider making it a part of spec.json
@@ -265,8 +252,6 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 let action = if let Some(r) = pg_role {
 if (r.encrypted_password.is_none() && role.encrypted_password.is_some())
 || (r.encrypted_password.is_some() && role.encrypted_password.is_none())
-|| !r.bypassrls.unwrap_or(false)
-|| !r.replication.unwrap_or(false)
 {
 RoleAction::Update
 } else if let Some(pg_pwd) = &r.encrypted_password {
@@ -298,14 +283,22 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
 match action {
 RoleAction::None => {}
 RoleAction::Update => {
-let mut query: String =
-format!("ALTER ROLE {} BYPASSRLS REPLICATION", name.pg_quote());
+// This can be run on /every/ role! Not just ones created through the console.
+// This means that if you add some funny ALTER here that adds a permission,
+// this will get run even on user-created roles! This will result in different
+// behavior before and after a spec gets reapplied. The below ALTER as it stands
+// now only grants LOGIN and changes the password. Please do not allow this branch
+// to do anything silly.
+let mut query: String = format!("ALTER ROLE {} ", name.pg_quote());
 query.push_str(&role.to_pg_options());
 xact.execute(query.as_str(), &[])?;
 }
 RoleAction::Create => {
+// This branch only runs when roles are created through the console, so it is
+// safe to add more permissions here. BYPASSRLS and REPLICATION are inherited
+// from neon_superuser.
 let mut query: String = format!(
-"CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS REPLICATION IN ROLE neon_superuser",
+"CREATE ROLE {} INHERIT CREATEROLE CREATEDB IN ROLE neon_superuser",
 name.pg_quote()
 );
 info!("role create query: '{}'", &query);
@@ -674,3 +667,33 @@ pub fn handle_extensions(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 
 Ok(())
 }
+
+/// Run CREATE and ALTER EXTENSION neon UPDATE for postgres database
+#[instrument(skip_all)]
+pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
+info!("handle extension neon");
+
+let mut query = "CREATE SCHEMA IF NOT EXISTS neon";
+client.simple_query(query)?;
+
+query = "CREATE EXTENSION IF NOT EXISTS neon WITH SCHEMA neon";
+info!("create neon extension with query: {}", query);
+client.simple_query(query)?;
+
+query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
+client.simple_query(query)?;
+
+query = "ALTER EXTENSION neon SET SCHEMA neon";
+info!("alter neon extension schema with query: {}", query);
+client.simple_query(query)?;
+
+// this will be a no-op if extension is already up to date,
+// which may happen in two cases:
+// - extension was just installed
+// - extension was already installed and is up to date
+let query = "ALTER EXTENSION neon UPDATE";
+info!("update neon extension schema with query: {}", query);
+client.simple_query(query)?;
+
+Ok(())
+}
@@ -9,6 +9,7 @@ use clap::Parser;
 use hex::FromHex;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response};
+use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
@@ -173,7 +174,8 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
 if state.pageserver == Some(reattach_req.node_id) {
 state.generation += 1;
 response.tenants.push(ReAttachResponseTenant {
-id: *t,
+// TODO(sharding): make this shard-aware
+id: TenantShardId::unsharded(*t),
 gen: state.generation,
 });
 }
@@ -196,8 +198,15 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
 };
 
 for req_tenant in validate_req.tenants {
-if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
+// TODO(sharding): make this shard-aware
+if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
 let valid = tenant_state.generation == req_tenant.gen;
+tracing::info!(
+"handle_validate: {}(gen {}): valid={valid} (latest {})",
+req_tenant.id,
+req_tenant.gen,
+tenant_state.generation
+);
 response.tenants.push(ValidateResponseTenant {
 id: req_tenant.id,
 valid,
@@ -247,6 +256,13 @@ async fn handle_attach_hook(mut req: Request<Body>) -> Result<Response<Body>, Ap
|
|||||||
tenant_state.pageserver = attach_req.node_id;
|
tenant_state.pageserver = attach_req.node_id;
|
||||||
let generation = tenant_state.generation;
|
let generation = tenant_state.generation;
|
||||||
|
|
||||||
|
tracing::info!(
|
||||||
|
"handle_attach_hook: tenant {} set generation {}, pageserver {}",
|
||||||
|
attach_req.tenant_id,
|
||||||
|
tenant_state.generation,
|
||||||
|
attach_req.node_id.unwrap_or(utils::id::NodeId(0xfffffff))
|
||||||
|
);
|
||||||
|
|
||||||
locked.save().await.map_err(ApiError::InternalServerError)?;
|
locked.save().await.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
json_response(
|
json_response(
|
||||||
@@ -286,6 +302,7 @@ async fn main() -> anyhow::Result<()> {
|
|||||||
logging::init(
|
logging::init(
|
||||||
LogFormat::Plain,
|
LogFormat::Plain,
|
||||||
logging::TracingErrorLayerEnablement::Disabled,
|
logging::TracingErrorLayerEnablement::Disabled,
|
||||||
|
logging::Output::Stdout,
|
||||||
)?;
|
)?;
|
||||||
|
|
||||||
let args = Cli::parse();
|
let args = Cli::parse();
|
||||||
|
|||||||
@@ -168,7 +168,7 @@ fn print_timelines_tree(
             info: t.clone(),
             children: BTreeSet::new(),
             name: timeline_name_mappings
-                .remove(&TenantTimelineId::new(t.tenant_id, t.timeline_id)),
+                .remove(&TenantTimelineId::new(t.tenant_id.tenant_id, t.timeline_id)),
         },
     )
 })
@@ -415,6 +415,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                 None,
                 None,
                 Some(pg_version),
+                None,
             )?;
             let new_timeline_id = timeline_info.timeline_id;
             let last_record_lsn = timeline_info.last_record_lsn;
@@ -487,8 +488,16 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                 .copied()
                 .context("Failed to parse postgres version from the argument string")?;
 
-            let timeline_info =
-                pageserver.timeline_create(tenant_id, None, None, None, Some(pg_version))?;
+            let new_timeline_id_opt = parse_timeline_id(create_match)?;
+
+            let timeline_info = pageserver.timeline_create(
+                tenant_id,
+                new_timeline_id_opt,
+                None,
+                None,
+                Some(pg_version),
+                None,
+            )?;
             let new_timeline_id = timeline_info.timeline_id;
 
             let last_record_lsn = timeline_info.last_record_lsn;
@@ -575,6 +584,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                 start_lsn,
                 Some(ancestor_timeline_id),
                 None,
+                None,
             )?;
             let new_timeline_id = timeline_info.timeline_id;
 
@@ -601,11 +611,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
     };
     let mut cplane = ComputeControlPlane::load(env.clone())?;
 
-    // All subcommands take an optional --tenant-id option
-    let tenant_id = get_tenant_id(sub_args, env)?;
-
     match sub_name {
         "list" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
                eprintln!("Failed to load timeline info: {}", e);
                HashMap::new()
@@ -665,6 +673,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            println!("{table}");
        }
        "create" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let branch_name = sub_args
                .get_one::<String>("branch-name")
                .map(|s| s.as_str())
@@ -709,6 +718,18 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };
 
+            match (mode, hot_standby) {
+                (ComputeMode::Static(_), true) => {
+                    bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
+                }
+                (ComputeMode::Primary, true) => {
+                    bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                }
+                _ => {}
+            }
+
+            cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
+
            cplane.new_endpoint(
                &endpoint_id,
                tenant_id,
@@ -721,8 +742,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            )?;
        }
        "start" => {
-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
@@ -751,80 +770,28 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                env.safekeepers.iter().map(|sk| sk.id).collect()
            };
 
-            let endpoint = cplane.endpoints.get(endpoint_id.as_str());
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
+
+            cplane.check_conflicting_endpoints(
+                endpoint.mode,
+                endpoint.tenant_id,
+                endpoint.timeline_id,
+            )?;
 
            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
-                let claims = Claims::new(Some(tenant_id), Scope::Tenant);
+                let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);
 
                Some(env.generate_auth_token(&claims)?)
            } else {
                None
            };
 
-            let hot_standby = sub_args
-                .get_one::<bool>("hot-standby")
-                .copied()
-                .unwrap_or(false);
-
-            if let Some(endpoint) = endpoint {
-                match (&endpoint.mode, hot_standby) {
-                    (ComputeMode::Static(_), true) => {
-                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
-                    }
-                    (ComputeMode::Primary, true) => {
-                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
-                    }
-                    _ => {}
-                }
-                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
-            } else {
-                let branch_name = sub_args
-                    .get_one::<String>("branch-name")
-                    .map(|s| s.as_str())
-                    .unwrap_or(DEFAULT_BRANCH_NAME);
-                let timeline_id = env
-                    .get_branch_timeline_id(branch_name, tenant_id)
-                    .ok_or_else(|| {
-                        anyhow!("Found no timeline id for branch name '{branch_name}'")
-                    })?;
-                let lsn = sub_args
-                    .get_one::<String>("lsn")
-                    .map(|lsn_str| Lsn::from_str(lsn_str))
-                    .transpose()
-                    .context("Failed to parse Lsn from the request")?;
-                let pg_version = sub_args
-                    .get_one::<u32>("pg-version")
-                    .copied()
-                    .context("Failed to `pg-version` from the argument string")?;
-
-                let mode = match (lsn, hot_standby) {
-                    (Some(lsn), false) => ComputeMode::Static(lsn),
-                    (None, true) => ComputeMode::Replica,
-                    (None, false) => ComputeMode::Primary,
-                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
-                };
-
-                // when used with custom port this results in non obvious behaviour
-                // port is remembered from first start command, i e
-                // start --port X
-                // stop
-                // start <-- will also use port X even without explicit port argument
-                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
-
-                let ep = cplane.new_endpoint(
-                    endpoint_id,
-                    tenant_id,
-                    timeline_id,
-                    pg_port,
-                    http_port,
-                    pg_version,
-                    mode,
-                    pageserver_id,
-                )?;
-                ep.start(&auth_token, safekeepers, remote_ext_config)?;
-            }
+            println!("Starting existing endpoint {endpoint_id}...");
+            endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
        }
        "reconfigure" => {
            let endpoint_id = sub_args
@@ -1245,7 +1212,7 @@ fn cli() -> Command {
    let remote_ext_config_args = Arg::new("remote-ext-config")
        .long("remote-ext-config")
        .num_args(1)
-        .help("Configure the S3 bucket that we search for extensions in.")
+        .help("Configure the remote extensions storage proxy gateway to request for extensions.")
        .required(false);
 
    let lsn_arg = Arg::new("lsn")
@@ -1308,6 +1275,7 @@ fn cli() -> Command {
            .subcommand(Command::new("create")
                .about("Create a new blank timeline")
                .arg(tenant_id_arg.clone())
+                .arg(timeline_id_arg.clone())
                .arg(branch_name_arg.clone())
                .arg(pg_version_arg.clone())
            )
@@ -1429,15 +1397,7 @@ fn cli() -> Command {
            .subcommand(Command::new("start")
                .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                .arg(endpoint_id_arg.clone())
-                .arg(tenant_id_arg.clone())
-                .arg(branch_name_arg.clone())
-                .arg(timeline_id_arg.clone())
-                .arg(lsn_arg)
-                .arg(pg_port_arg)
-                .arg(http_port_arg)
                .arg(endpoint_pageserver_id_arg.clone())
-                .arg(pg_version_arg)
-                .arg(hot_standby_arg)
                .arg(safekeepers_arg)
                .arg(remote_ext_config_args)
            )
@@ -1450,7 +1410,6 @@ fn cli() -> Command {
            .subcommand(
                Command::new("stop")
                .arg(endpoint_id_arg)
-                .arg(tenant_id_arg.clone())
                .arg(
                    Arg::new("destroy")
                        .help("Also delete data directory (now optional, should be default in future)")
@@ -45,6 +45,7 @@ use std::sync::Arc;
 use std::time::Duration;
 
 use anyhow::{anyhow, bail, Context, Result};
+use compute_api::spec::RemoteExtSpec;
 use serde::{Deserialize, Serialize};
 use utils::id::{NodeId, TenantId, TimelineId};
 
@@ -124,6 +125,7 @@ impl ComputeControlPlane {
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
        let pageserver =
            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
@@ -168,6 +170,30 @@ impl ComputeControlPlane {
 
        Ok(ep)
    }
+
+    pub fn check_conflicting_endpoints(
+        &self,
+        mode: ComputeMode,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<()> {
+        if matches!(mode, ComputeMode::Primary) {
+            // this check is not complete, as you could have a concurrent attempt at
+            // creating another primary, both reading the state before checking it here,
+            // but it's better than nothing.
+            let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
+                v.tenant_id == tenant_id
+                    && v.timeline_id == timeline_id
+                    && v.mode == mode
+                    && v.status() != "stopped"
+            });
+
+            if let Some((key, _)) = duplicates.next() {
+                bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
+            }
+        }
+        Ok(())
+    }
 }
 
 ///////////////////////////////////////////////////////////////////////////////
@@ -476,11 +502,24 @@ impl Endpoint {
            }
        }
 
+        // check for file remote_extensions_spec.json
+        // if it is present, read it and pass to compute_ctl
+        let remote_extensions_spec_path = self.endpoint_path().join("remote_extensions_spec.json");
+        let remote_extensions_spec = std::fs::File::open(remote_extensions_spec_path);
+        let remote_extensions: Option<RemoteExtSpec>;
+
+        if let Ok(spec_file) = remote_extensions_spec {
+            remote_extensions = serde_json::from_reader(spec_file).ok();
+        } else {
+            remote_extensions = None;
+        };
+
        // Create spec file
        let spec = ComputeSpec {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
+            features: vec![],
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
@@ -497,7 +536,7 @@ impl Endpoint {
            pageserver_connstring: Some(pageserver_connstring),
            safekeeper_connstrings,
            storage_auth_token: auth_token.clone(),
-            remote_extensions: None,
+            remote_extensions,
        };
        let spec_path = self.endpoint_path().join("spec.json");
        std::fs::write(spec_path, serde_json::to_string_pretty(&spec)?)?;
@@ -11,6 +11,7 @@ use std::io::{BufReader, Write};
 use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::process::{Child, Command};
+use std::time::Duration;
 use std::{io, result};
 
 use anyhow::{bail, Context};
@@ -522,19 +523,24 @@ impl PageServerNode {
        &self,
        tenant_id: TenantId,
        config: LocationConfig,
+        flush_ms: Option<Duration>,
    ) -> anyhow::Result<()> {
        let req_body = TenantLocationConfigRequest { tenant_id, config };
 
-        self.http_request(
-            Method::PUT,
-            format!(
-                "{}/tenant/{}/location_config",
-                self.http_base_url, tenant_id
-            ),
-        )?
-        .json(&req_body)
-        .send()?
-        .error_from_body()?;
+        let path = format!(
+            "{}/tenant/{}/location_config",
+            self.http_base_url, tenant_id
+        );
+        let path = if let Some(flush_ms) = flush_ms {
+            format!("{}?flush_ms={}", path, flush_ms.as_millis())
+        } else {
+            path
+        };
+
+        self.http_request(Method::PUT, path)?
+            .json(&req_body)
+            .send()?
+            .error_from_body()?;
 
        Ok(())
    }
@@ -559,6 +565,7 @@ impl PageServerNode {
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
+        existing_initdb_timeline_id: Option<TimelineId>,
    ) -> anyhow::Result<TimelineInfo> {
        // If timeline ID was not specified, generate one
        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
@@ -572,6 +579,7 @@ impl PageServerNode {
                ancestor_start_lsn,
                ancestor_timeline_id,
                pg_version,
+                existing_initdb_timeline_id,
            })
            .send()?
            .error_from_body()?
@@ -14,7 +14,6 @@ use pageserver_api::models::{
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
-    generation::Generation,
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };
@@ -93,6 +92,22 @@ pub fn migrate_tenant(
    // Get a new generation
    let attachment_service = AttachmentService::from_env(env);
 
+    fn build_location_config(
+        mode: LocationConfigMode,
+        generation: Option<u32>,
+        secondary_conf: Option<LocationConfigSecondary>,
+    ) -> LocationConfig {
+        LocationConfig {
+            mode,
+            generation,
+            secondary_conf,
+            tenant_conf: TenantConfig::default(),
+            shard_number: 0,
+            shard_count: 0,
+            shard_stripe_size: 0,
+        }
+    }
+
    let previous = attachment_service.inspect(tenant_id)?;
    let mut baseline_lsns = None;
    if let Some((generation, origin_ps_id)) = &previous {
@@ -101,40 +116,26 @@ pub fn migrate_tenant(
        if origin_ps_id == &dest_ps.conf.id {
            println!("🔁 Already attached to {origin_ps_id}, freshening...");
            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
-            let dest_conf = LocationConfig {
-                mode: LocationConfigMode::AttachedSingle,
-                generation: gen.map(Generation::new),
-                secondary_conf: None,
-                tenant_conf: TenantConfig::default(),
-            };
-            dest_ps.location_config(tenant_id, dest_conf)?;
+            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+            dest_ps.location_config(tenant_id, dest_conf, None)?;
            println!("✅ Migration complete");
            return Ok(());
        }
 
        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");
 
-        let stale_conf = LocationConfig {
-            mode: LocationConfigMode::AttachedStale,
-            generation: Some(Generation::new(*generation)),
-            secondary_conf: None,
-            tenant_conf: TenantConfig::default(),
-        };
-        origin_ps.location_config(tenant_id, stale_conf)?;
+        let stale_conf =
+            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
+        origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
 
        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
    }
 
    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
-    let dest_conf = LocationConfig {
-        mode: LocationConfigMode::AttachedMulti,
-        generation: gen.map(Generation::new),
-        secondary_conf: None,
-        tenant_conf: TenantConfig::default(),
-    };
+    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);
 
    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
-    dest_ps.location_config(tenant_id, dest_conf)?;
+    dest_ps.location_config(tenant_id, dest_conf, None)?;
 
    if let Some(baseline) = baseline_lsns {
        println!("🕑 Waiting for LSN to catch up...");
@@ -164,37 +165,31 @@ pub fn migrate_tenant(
        let found = other_ps_tenants
            .into_iter()
            .map(|t| t.id)
-            .any(|i| i == tenant_id);
+            .any(|i| i.tenant_id == tenant_id);
        if !found {
            continue;
        }
 
        // Downgrade to a secondary location
-        let secondary_conf = LocationConfig {
-            mode: LocationConfigMode::Secondary,
-            generation: None,
-            secondary_conf: Some(LocationConfigSecondary { warm: true }),
-            tenant_conf: TenantConfig::default(),
-        };
+        let secondary_conf = build_location_config(
+            LocationConfigMode::Secondary,
+            None,
+            Some(LocationConfigSecondary { warm: true }),
+        );
 
        println!(
            "💤 Switching to secondary mode on pageserver {}",
            other_ps.conf.id
        );
-        other_ps.location_config(tenant_id, secondary_conf)?;
+        other_ps.location_config(tenant_id, secondary_conf, None)?;
    }
 
    println!(
        "🔁 Switching to AttachedSingle mode on pageserver {}",
        dest_ps.conf.id
    );
-    let dest_conf = LocationConfig {
-        mode: LocationConfigMode::AttachedSingle,
-        generation: gen.map(Generation::new),
-        secondary_conf: None,
-        tenant_conf: TenantConfig::default(),
-    };
-    dest_ps.location_config(tenant_id, dest_conf)?;
+    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+    dest_ps.location_config(tenant_id, dest_conf, None)?;
 
    println!("✅ Migration complete");
docs/rfcs/029-pageserver-wal-disaster-recovery.md (new file, 205 lines)
@@ -0,0 +1,205 @@

# Name

Created on: 2023-09-08
Author: Arpad Müller

## Summary

Enable the pageserver to recover from data corruption events by implementing
a feature to re-apply historic WAL records in parallel to the already occurring
WAL replay.

The feature is outside of the user-visible backup and history story, and only
serves as a second-level backup for the case that there is a bug in the
pageservers that corrupted the served pages.

The RFC proposes the addition of two new features:
* recover a broken branch from WAL (downtime is allowed)
* a test recovery system to recover random branches to make sure recovery works

## Motivation

The historic WAL is currently stored in S3 even after it has been replayed by
the pageserver and thus been integrated into the pageserver's storage system.
This is done to defend from data corruption failures inside the pageservers.

However, application of this WAL in the disaster recovery setting is currently
very manual, and we want to automate it to make it easier.

### Use cases

There are various use cases for this feature, like:

* The main motivation is replaying in the instance of pageservers corrupting
  data.
* We might want to, beyond the user-visible history features, through our
  support channels and upon customer request, in select instances, recover
  historic versions beyond the range of history that we officially support.
* Running the recovery process in the background for random tenant timelines
  to figure out if there was a corruption of data (we would compare with what
  the pageserver stores for the "official" timeline).
* Using the WAL to arrive at historic pages we can then back up to S3 so that
  the WAL itself can be discarded, or at least not used for future replays.
  Again, this sounds a lot like what the pageserver is already doing, but the
  point is to provide a fallback to the service provided by the pageserver.

## Design

### Design constraints

The main design constraint is that the feature needs to be *simple* enough that
the number of bugs is as low, and reliability as high, as possible: the main
goal of this endeavour is to achieve higher correctness than the pageserver.

For the background process, we cannot afford a downtime of the timeline that is
being cloned, as we don't want to restrict ourselves to offline tenants only.
In the scenario where we want to recover from disasters or roll back to a
historic lsn through support staff, downtimes are more affordable, and
inevitable if the original had been subject to the corruption. Ideally, the
two code paths would share code, so the solution would be designed for not
requiring downtimes.

### API endpoint changes

This RFC proposes two API endpoint changes, in the safekeeper and the
pageserver.

Remember, the pageserver timeline creation API endpoint is this URL:

```
/v1/tenant/{tenant_id}/timeline/
```

Where `{tenant_id}` is the ID of the tenant the timeline is created for,
and specified as part of the URL. The timeline ID is passed via the POST
request body as the only required parameter `new_timeline_id`.

This proposal adds one optional parameter called
`existing_initdb_timeline_id` to the request's json body. If the parameter
is not specified, behaviour stays as it is today: the pageserver runs
initdb.
If the parameter is specified, it is expected to point to a timeline ID.
In fact that ID might match `new_timeline_id`; what's important is that
S3 storage contains a matching initdb under the URL matching the given
tenant and timeline.

Having both `ancestor_timeline_id` and `existing_initdb_timeline_id`
specified is illegal and will yield an HTTP error. This feature is
only meant for the "main" branch that doesn't have any ancestors
of its own, as initdb is only relevant there.
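
For illustration only (the timeline ID and `pg_version` value below are
made-up placeholders, not part of the API change itself), a creation request
that reuses a previously uploaded initdb might look roughly like this:

```
POST /v1/tenant/{tenant_id}/timeline/
{
    "new_timeline_id": "de200bd42b49cc1814412c7e592dd6e9",
    "existing_initdb_timeline_id": "de200bd42b49cc1814412c7e592dd6e9",
    "pg_version": 15
}
```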

For the safekeeper, we propose the addition of the following copy endpoint:

```
/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
```

It is meant for POST requests with json, and takes the two URL parameters
`tenant_id` and `source_timeline_id`. The json request body contains
the two required parameters `target_timeline_id` and `until_lsn`.

After invoking, the copy endpoint starts a copy process of the WAL from
the source ID to the target ID. The lsn is updated according to the
progress of the API call.
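
As a sketch (the target timeline ID and LSN below are illustrative
placeholders), such a copy request could look like:

```
POST /v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
{
    "target_timeline_id": "b0a4c3e9d1f2a5b6c7d8e9f001122334",
    "until_lsn": "0/169C3C8"
}
```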

### Higher level features

We want the API changes to support the following higher level features:

* recovery-after-corruption DR of the main timeline of a tenant. This
  feature allows for downtime.
* test DR of the main timeline into a special copy timeline. This feature
  is meant to run against selected production tenants in the background,
  without the user noticing, so it does not allow for downtime.

The recovery-after-corruption DR only needs the pageserver changes.
It works as follows (a rough sketch of the two calls follows the list):

* delete the timeline from the pageservers via the timeline deletion API
* re-create it via the timeline creation API (same ID as before) and set
  `existing_initdb_timeline_id` to the same timeline ID
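
A minimal sketch of that two-call sequence against the pageserver HTTP API
(the exact shape of the deletion route is assumed here by analogy with the
creation endpoint above; `{timeline_id}` stands for the same ID throughout):

```
# 1. delete the corrupted timeline
DELETE /v1/tenant/{tenant_id}/timeline/{timeline_id}

# 2. re-create it under the same ID, reusing the initdb previously uploaded to S3
POST /v1/tenant/{tenant_id}/timeline/
{
    "new_timeline_id": "{timeline_id}",
    "existing_initdb_timeline_id": "{timeline_id}"
}
```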

The test DR also requires the copy primitive and works as follows:

* copy the WAL of the timeline to a new place
* create a new timeline for the tenant

## Non Goals

At the risk of being repetitive, the main goal of this feature is to be a
backup method, so reliability is very important. This implies that other
aspects like performance or space reduction are less important.

### Corrupt WAL

The process suggested by this RFC assumes that the WAL is free of corruption.
In some instances, corruption can make it into WAL, like for example when
higher level components like postgres or the application first read corrupt
data, and then execute a write with data derived from that earlier read. That
written data might then contain the corruption.

Common use cases can hit this quite easily. For example, an application reads
some counter, increments it, and then writes the new counter value to the
database.
On a lower level, the compute might put FPIs (Full Page Images) into the WAL,
which have corrupt data for rows unrelated to the write operation at hand.

Separating corrupt writes from non-corrupt ones is a hard problem in general,
and if the application was involved in making the corrupt write, a recovery
would also involve the application. Therefore, corruption that has made it into
the WAL is outside of the scope of this feature. However, the WAL replay can be
issued to right before the point in time where the corruption occurred. Then the
data loss is isolated to post-corruption writes only.

## Impacted components (e.g. pageserver, safekeeper, console, etc)

Most changes would happen to the pageservers.
For the higher level features, maybe other components like the console would
be involved.

We need to make sure that the shadow timelines are not subject to the usual
limits and billing we apply to existing timelines.

## Proposed implementation

The first problem to keep in mind is the reproducibility of `initdb`.
So an initial step would be to upload `initdb` snapshots to S3.

After that, we'd have the endpoint spawn a background process which
performs the replay of the WAL to that new timeline. This process should
follow the existing workflows as closely as possible, just using the
WAL records of a different timeline.

The timeline created will be in a special state that solely looks for WAL
entries of the timeline it is trying to copy. Once the target LSN is reached,
it turns into a normal timeline that also accepts writes to its own
timeline ID.

### Scalability

For now we want to run this entire process on a single node, and as
it is by nature linear, it's hard to parallelize. However, for the
verification workloads, we can easily start the WAL replay in parallel
for different points in time. This is valuable especially for tenants
with large WAL records.

Compare this with the tricks to make addition circuits execute with
lower latency by making them perform the addition for both possible
values of the carry bit, and then, in a second step, taking the
result for the carry bit that was actually obtained.

The other scalability dimension to consider is the WAL length, which
is a growing concern as tenants accumulate changes. There are
possible approaches to this, including creating snapshots of the
page files and uploading them to S3, but if we do this for every single
branch, we lose the cheap branching property.

### Implementation by component

The proposed changes for the various components of the neon architecture
are written up in this notion page:

https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2

### Unresolved questions

None known (outside of the ones mentioned above).
@@ -26,6 +26,13 @@ pub struct ComputeSpec {
    // but we don't use it for anything. Serde will ignore missing fields when
    // deserializing it.
    pub operation_uuid: Option<String>,
+
+    /// Compute features to enable. These feature flags are provided, when we
+    /// know all the details about client's compute, so they cannot be used
+    /// to change `Empty` compute behavior.
+    #[serde(default)]
+    pub features: Vec<ComputeFeature>,
+
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
@@ -68,6 +75,19 @@ pub struct ComputeSpec {
    pub remote_extensions: Option<RemoteExtSpec>,
 }
 
+/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeFeature {
+    // XXX: Add more feature flags here.
+
+    // This is a special feature flag that is used to represent unknown feature flags.
+    // Basically all unknown to enum flags are represented as this one. See unit test
+    // `parse_unknown_features()` for more details.
+    #[serde(other)]
+    UnknownFeature,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -187,8 +207,6 @@ pub struct DeltaOp {
 pub struct Role {
    pub name: PgIdent,
    pub encrypted_password: Option<String>,
-    pub replication: Option<bool>,
-    pub bypassrls: Option<bool>,
    pub options: GenericOptions,
 }
 
@@ -229,7 +247,10 @@ mod tests {
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+
+        // Features list defaults to empty vector.
+        assert!(spec.features.is_empty());
    }
 
    #[test]
@@ -241,4 +262,22 @@ mod tests {
        ob.insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
    }
+
+    #[test]
+    fn parse_unknown_features() {
+        // Test that unknown feature flags do not cause any errors.
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
+        let ob = json.as_object_mut().unwrap();
+
+        // Add unknown feature flags.
+        let features = vec!["foo_bar_feature", "baz_feature"];
+        ob.insert("features".into(), features.into());
+
+        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
+
+        assert!(spec.features.len() == 2);
+        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
+        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
+    }
 }
@@ -18,6 +18,7 @@ enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 hex.workspace = true
+thiserror.workspace = true
 
 workspace_hack.workspace = true
 
@@ -4,7 +4,9 @@
 //! See docs/rfcs/025-generation-numbers.md
 
 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::NodeId;
+
+use crate::shard::TenantShardId;
 
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachRequest {
@@ -13,7 +15,7 @@ pub struct ReAttachRequest {
 
 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub gen: u32,
 }
 
@@ -24,7 +26,7 @@ pub struct ReAttachResponse {
 
 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub gen: u32,
 }
 
@@ -40,6 +42,6 @@ pub struct ValidateResponse {
 
 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub valid: bool,
 }
@@ -140,3 +140,7 @@ impl Key {
        })
    }
 }
+
+pub fn is_rel_block_key(key: &Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
@@ -10,7 +10,6 @@ use serde_with::serde_as;
 use strum_macros;
 use utils::{
    completion,
-    generation::Generation,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
 };
@@ -180,6 +179,8 @@ pub struct TimelineCreateRequest {
    #[serde(default)]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
+    pub existing_initdb_timeline_id: Option<TimelineId>,
+    #[serde(default)]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }
@@ -262,10 +263,19 @@ pub struct LocationConfig {
    pub mode: LocationConfigMode,
    /// If attaching, in what generation?
    #[serde(default)]
-    pub generation: Option<Generation>,
+    pub generation: Option<u32>,
    #[serde(default)]
    pub secondary_conf: Option<LocationConfigSecondary>,
+
+    // Shard parameters: if shard_count is nonzero, then other shard_* fields
+    // must be set accurately.
+    #[serde(default)]
+    pub shard_number: u8,
+    #[serde(default)]
+    pub shard_count: u8,
+    #[serde(default)]
+    pub shard_stripe_size: u32,
+
    // If requesting mode `Secondary`, configuration for that.
    // Custom storage configuration for the tenant, if any
    pub tenant_conf: TenantConfig,
@@ -306,31 +316,14 @@ impl std::ops::Deref for TenantConfigRequest {
 
 impl TenantConfigRequest {
    pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
-        let config = TenantConfig {
-            checkpoint_distance: None,
-            checkpoint_timeout: None,
-            compaction_target_size: None,
-            compaction_period: None,
-            compaction_threshold: None,
-            gc_horizon: None,
-            gc_period: None,
-            image_creation_threshold: None,
-            pitr_interval: None,
-            walreceiver_connect_timeout: None,
-            lagging_wal_timeout: None,
-            max_lsn_wal_lag: None,
-            trace_read_requests: None,
-            eviction_policy: None,
-            min_resident_size_override: None,
-            evictions_low_residence_duration_metric_threshold: None,
-            gc_feedback: None,
-        };
+        let config = TenantConfig::default();
        TenantConfigRequest { tenant_id, config }
    }
 }
 
 #[derive(Debug, Deserialize)]
 pub struct TenantAttachRequest {
+    #[serde(default)]
    pub config: TenantAttachConfig,
    #[serde(default)]
    pub generation: Option<u32>,
@@ -338,7 +331,7 @@ pub struct TenantAttachRequest {
 
 /// Newtype to enforce deny_unknown_fields on TenantConfig for
 /// its usage inside `TenantAttachRequest`.
-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, Default)]
 #[serde(deny_unknown_fields)]
 pub struct TenantAttachConfig {
    #[serde(flatten)]
@@ -364,7 +357,7 @@ pub enum TenantAttachmentStatus {
 
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
-    pub id: TenantId,
+    pub id: TenantShardId,
    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
    pub state: TenantState,
    /// Sum of the size of all layer files.
@@ -376,7 +369,7 @@ pub struct TenantInfo {
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
 #[derive(Debug, Serialize, Deserialize, Clone)]
 pub struct TimelineInfo {
-    pub tenant_id: TenantId,
+    pub tenant_id: TenantShardId,
    pub timeline_id: TimelineId,
 
    pub ancestor_timeline_id: Option<TimelineId>,
@@ -392,7 +385,12 @@ pub struct TimelineInfo {
    /// The LSN that we are advertizing to safekeepers
    pub remote_consistent_lsn_visible: Lsn,
 
-    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    /// The LSN from the start of the root timeline (never changes)
+    pub initdb_lsn: Lsn,
+
+    pub current_logical_size: u64,
+    pub current_logical_size_is_accurate: bool,
+
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
@@ -828,7 +826,7 @@ mod tests {
    fn test_tenantinfo_serde() {
        // Test serialization/deserialization of TenantInfo
        let original_active = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Active,
            current_physical_size: Some(42),
            attachment_status: TenantAttachmentStatus::Attached,
@@ -845,7 +843,7 @@ mod tests {
        });
 
        let original_broken = TenantInfo {
-            id: TenantId::generate(),
+            id: TenantShardId::unsharded(TenantId::generate()),
            state: TenantState::Broken {
                reason: "reason".into(),
                backtrace: "backtrace info".into(),
@@ -1,13 +1,15 @@
|
|||||||
use std::{ops::RangeInclusive, str::FromStr};
|
use std::{ops::RangeInclusive, str::FromStr};
|
||||||
|
|
||||||
|
use crate::key::{is_rel_block_key, Key};
|
||||||
use hex::FromHex;
|
use hex::FromHex;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use thiserror;
|
||||||
use utils::id::TenantId;
|
use utils::id::TenantId;
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardNumber(pub u8);
|
pub struct ShardNumber(pub u8);
|
||||||
|
|
||||||
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
|
#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
|
||||||
pub struct ShardCount(pub u8);
|
pub struct ShardCount(pub u8);
|
||||||
|
|
||||||
impl ShardCount {
|
impl ShardCount {
|
||||||
@@ -38,7 +40,7 @@ impl ShardNumber {
|
|||||||
/// Note that the binary encoding is _not_ backward compatible, because
|
/// Note that the binary encoding is _not_ backward compatible, because
|
||||||
/// at the time sharding is introduced, there are no existing binary structures
|
/// at the time sharding is introduced, there are no existing binary structures
|
||||||
/// containing TenantId that we need to handle.
|
/// containing TenantId that we need to handle.
|
||||||
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
|
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
|
||||||
pub struct TenantShardId {
|
pub struct TenantShardId {
|
||||||
pub tenant_id: TenantId,
|
pub tenant_id: TenantId,
|
||||||
pub shard_number: ShardNumber,
|
pub shard_number: ShardNumber,
|
||||||
@@ -71,19 +73,33 @@ impl TenantShardId {
|
|||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn shard_slug(&self) -> String {
|
pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
|
||||||
format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
|
ShardSlug(self)
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Convenience for code that has special behavior on the 0th shard.
|
||||||
|
pub fn is_zero(&self) -> bool {
|
||||||
|
self.shard_number == ShardNumber(0)
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Formatting helper
|
||||||
|
struct ShardSlug<'a>(&'a TenantShardId);
|
||||||
|
|
||||||
|
impl<'a> std::fmt::Display for ShardSlug<'a> {
|
||||||
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
|
write!(
|
||||||
|
f,
|
||||||
|
"{:02x}{:02x}",
|
||||||
|
self.0.shard_number.0, self.0.shard_count.0
|
||||||
|
)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for TenantShardId {
|
impl std::fmt::Display for TenantShardId {
|
||||||
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
|
||||||
if self.shard_count != ShardCount(0) {
|
if self.shard_count != ShardCount(0) {
|
||||||
write!(
|
write!(f, "{}-{}", self.tenant_id, self.shard_slug())
|
||||||
f,
|
|
||||||
"{}-{:02x}{:02x}",
|
|
||||||
self.tenant_id, self.shard_number.0, self.shard_count.0
|
|
||||||
)
|
|
||||||
} else {
|
} else {
|
||||||
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
|
// Legacy case (shard_count == 0) -- format as just the tenant id. Note that this
|
||||||
// is distinct from the normal single shard case (shard count == 1).
|
// is distinct from the normal single shard case (shard count == 1).
|
||||||
@@ -139,6 +155,89 @@ impl From<[u8; 18]> for TenantShardId {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||

/// For use within the context of a particular tenant, when we need to know which
/// shard we're dealing with, but do not need to know the full ShardIdentity (because
/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
/// TenantShardId.
#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
pub struct ShardIndex {
    pub shard_number: ShardNumber,
    pub shard_count: ShardCount,
}

impl ShardIndex {
    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
        Self {
            shard_number: number,
            shard_count: count,
        }
    }
    pub fn unsharded() -> Self {
        Self {
            shard_number: ShardNumber(0),
            shard_count: ShardCount(0),
        }
    }

    pub fn is_unsharded(&self) -> bool {
        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
    }

    /// For use in constructing remote storage paths: concatenate this with a TenantId
    /// to get a fully qualified TenantShardId.
    ///
    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
    /// that the legacy pre-sharding remote key format is preserved.
    pub fn get_suffix(&self) -> String {
        if self.is_unsharded() {
            "".to_string()
        } else {
            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
        }
    }
}

impl std::fmt::Display for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
    }
}

impl std::fmt::Debug for ShardIndex {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        // Debug is the same as Display: the compact hex representation
        write!(f, "{}", self)
    }
}

impl std::str::FromStr for ShardIndex {
    type Err = hex::FromHexError;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        // Expect format: 1 byte shard number, 1 byte shard count
        if s.len() == 4 {
            let bytes = s.as_bytes();
            let mut shard_parts: [u8; 2] = [0u8; 2];
            hex::decode_to_slice(bytes, &mut shard_parts)?;
            Ok(Self {
                shard_number: ShardNumber(shard_parts[0]),
                shard_count: ShardCount(shard_parts[1]),
            })
        } else {
            Err(hex::FromHexError::InvalidStringLength)
        }
    }
}

impl From<[u8; 2]> for ShardIndex {
    fn from(b: [u8; 2]) -> Self {
        Self {
            shard_number: ShardNumber(b[0]),
            shard_count: ShardCount(b[1]),
        }
    }
}

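To make the backward-compatibility note on `get_suffix` concrete, here is a hedged sketch of how the suffix composes into a remote object key; the exact key layout used by the pageserver is an assumption, and only `get_suffix` comes from the code above:

```rust
// Illustrative helper (hypothetical), showing why the empty suffix preserves the
// legacy pre-sharding key format.
fn index_part_key(tenant_id: &str, timeline_id: &str, shard: ShardIndex) -> String {
    // Unsharded: tenants/<tenant>/timelines/<timeline>/index_part.json
    // Sharded:   tenants/<tenant>-0104/timelines/<timeline>/index_part.json
    format!(
        "tenants/{}{}/timelines/{}/index_part.json",
        tenant_id,
        shard.get_suffix(),
        timeline_id
    )
}
```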
impl Serialize for TenantShardId {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where

@@ -209,6 +308,261 @@ impl<'de> Deserialize<'de> for TenantShardId {
    }
}

/// Stripe size in number of pages
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardStripeSize(pub u32);

/// Layout version: for future upgrades where we might change how the key->shard mapping works
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardLayout(u8);

const LAYOUT_V1: ShardLayout = ShardLayout(1);
/// ShardIdentity uses a magic layout value to indicate if it is unusable
const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);

/// Default stripe size in pages: 256MiB divided by 8kiB page size.
const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);

/// The ShardIdentity contains the information needed for one member of a map
/// to resolve a key to a shard, and then check whether that shard is ==self.
#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
pub struct ShardIdentity {
    pub number: ShardNumber,
    pub count: ShardCount,
    stripe_size: ShardStripeSize,
    layout: ShardLayout,
}

#[derive(thiserror::Error, Debug, PartialEq, Eq)]
pub enum ShardConfigError {
    #[error("Invalid shard count")]
    InvalidCount,
    #[error("Invalid shard number")]
    InvalidNumber,
    #[error("Invalid stripe size")]
    InvalidStripeSize,
}

impl ShardIdentity {
    /// An identity with number=0 count=0 is a "none" identity, which represents legacy
    /// tenants. Modern single-shard tenants should not use this: they should
    /// have number=0 count=1.
    pub fn unsharded() -> Self {
        Self {
            number: ShardNumber(0),
            count: ShardCount(0),
            layout: LAYOUT_V1,
            stripe_size: DEFAULT_STRIPE_SIZE,
        }
    }

    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
    /// which are constructed in code paths that don't have access to proper configuration.
    ///
    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
    /// Enforcement is via assertions, to avoid making our interface fallible for this
    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
    /// state, and by extension to avoid trying to do any page->shard resolution.
    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
        Self {
            number,
            count,
            layout: LAYOUT_BROKEN,
            stripe_size: DEFAULT_STRIPE_SIZE,
        }
    }

    pub fn is_unsharded(&self) -> bool {
        self.number == ShardNumber(0) && self.count == ShardCount(0)
    }

    /// Count must be nonzero, and number must be < count. To construct
    /// the legacy case (count==0), use Self::unsharded instead.
    pub fn new(
        number: ShardNumber,
        count: ShardCount,
        stripe_size: ShardStripeSize,
    ) -> Result<Self, ShardConfigError> {
        if count.0 == 0 {
            Err(ShardConfigError::InvalidCount)
        } else if number.0 > count.0 - 1 {
            Err(ShardConfigError::InvalidNumber)
        } else if stripe_size.0 == 0 {
            Err(ShardConfigError::InvalidStripeSize)
        } else {
            Ok(Self {
                number,
                count,
                layout: LAYOUT_V1,
                stripe_size,
            })
        }
    }

    fn is_broken(&self) -> bool {
        self.layout == LAYOUT_BROKEN
    }

    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
        assert!(!self.is_broken());
        key_to_shard_number(self.count, self.stripe_size, key)
    }

    /// Return true if the key should be ingested by this shard
    pub fn is_key_local(&self, key: &Key) -> bool {
        assert!(!self.is_broken());
        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
            true
        } else {
            key_to_shard_number(self.count, self.stripe_size, key) == self.number
        }
    }

    pub fn shard_slug(&self) -> String {
        if self.count > ShardCount(0) {
            format!("-{:02x}{:02x}", self.number.0, self.count.0)
        } else {
            String::new()
        }
    }

    /// Convenience for checking if this identity is the 0th shard in a tenant,
    /// for special cases on shard 0 such as ingesting relation sizes.
    pub fn is_zero(&self) -> bool {
        self.number == ShardNumber(0)
    }
}
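A quick illustration of the constructor and helpers above, as a sketch only (it assumes it runs inside this module, since `DEFAULT_STRIPE_SIZE` is private):

```rust
fn shard_identity_sketch() -> Result<(), ShardConfigError> {
    // Shard 1 of 4 with the default stripe size.
    let identity = ShardIdentity::new(ShardNumber(1), ShardCount(4), DEFAULT_STRIPE_SIZE)?;

    assert!(!identity.is_unsharded());
    assert!(!identity.is_zero());
    assert_eq!(identity.shard_slug(), "-0104");

    // Out-of-range shard numbers are rejected rather than silently wrapped.
    assert_eq!(
        ShardIdentity::new(ShardNumber(4), ShardCount(4), DEFAULT_STRIPE_SIZE),
        Err(ShardConfigError::InvalidNumber)
    );
    Ok(())
}
```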

impl Serialize for ShardIndex {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
        S: serde::Serializer,
    {
        if serializer.is_human_readable() {
            serializer.collect_str(self)
        } else {
            // Binary encoding is not used in index_part.json, but is included in anticipation of
            // switching various structures (e.g. inter-process communication, remote metadata) to more
            // compact binary encodings in future.
            let mut packed: [u8; 2] = [0; 2];
            packed[0] = self.shard_number.0;
            packed[1] = self.shard_count.0;
            packed.serialize(serializer)
        }
    }
}

impl<'de> Deserialize<'de> for ShardIndex {
    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
    where
        D: serde::Deserializer<'de>,
    {
        struct IdVisitor {
            is_human_readable_deserializer: bool,
        }

        impl<'de> serde::de::Visitor<'de> for IdVisitor {
            type Value = ShardIndex;

            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
                if self.is_human_readable_deserializer {
                    formatter.write_str("value in form of hex string")
                } else {
                    formatter.write_str("value in form of integer array([u8; 2])")
                }
            }

            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
            where
                A: serde::de::SeqAccess<'de>,
            {
                let s = serde::de::value::SeqAccessDeserializer::new(seq);
                let id: [u8; 2] = Deserialize::deserialize(s)?;
                Ok(ShardIndex::from(id))
            }

            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
            where
                E: serde::de::Error,
            {
                ShardIndex::from_str(v).map_err(E::custom)
            }
        }

        if deserializer.is_human_readable() {
            deserializer.deserialize_str(IdVisitor {
                is_human_readable_deserializer: true,
            })
        } else {
            deserializer.deserialize_tuple(
                2,
                IdVisitor {
                    is_human_readable_deserializer: false,
                },
            )
        }
    }
}

/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
/// in order to be able to serve basebackup requests without peer communication).
fn key_is_shard0(key: &Key) -> bool {
    // To decide what to shard out to shards >0, we apply a simple rule that only
    // relation pages are distributed to shards other than shard zero. Everything else gets
    // stored on shard 0. This guarantees that shard 0 can independently serve basebackup
    // requests, and any request other than those for particular blocks in relations.
    //
    // In this condition:
    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
    //   all metadata.
    // - field6 is set to -1 for relation size pages.
    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
}

/// Provide the same result as the function in postgres `hashfn.h` with the same name
fn murmurhash32(mut h: u32) -> u32 {
    h ^= h >> 16;
    h = h.wrapping_mul(0x85ebca6b);
    h ^= h >> 13;
    h = h.wrapping_mul(0xc2b2ae35);
    h ^= h >> 16;
    h
}

/// Provide the same result as the function in postgres `hashfn.h` with the same name
fn hash_combine(mut a: u32, mut b: u32) -> u32 {
    b = b.wrapping_add(0x9e3779b9);
    b = b.wrapping_add(a << 6);
    b = b.wrapping_add(a >> 2);

    a ^= b;
    a
}

/// Where a Key is to be distributed across shards, select the shard. This function
/// does not account for keys that should be broadcast across shards.
///
/// The hashing in this function must exactly match what we do in postgres smgr
/// code. The resulting distribution of pages is intended to preserve locality within
/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
/// distributing data pseudo-randomly.
///
/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
/// and will be handled at higher levels when shards are split.
fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
    // Fast path for un-sharded tenants or broadcast keys
    if count < ShardCount(2) || key_is_shard0(key) {
        return ShardNumber(0);
    }

    // relNode
    let mut hash = murmurhash32(key.field4);
    // blockNum/stripe size
    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));

    ShardNumber((hash % count.0 as u32) as u8)
}
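Because the hash only sees the relation (field4) and the stripe index (field6 divided by the stripe size), every block inside one stripe of a relation resolves to the same shard; only crossing a stripe boundary can change the placement. A small sketch of that locality property, using the private helpers above (so it assumes module scope; the relation number is arbitrary):

```rust
fn stripe_locality_sketch() {
    let count = ShardCount(8);
    let stripe = DEFAULT_STRIPE_SIZE.0; // 32768 pages per stripe

    let shard_of = |rel: u32, block: u32| -> u8 {
        let mut hash = murmurhash32(rel);
        hash = hash_combine(hash, murmurhash32(block / stripe));
        (hash % count.0 as u32) as u8
    };

    // All blocks in the first stripe of a relation land on the same shard...
    assert_eq!(shard_of(0x400c, 0), shard_of(0x400c, stripe - 1));
    // ...while the next stripe is placed pseudo-randomly and may differ.
    let _next_stripe_shard = shard_of(0x400c, stripe);
}
```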

#[cfg(test)]
mod tests {
    use std::str::FromStr;

@@ -318,4 +672,91 @@ mod tests {

        Ok(())
    }

    #[test]
    fn shard_identity_validation() -> Result<(), ShardConfigError> {
        // Happy cases
        ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?;
        ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?;
        ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?;

        assert_eq!(
            ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE),
            Err(ShardConfigError::InvalidCount)
        );
        assert_eq!(
            ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE),
            Err(ShardConfigError::InvalidNumber)
        );
        assert_eq!(
            ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE),
            Err(ShardConfigError::InvalidNumber)
        );
        assert_eq!(
            ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE),
            Err(ShardConfigError::InvalidNumber)
        );
        assert_eq!(
            ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)),
            Err(ShardConfigError::InvalidStripeSize)
        );

        Ok(())
    }

    #[test]
    fn shard_index_human_encoding() -> Result<(), hex::FromHexError> {
        let example = ShardIndex {
            shard_number: ShardNumber(13),
            shard_count: ShardCount(17),
        };
        let expected: String = "0d11".to_string();
        let encoded = format!("{example}");
        assert_eq!(&encoded, &expected);

        let decoded = ShardIndex::from_str(&encoded)?;
        assert_eq!(example, decoded);
        Ok(())
    }

    #[test]
    fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> {
        let example = ShardIndex {
            shard_number: ShardNumber(13),
            shard_count: ShardCount(17),
        };
        let expected: [u8; 2] = [0x0d, 0x11];

        let encoded = bincode::serialize(&example).unwrap();
        assert_eq!(Hex(&encoded), Hex(&expected));
        let decoded = bincode::deserialize(&encoded).unwrap();
        assert_eq!(example, decoded);

        Ok(())
    }

    // These are only smoke tests to spot check that our implementation doesn't
    // deviate from a few examples values: not aiming to validate the overall
    // hashing algorithm.
    #[test]
    fn murmur_hash() {
        assert_eq!(murmurhash32(0), 0);

        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
    }

    #[test]
    fn shard_mapping() {
        let key = Key {
            field1: 0x00,
            field2: 0x67f,
            field3: 0x5,
            field4: 0x400c,
            field5: 0x00,
            field6: 0x7d06,
        };

        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
        assert_eq!(shard, ShardNumber(8));
    }
}

@@ -289,10 +289,10 @@ impl FeStartupPacket {
        // We shouldn't advance `buf` as probably full message is not there yet,
        // so can't directly use Bytes::get_u32 etc.
        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-       // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+       // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
        // which is less readable
        #[allow(clippy::manual_range_contains)]
-       if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
+       if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
            return Err(ProtocolError::Protocol(format!(
                "invalid startup packet message length {}",
                len
@@ -975,4 +975,10 @@ mod tests {
        let params = make_params("foo\\ bar \\ \\\\ baz\\ lol");
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }

    #[test]
    fn parse_fe_startup_packet_regression() {
        let data = [0, 0, 0, 7, 0, 0, 0, 0];
        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
    }
}
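The minimum length moving from 4 to 8 matches the wire format: a startup-family packet begins with a 4-byte length (which counts itself) followed by a 4-byte protocol or request code, so an advertised length below 8 cannot hold even an empty packet (SSLRequest, for example, is exactly 8 bytes). A hedged sketch of that framing; the helper below is illustrative and not part of the diff:

```rust
// Illustrative only: the smallest well-formed startup-family packet is 8 bytes.
fn minimal_startup_packet(code: u32) -> Vec<u8> {
    let len: u32 = 8; // 4 bytes of length + 4 bytes of version/request code
    let mut buf = Vec::with_capacity(len as usize);
    buf.extend_from_slice(&len.to_be_bytes());
    buf.extend_from_slice(&code.to_be_bytes());
    buf
}
// The regression test above advertises a length of 7, which the parser now
// rejects up front instead of trying to read a truncated body.
```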

@@ -9,18 +9,18 @@ anyhow.workspace = true
async-trait.workspace = true
once_cell.workspace = true
aws-smithy-async.workspace = true
-aws-smithy-http.workspace = true
+aws-smithy-types.workspace = true
-aws-types.workspace = true
aws-config.workspace = true
aws-sdk-s3.workspace = true
aws-credential-types.workspace = true
bytes.workspace = true
camino.workspace = true
hyper = { workspace = true, features = ["stream"] }
+futures.workspace = true
serde.workspace = true
serde_json.workspace = true
tokio = { workspace = true, features = ["sync", "fs", "io-util"] }
-tokio-util.workspace = true
+tokio-util = { workspace = true, features = ["compat"] }
toml_edit.workspace = true
tracing.workspace = true
scopeguard.workspace = true

@@ -1,21 +1,24 @@
//! Azure Blob Storage wrapper

+use std::borrow::Cow;
use std::collections::HashMap;
use std::env;
use std::num::NonZeroU32;
+use std::pin::Pin;
use std::sync::Arc;
-use std::{borrow::Cow, io::Cursor};

use super::REMOTE_STORAGE_PREFIX_SEPARATOR;
use anyhow::Result;
use azure_core::request_options::{MaxResults, Metadata, Range};
+use azure_core::RetryOptions;
use azure_identity::DefaultAzureCredential;
use azure_storage::StorageCredentials;
use azure_storage_blobs::prelude::ClientBuilder;
use azure_storage_blobs::{blob::operations::GetBlobBuilder, prelude::ContainerClient};
+use bytes::Bytes;
+use futures::stream::Stream;
use futures_util::StreamExt;
use http_types::StatusCode;
-use tokio::io::AsyncRead;
use tracing::debug;

use crate::s3_bucket::RequestKind;
@@ -49,7 +52,8 @@ impl AzureBlobStorage {
            StorageCredentials::token_credential(Arc::new(token_credential))
        };

-       let builder = ClientBuilder::new(account, credentials);
+       // we have an outer retry
+       let builder = ClientBuilder::new(account, credentials).retry(RetryOptions::none());

        let client = builder.container_client(azure_config.container_name.to_owned());

@@ -116,7 +120,8 @@ impl AzureBlobStorage {
        let mut metadata = HashMap::new();
        // TODO give proper streaming response instead of buffering into RAM
        // https://github.com/neondatabase/neon/issues/5563
-       let mut buf = Vec::new();
+       let mut bufs = Vec::new();
        while let Some(part) = response.next().await {
            let part = part.map_err(to_download_error)?;
            if let Some(blob_meta) = part.blob.metadata {
@@ -127,10 +132,10 @@ impl AzureBlobStorage {
                .collect()
                .await
                .map_err(|e| DownloadError::Other(e.into()))?;
-           buf.extend_from_slice(&data.slice(..));
+           bufs.push(data);
        }
        Ok(Download {
-           download_stream: Box::pin(Cursor::new(buf)),
+           download_stream: Box::pin(futures::stream::iter(bufs.into_iter().map(Ok))),
            metadata: Some(StorageMetadata(metadata)),
        })
    }
@@ -217,9 +222,10 @@ impl RemoteStorage for AzureBlobStorage {
        }
        Ok(res)
    }

    async fn upload(
        &self,
-       mut from: impl AsyncRead + Unpin + Send + Sync + 'static,
+       from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -227,13 +233,12 @@ impl RemoteStorage for AzureBlobStorage {
        let _permit = self.permit(RequestKind::Put).await;
        let blob_client = self.client.blob_client(self.relative_path_to_name(to));

-       // TODO FIX THIS UGLY HACK and don't buffer the entire object
-       // into RAM here, but use the streaming interface. For that,
-       // we'd have to change the interface though...
-       // https://github.com/neondatabase/neon/issues/5563
-       let mut buf = Vec::with_capacity(data_size_bytes);
-       tokio::io::copy(&mut from, &mut buf).await?;
-       let body = azure_core::Body::Bytes(buf.into());
+       let from: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static>> =
+           Box::pin(from);
+
+       let from = NonSeekableStream::new(from, data_size_bytes);
+
+       let body = azure_core::Body::SeekableStream(Box::new(from));

        let mut builder = blob_client.put_block_blob(body);

@@ -266,17 +271,12 @@ impl RemoteStorage for AzureBlobStorage {

        let mut builder = blob_client.get();

-       if let Some(end_exclusive) = end_exclusive {
-           builder = builder.range(Range::new(start_inclusive, end_exclusive));
+       let range: Range = if let Some(end_exclusive) = end_exclusive {
+           (start_inclusive..end_exclusive).into()
        } else {
-           // Open ranges are not supported by the SDK so we work around
-           // by setting the upper limit extremely high (but high enough
-           // to still be representable by signed 64 bit integers).
-           // TODO remove workaround once the SDK adds open range support
-           // https://github.com/Azure/azure-sdk-for-rust/issues/1438
-           let end_exclusive = u64::MAX / 4;
-           builder = builder.range(Range::new(start_inclusive, end_exclusive));
-       }
+           (start_inclusive..).into()
+       };
+       builder = builder.range(range);

        self.download_for_builder(builder).await
    }
@@ -312,3 +312,153 @@ impl RemoteStorage for AzureBlobStorage {
        Ok(())
    }
}

pin_project_lite::pin_project! {
    /// Hack to work around not being able to stream once with azure sdk.
    ///
    /// Azure sdk clones streams around with the assumption that they are like
    /// `Arc<tokio::fs::File>` (except not supporting tokio), however our streams are not like
    /// that. For example for an `index_part.json` we just have a single chunk of [`Bytes`]
    /// representing the whole serialized vec. It could be trivially cloneable and "semi-trivially"
    /// seekable, but we can also just re-try the request easier.
    #[project = NonSeekableStreamProj]
    enum NonSeekableStream<S> {
        /// A stream wrappers initial form.
        ///
        /// Mutex exists to allow moving when cloning. If the sdk changes to do less than 1
        /// clone before first request, then this must be changed.
        Initial {
            inner: std::sync::Mutex<Option<tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>>>,
            len: usize,
        },
        /// The actually readable variant, produced by cloning the Initial variant.
        ///
        /// The sdk currently always clones once, even without retry policy.
        Actual {
            #[pin]
            inner: tokio_util::compat::Compat<tokio_util::io::StreamReader<S, Bytes>>,
            len: usize,
            read_any: bool,
        },
        /// Most likely unneeded, but left to make life easier, in case more clones are added.
        Cloned {
            len_was: usize,
        }
    }
}

impl<S> NonSeekableStream<S>
where
    S: Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
{
    fn new(inner: S, len: usize) -> NonSeekableStream<S> {
        use tokio_util::compat::TokioAsyncReadCompatExt;

        let inner = tokio_util::io::StreamReader::new(inner).compat();
        let inner = Some(inner);
        let inner = std::sync::Mutex::new(inner);
        NonSeekableStream::Initial { inner, len }
    }
}

impl<S> std::fmt::Debug for NonSeekableStream<S> {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        match self {
            Self::Initial { len, .. } => f.debug_struct("Initial").field("len", len).finish(),
            Self::Actual { len, .. } => f.debug_struct("Actual").field("len", len).finish(),
            Self::Cloned { len_was, .. } => f.debug_struct("Cloned").field("len", len_was).finish(),
        }
    }
}

impl<S> futures::io::AsyncRead for NonSeekableStream<S>
where
    S: Stream<Item = std::io::Result<Bytes>>,
{
    fn poll_read(
        self: std::pin::Pin<&mut Self>,
        cx: &mut std::task::Context<'_>,
        buf: &mut [u8],
    ) -> std::task::Poll<std::io::Result<usize>> {
        match self.project() {
            NonSeekableStreamProj::Actual {
                inner, read_any, ..
            } => {
                *read_any = true;
                inner.poll_read(cx, buf)
            }
            // NonSeekableStream::Initial does not support reading because it is just much easier
            // to have the mutex in place where one does not poll the contents, or that's how it
            // seemed originally. If there is a version upgrade which changes the cloning, then
            // that support needs to be hacked in.
            //
            // including {self:?} into the message would be useful, but unsure how to unproject.
            _ => std::task::Poll::Ready(Err(std::io::Error::new(
                std::io::ErrorKind::Other,
                "cloned or initial values cannot be read",
            ))),
        }
    }
}

impl<S> Clone for NonSeekableStream<S> {
    /// Weird clone implementation exists to support the sdk doing cloning before issuing the first
    /// request, see type documentation.
    fn clone(&self) -> Self {
        use NonSeekableStream::*;

        match self {
            Initial { inner, len } => {
                if let Some(inner) = inner.lock().unwrap().take() {
                    Actual {
                        inner,
                        len: *len,
                        read_any: false,
                    }
                } else {
                    Self::Cloned { len_was: *len }
                }
            }
            Actual { len, .. } => Cloned { len_was: *len },
            Cloned { len_was } => Cloned { len_was: *len_was },
        }
    }
}

#[async_trait::async_trait]
impl<S> azure_core::SeekableStream for NonSeekableStream<S>
where
    S: Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync + 'static,
{
    async fn reset(&mut self) -> azure_core::error::Result<()> {
        use NonSeekableStream::*;

        let msg = match self {
            Initial { inner, .. } => {
                if inner.get_mut().unwrap().is_some() {
                    return Ok(());
                } else {
                    "reset after first clone is not supported"
                }
            }
            Actual { read_any, .. } if !*read_any => return Ok(()),
            Actual { .. } => "reset after reading is not supported",
            Cloned { .. } => "reset after second clone is not supported",
        };
        Err(azure_core::error::Error::new(
            azure_core::error::ErrorKind::Io,
            std::io::Error::new(std::io::ErrorKind::Other, msg),
        ))
    }

    // Note: it is not documented if this should be the total or remaining length, total passes the
    // tests.
    fn len(&self) -> usize {
        use NonSeekableStream::*;
        match self {
            Initial { len, .. } => *len,
            Actual { len, .. } => *len,
            Cloned { len_was, .. } => *len_was,
        }
    }
}

@@ -19,8 +19,10 @@ use std::{collections::HashMap, fmt::Debug, num::NonZeroUsize, pin::Pin, sync::A
use anyhow::{bail, Context};
use camino::{Utf8Path, Utf8PathBuf};

+use bytes::Bytes;
+use futures::stream::Stream;
use serde::{Deserialize, Serialize};
-use tokio::{io, sync::Semaphore};
+use tokio::sync::Semaphore;
use toml_edit::Item;
use tracing::info;

@@ -179,7 +181,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    /// Streams the local file contents into remote into the remote storage entry.
    async fn upload(
        &self,
-       from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+       from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
        data_size_bytes: usize,
@@ -206,7 +208,7 @@ pub trait RemoteStorage: Send + Sync + 'static {
    }

pub struct Download {
-   pub download_stream: Pin<Box<dyn io::AsyncRead + Unpin + Send + Sync>>,
+   pub download_stream: Pin<Box<dyn Stream<Item = std::io::Result<Bytes>> + Unpin + Send + Sync>>,
    /// Extra key-value data, associated with the current remote file.
    pub metadata: Option<StorageMetadata>,
}
@@ -300,7 +302,7 @@ impl GenericRemoteStorage {

    pub async fn upload(
        &self,
-       from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+       from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -398,7 +400,7 @@ impl GenericRemoteStorage {
    /// this path is used for the remote object id conversion only.
    pub async fn upload_storage_object(
        &self,
-       from: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+       from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
        from_size_bytes: usize,
        to: &RemotePath,
    ) -> anyhow::Result<()> {
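With `upload` now taking a stream of `Bytes` chunks instead of an `AsyncRead`, callers wrap their sources explicitly. A hedged sketch of the two common adaptations (the storage handle, remote path, and file name here are placeholders, not from the diff):

```rust
use bytes::Bytes;
use tokio_util::io::ReaderStream;

async fn upload_examples(storage: &GenericRemoteStorage, to: &RemotePath) -> anyhow::Result<()> {
    // 1. A file on disk becomes a stream of chunks via ReaderStream.
    let file = tokio::fs::File::open("/tmp/example.bin").await?;
    let len = file.metadata().await?.len() as usize;
    storage.upload(ReaderStream::new(file), len, to, None).await?;

    // 2. An in-memory buffer becomes a one-chunk stream.
    let body = Bytes::from_static(b"hello");
    let len = body.len();
    storage
        .upload(futures::stream::once(futures::future::ready(Ok(body))), len, to, None)
        .await?;
    Ok(())
}
```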

@@ -7,11 +7,14 @@
use std::{borrow::Cow, future::Future, io::ErrorKind, pin::Pin};

use anyhow::{bail, ensure, Context};
+use bytes::Bytes;
use camino::{Utf8Path, Utf8PathBuf};
+use futures::stream::Stream;
use tokio::{
    fs,
    io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
};
+use tokio_util::io::ReaderStream;
use tracing::*;
use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};

@@ -99,27 +102,35 @@ impl LocalFs {
        };

        // If we were given a directory, we may use it as our starting point.
-       // Otherwise, we must go up to the parent directory. This is because
+       // Otherwise, we must go up to the first ancestor dir that exists. This is because
        // S3 object list prefixes can be arbitrary strings, but when reading
        // the local filesystem we need a directory to start calling read_dir on.
        let mut initial_dir = full_path.clone();
-       match fs::metadata(full_path.clone()).await {
-           Ok(meta) => {
-               if !meta.is_dir() {
+       loop {
+           // Did we make it to the root?
+           if initial_dir.parent().is_none() {
+               anyhow::bail!("list_files: failed to find valid ancestor dir for {full_path}");
+           }
+
+           match fs::metadata(initial_dir.clone()).await {
+               Ok(meta) if meta.is_dir() => {
+                   // We found a directory, break
+                   break;
+               }
+               Ok(_meta) => {
                    // It's not a directory: strip back to the parent
                    initial_dir.pop();
                }
-           }
-           Err(e) if e.kind() == ErrorKind::NotFound => {
-               // It's not a file that exists: strip the prefix back to the parent directory
-               initial_dir.pop();
-           }
-           Err(e) => {
-               // Unexpected I/O error
-               anyhow::bail!(e)
+               Err(e) if e.kind() == ErrorKind::NotFound => {
+                   // It's not a file that exists: strip the prefix back to the parent directory
+                   initial_dir.pop();
+               }
+               Err(e) => {
+                   // Unexpected I/O error
+                   anyhow::bail!(e)
+               }
            }
        }

        // Note that Utf8PathBuf starts_with only considers full path segments, but
        // object prefixes are arbitrary strings, so we need the strings for doing
        // starts_with later.
@@ -211,7 +222,7 @@ impl RemoteStorage for LocalFs {

    async fn upload(
        &self,
-       data: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+       data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync,
        data_size_bytes: usize,
        to: &RemotePath,
        metadata: Option<StorageMetadata>,
@@ -244,9 +255,12 @@ impl RemoteStorage for LocalFs {
        );

        let from_size_bytes = data_size_bytes as u64;
+       let data = tokio_util::io::StreamReader::new(data);
+       let data = std::pin::pin!(data);
        let mut buffer_to_read = data.take(from_size_bytes);

-       let bytes_read = io::copy(&mut buffer_to_read, &mut destination)
+       // alternatively we could just write the bytes to a file, but local_fs is a testing utility
+       let bytes_read = io::copy_buf(&mut buffer_to_read, &mut destination)
            .await
            .with_context(|| {
                format!(
@@ -300,7 +314,7 @@ impl RemoteStorage for LocalFs {
    async fn download(&self, from: &RemotePath) -> Result<Download, DownloadError> {
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-           let source = io::BufReader::new(
+           let source = ReaderStream::new(
                fs::OpenOptions::new()
                    .read(true)
                    .open(&target_path)
@@ -340,16 +354,14 @@ impl RemoteStorage for LocalFs {
        }
        let target_path = from.with_base(&self.storage_root);
        if file_exists(&target_path).map_err(DownloadError::BadInput)? {
-           let mut source = io::BufReader::new(
-               fs::OpenOptions::new()
-                   .read(true)
-                   .open(&target_path)
-                   .await
-                   .with_context(|| {
-                       format!("Failed to open source file {target_path:?} to use in the download")
-                   })
-                   .map_err(DownloadError::Other)?,
-           );
+           let mut source = tokio::fs::OpenOptions::new()
+               .read(true)
+               .open(&target_path)
+               .await
+               .with_context(|| {
+                   format!("Failed to open source file {target_path:?} to use in the download")
+               })
+               .map_err(DownloadError::Other)?;
+
            source
                .seek(io::SeekFrom::Start(start_inclusive))
                .await
@@ -363,11 +375,13 @@ impl RemoteStorage for LocalFs {
            Ok(match end_exclusive {
                Some(end_exclusive) => Download {
                    metadata,
-                   download_stream: Box::pin(source.take(end_exclusive - start_inclusive)),
+                   download_stream: Box::pin(ReaderStream::new(
+                       source.take(end_exclusive - start_inclusive),
+                   )),
                },
                None => Download {
                    metadata,
-                   download_stream: Box::pin(source),
+                   download_stream: Box::pin(ReaderStream::new(source)),
                },
            })
        } else {
@@ -467,7 +481,9 @@ fn file_exists(file_path: &Utf8Path) -> anyhow::Result<bool> {
mod fs_tests {
    use super::*;

+   use bytes::Bytes;
    use camino_tempfile::tempdir;
+   use futures_util::Stream;
    use std::{collections::HashMap, io::Write};

    async fn read_and_assert_remote_file_contents(
@@ -477,7 +493,7 @@ mod fs_tests {
        remote_storage_path: &RemotePath,
        expected_metadata: Option<&StorageMetadata>,
    ) -> anyhow::Result<String> {
-       let mut download = storage
+       let download = storage
            .download(remote_storage_path)
            .await
            .map_err(|e| anyhow::anyhow!("Download failed: {e}"))?;
@@ -486,13 +502,9 @@ mod fs_tests {
            "Unexpected metadata returned for the downloaded file"
        );

-       let mut contents = String::new();
-       download
-           .download_stream
-           .read_to_string(&mut contents)
-           .await
-           .context("Failed to read remote file contents into string")?;
-       Ok(contents)
+       let contents = aggregate(download.download_stream).await?;
+
+       String::from_utf8(contents).map_err(anyhow::Error::new)
    }

    #[tokio::test]
@@ -521,25 +533,26 @@ mod fs_tests {
        let storage = create_storage()?;

        let id = RemotePath::new(Utf8Path::new("dummy"))?;
-       let content = std::io::Cursor::new(b"12345");
+       let content = Bytes::from_static(b"12345");
+       let content = move || futures::stream::once(futures::future::ready(Ok(content.clone())));

        // Check that you get an error if the size parameter doesn't match the actual
        // size of the stream.
        storage
-           .upload(Box::new(content.clone()), 0, &id, None)
+           .upload(content(), 0, &id, None)
            .await
            .expect_err("upload with zero size succeeded");
        storage
-           .upload(Box::new(content.clone()), 4, &id, None)
+           .upload(content(), 4, &id, None)
            .await
            .expect_err("upload with too short size succeeded");
        storage
-           .upload(Box::new(content.clone()), 6, &id, None)
+           .upload(content(), 6, &id, None)
            .await
            .expect_err("upload with too large size succeeded");

        // Correct size is 5, this should succeed.
-       storage.upload(Box::new(content), 5, &id, None).await?;
+       storage.upload(content(), 5, &id, None).await?;

        Ok(())
    }
@@ -587,7 +600,7 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, second_part_local) = uploaded_bytes.split_at(3);

-       let mut first_part_download = storage
+       let first_part_download = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
        assert!(
@@ -595,21 +608,13 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-       let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-       io::copy(
-           &mut first_part_download.download_stream,
-           &mut first_part_remote,
-       )
-       .await?;
-       first_part_remote.flush().await?;
-       let first_part_remote = first_part_remote.into_inner().into_inner();
+       let first_part_remote = aggregate(first_part_download.download_stream).await?;
        assert_eq!(
-           first_part_local,
-           first_part_remote.as_slice(),
+           first_part_local, first_part_remote,
            "First part bytes should be returned when requested"
        );

-       let mut second_part_download = storage
+       let second_part_download = storage
            .download_byte_range(
                &upload_target,
                first_part_local.len() as u64,
@@ -621,17 +626,9 @@ mod fs_tests {
            "No metadata should be returned for no metadata upload"
        );

-       let mut second_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-       io::copy(
-           &mut second_part_download.download_stream,
-           &mut second_part_remote,
-       )
-       .await?;
-       second_part_remote.flush().await?;
-       let second_part_remote = second_part_remote.into_inner().into_inner();
+       let second_part_remote = aggregate(second_part_download.download_stream).await?;
        assert_eq!(
-           second_part_local,
-           second_part_remote.as_slice(),
+           second_part_local, second_part_remote,
            "Second part bytes should be returned when requested"
        );

@@ -721,17 +718,10 @@ mod fs_tests {
        let uploaded_bytes = dummy_contents(upload_name).into_bytes();
        let (first_part_local, _) = uploaded_bytes.split_at(3);

-       let mut partial_download_with_metadata = storage
+       let partial_download_with_metadata = storage
            .download_byte_range(&upload_target, 0, Some(first_part_local.len() as u64))
            .await?;
-       let mut first_part_remote = io::BufWriter::new(std::io::Cursor::new(Vec::new()));
-       io::copy(
-           &mut partial_download_with_metadata.download_stream,
-           &mut first_part_remote,
-       )
-       .await?;
-       first_part_remote.flush().await?;
-       let first_part_remote = first_part_remote.into_inner().into_inner();
+       let first_part_remote = aggregate(partial_download_with_metadata.download_stream).await?;
        assert_eq!(
            first_part_local,
            first_part_remote.as_slice(),
@@ -807,16 +797,16 @@ mod fs_tests {
            )
        })?;

-       storage
-           .upload(Box::new(file), size, &relative_path, metadata)
-           .await?;
+       let file = tokio_util::io::ReaderStream::new(file);
+
+       storage.upload(file, size, &relative_path, metadata).await?;
        Ok(relative_path)
    }

    async fn create_file_for_upload(
        path: &Utf8Path,
        contents: &str,
-   ) -> anyhow::Result<(io::BufReader<fs::File>, usize)> {
+   ) -> anyhow::Result<(fs::File, usize)> {
        std::fs::create_dir_all(path.parent().unwrap())?;
        let mut file_for_writing = std::fs::OpenOptions::new()
            .write(true)
@@ -826,7 +816,7 @@ mod fs_tests {
        drop(file_for_writing);
        let file_size = path.metadata()?.len() as usize;
        Ok((
-           io::BufReader::new(fs::OpenOptions::new().read(true).open(&path).await?),
+           fs::OpenOptions::new().read(true).open(&path).await?,
            file_size,
        ))
    }
@@ -840,4 +830,16 @@ mod fs_tests {
        files.sort_by(|a, b| a.0.cmp(&b.0));
        Ok(files)
    }

+   async fn aggregate(
+       stream: impl Stream<Item = std::io::Result<Bytes>>,
+   ) -> anyhow::Result<Vec<u8>> {
+       use futures::stream::StreamExt;
+       let mut out = Vec::new();
+       let mut stream = std::pin::pin!(stream);
+       while let Some(res) = stream.next().await {
+           out.extend_from_slice(&res?[..]);
+       }
+       Ok(out)
+   }
}
|||||||
@@ -4,9 +4,14 @@
|
|||||||
//! allowing multiple api users to independently work with the same S3 bucket, if
|
//! allowing multiple api users to independently work with the same S3 bucket, if
|
||||||
//! their bucket prefixes are both specified and different.
|
//! their bucket prefixes are both specified and different.
|
||||||
|
|
||||||
use std::{borrow::Cow, sync::Arc};
|
use std::{
|
||||||
|
borrow::Cow,
|
||||||
|
pin::Pin,
|
||||||
|
sync::Arc,
|
||||||
|
task::{Context, Poll},
|
||||||
|
};
|
||||||
|
|
||||||
use anyhow::Context;
|
use anyhow::Context as _;
|
||||||
use aws_config::{
|
use aws_config::{
|
||||||
environment::credentials::EnvironmentVariableCredentialsProvider,
|
environment::credentials::EnvironmentVariableCredentialsProvider,
|
||||||
imds::credentials::ImdsCredentialsProvider,
|
imds::credentials::ImdsCredentialsProvider,
|
||||||
@@ -14,23 +19,24 @@ use aws_config::{
|
|||||||
provider_config::ProviderConfig,
|
provider_config::ProviderConfig,
|
||||||
retry::{RetryConfigBuilder, RetryMode},
|
retry::{RetryConfigBuilder, RetryMode},
|
||||||
web_identity_token::WebIdentityTokenCredentialsProvider,
|
web_identity_token::WebIdentityTokenCredentialsProvider,
|
||||||
|
BehaviorVersion,
|
||||||
};
|
};
|
||||||
use aws_credential_types::cache::CredentialsCache;
|
use aws_credential_types::provider::SharedCredentialsProvider;
|
||||||
use aws_sdk_s3::{
|
use aws_sdk_s3::{
|
||||||
config::{AsyncSleep, Config, Region, SharedAsyncSleep},
|
config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
|
||||||
error::SdkError,
|
error::SdkError,
|
||||||
operation::get_object::GetObjectError,
|
operation::get_object::GetObjectError,
|
||||||
primitives::ByteStream,
|
|
||||||
types::{Delete, ObjectIdentifier},
|
types::{Delete, ObjectIdentifier},
|
||||||
Client,
|
Client,
|
||||||
};
|
};
|
||||||
use aws_smithy_async::rt::sleep::TokioSleep;
|
use aws_smithy_async::rt::sleep::TokioSleep;
|
||||||
use aws_smithy_http::body::SdkBody;
|
|
||||||
|
use aws_smithy_types::body::SdkBody;
|
||||||
|
use aws_smithy_types::byte_stream::ByteStream;
|
||||||
|
use bytes::Bytes;
|
||||||
|
use futures::stream::Stream;
|
||||||
use hyper::Body;
|
use hyper::Body;
|
||||||
use scopeguard::ScopeGuard;
|
use scopeguard::ScopeGuard;
|
||||||
use tokio::io::{self, AsyncRead};
|
|
||||||
use tokio_util::io::ReaderStream;
|
|
||||||
use tracing::debug;
|
|
||||||
|
|
||||||
use super::StorageMetadata;
|
use super::StorageMetadata;
|
||||||
use crate::{
|
use crate::{
|
||||||
@@ -61,7 +67,7 @@ struct GetObjectRequest {
|
|||||||
impl S3Bucket {
|
impl S3Bucket {
|
||||||
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
/// Creates the S3 storage, errors if incorrect AWS S3 configuration provided.
|
||||||
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
pub fn new(aws_config: &S3Config) -> anyhow::Result<Self> {
|
||||||
debug!(
|
tracing::debug!(
|
||||||
"Creating s3 remote storage for S3 bucket {}",
|
"Creating s3 remote storage for S3 bucket {}",
|
||||||
aws_config.bucket_name
|
aws_config.bucket_name
|
||||||
);
|
);
|
||||||
@@ -78,7 +84,6 @@ impl S3Bucket {
|
|||||||
// needed to access remote extensions bucket
|
// needed to access remote extensions bucket
|
||||||
.or_else("token", {
|
.or_else("token", {
|
||||||
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
let provider_conf = ProviderConfig::without_region().with_region(region.clone());
|
||||||
|
|
||||||
WebIdentityTokenCredentialsProvider::builder()
|
WebIdentityTokenCredentialsProvider::builder()
|
||||||
.configure(&provider_conf)
|
.configure(&provider_conf)
|
||||||
.build()
|
.build()
|
||||||
@@ -98,18 +103,20 @@ impl S3Bucket {
|
|||||||
.set_max_attempts(Some(1))
|
.set_max_attempts(Some(1))
|
||||||
.set_mode(Some(RetryMode::Adaptive));
|
.set_mode(Some(RetryMode::Adaptive));
|
||||||
|
|
||||||
let mut config_builder = Config::builder()
|
let mut config_builder = Builder::default()
|
||||||
|
.behavior_version(BehaviorVersion::v2023_11_09())
|
||||||
.region(region)
|
.region(region)
|
||||||
.credentials_cache(CredentialsCache::lazy())
|
.identity_cache(IdentityCache::lazy().build())
|
||||||
.credentials_provider(credentials_provider)
|
.credentials_provider(SharedCredentialsProvider::new(credentials_provider))
|
||||||
.sleep_impl(SharedAsyncSleep::from(sleep_impl))
|
.retry_config(retry_config.build())
|
||||||
.retry_config(retry_config.build());
|
.sleep_impl(SharedAsyncSleep::from(sleep_impl));
|
||||||
|
|
||||||
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
if let Some(custom_endpoint) = aws_config.endpoint.clone() {
|
||||||
config_builder = config_builder
|
config_builder = config_builder
|
||||||
.endpoint_url(custom_endpoint)
|
.endpoint_url(custom_endpoint)
|
||||||
.force_path_style(true);
|
.force_path_style(true);
|
||||||
}
|
}
|
||||||
|
|
||||||
let client = Client::from_conf(config_builder.build());
|
let client = Client::from_conf(config_builder.build());
|
||||||
|
|
||||||
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
|
||||||
@@ -222,12 +229,15 @@ impl S3Bucket {
         match get_object {
             Ok(object_output) => {
                 let metadata = object_output.metadata().cloned().map(StorageMetadata);
 
+                let body = object_output.body;
+                let body = ByteStreamAsStream::from(body);
+                let body = PermitCarrying::new(permit, body);
+                let body = TimedDownload::new(started_at, body);
 
                 Ok(Download {
                     metadata,
-                    download_stream: Box::pin(io::BufReader::new(TimedDownload::new(
-                        started_at,
-                        RatelimitedAsyncRead::new(permit, object_output.body.into_async_read()),
-                    ))),
+                    download_stream: Box::pin(body),
                 })
             }
             Err(SdkError::ServiceError(e)) if matches!(e.err(), GetObjectError::NoSuchKey(_)) => {
@@ -240,29 +250,55 @@ impl S3Bucket {
             }
         }
     }
 
+pin_project_lite::pin_project! {
+    struct ByteStreamAsStream {
+        #[pin]
+        inner: aws_smithy_types::byte_stream::ByteStream
+    }
+}
+
+impl From<aws_smithy_types::byte_stream::ByteStream> for ByteStreamAsStream {
+    fn from(inner: aws_smithy_types::byte_stream::ByteStream) -> Self {
+        ByteStreamAsStream { inner }
+    }
+}
+
+impl Stream for ByteStreamAsStream {
+    type Item = std::io::Result<Bytes>;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        // this does the std::io::ErrorKind::Other conversion
+        self.project().inner.poll_next(cx).map_err(|x| x.into())
+    }
+
+    // cannot implement size_hint because inner.size_hint is remaining size in bytes, which makes
+    // sense and Stream::size_hint does not really
+}
+
 pin_project_lite::pin_project! {
     /// An `AsyncRead` adapter which carries a permit for the lifetime of the value.
-    struct RatelimitedAsyncRead<S> {
+    struct PermitCarrying<S> {
         permit: tokio::sync::OwnedSemaphorePermit,
         #[pin]
         inner: S,
     }
 }
 
-impl<S: AsyncRead> RatelimitedAsyncRead<S> {
+impl<S> PermitCarrying<S> {
     fn new(permit: tokio::sync::OwnedSemaphorePermit, inner: S) -> Self {
-        RatelimitedAsyncRead { permit, inner }
+        Self { permit, inner }
     }
 }
 
-impl<S: AsyncRead> AsyncRead for RatelimitedAsyncRead<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
-        let this = self.project();
-        this.inner.poll_read(cx, buf)
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for PermitCarrying<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        self.project().inner.poll_next(cx)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
     }
 }
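The `PermitCarrying` wrapper above exists only to tie a semaphore permit's lifetime to the download stream; it forwards every `poll_next` untouched and the permit is released when the stream is dropped. A minimal, self-contained sketch of the same pattern, under the assumption that `pin-project-lite`, `bytes`, `futures` and `tokio` are available; the names and the 10-permit semaphore are illustrative, not taken from the real crate:

```rust
use std::pin::Pin;
use std::sync::Arc;
use std::task::{Context, Poll};

use bytes::Bytes;
use futures::stream::{Stream, StreamExt};

pin_project_lite::pin_project! {
    /// Holds a semaphore permit for as long as the wrapped stream is alive.
    struct Limited<S> {
        permit: tokio::sync::OwnedSemaphorePermit,
        #[pin]
        inner: S,
    }
}

impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for Limited<S> {
    type Item = S::Item;

    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
        // Pure delegation: the wrapper only exists to keep `permit` alive.
        self.project().inner.poll_next(cx)
    }
}

async fn demo() -> std::io::Result<()> {
    let semaphore = Arc::new(tokio::sync::Semaphore::new(10));
    let permit = semaphore.acquire_owned().await.expect("semaphore not closed");

    let item: std::io::Result<Bytes> = Ok(Bytes::from_static(b"hello"));
    let inner = futures::stream::once(futures::future::ready(item));

    let mut limited = Limited { permit, inner };
    while let Some(chunk) = limited.next().await {
        println!("{} bytes", chunk?.len());
    }
    Ok(()) // the permit is released here, when `limited` is dropped
}
```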
@@ -282,7 +318,7 @@ pin_project_lite::pin_project! {
     }
 }
 
-impl<S: AsyncRead> TimedDownload<S> {
+impl<S> TimedDownload<S> {
     fn new(started_at: std::time::Instant, inner: S) -> Self {
         TimedDownload {
             started_at,
@@ -292,25 +328,26 @@ impl<S: AsyncRead> TimedDownload<S> {
     }
 }
 
-impl<S: AsyncRead> AsyncRead for TimedDownload<S> {
-    fn poll_read(
-        self: std::pin::Pin<&mut Self>,
-        cx: &mut std::task::Context<'_>,
-        buf: &mut io::ReadBuf<'_>,
-    ) -> std::task::Poll<std::io::Result<()>> {
+impl<S: Stream<Item = std::io::Result<Bytes>>> Stream for TimedDownload<S> {
+    type Item = <S as Stream>::Item;
+
+    fn poll_next(self: Pin<&mut Self>, cx: &mut Context<'_>) -> Poll<Option<Self::Item>> {
+        use std::task::ready;
+
         let this = self.project();
-        let before = buf.filled().len();
-        let read = std::task::ready!(this.inner.poll_read(cx, buf));
 
-        let read_eof = buf.filled().len() == before;
-
-        match read {
-            Ok(()) if read_eof => *this.outcome = AttemptOutcome::Ok,
-            Ok(()) => { /* still in progress */ }
-            Err(_) => *this.outcome = AttemptOutcome::Err,
+        let res = ready!(this.inner.poll_next(cx));
+        match &res {
+            Some(Ok(_)) => {}
+            Some(Err(_)) => *this.outcome = metrics::AttemptOutcome::Err,
+            None => *this.outcome = metrics::AttemptOutcome::Ok,
         }
 
-        std::task::Poll::Ready(read)
+        Poll::Ready(res)
+    }
+
+    fn size_hint(&self) -> (usize, Option<usize>) {
+        self.inner.size_hint()
     }
 }
@@ -371,11 +408,11 @@ impl RemoteStorage for S3Bucket {
 
         let response = response?;
 
-        let keys = response.contents().unwrap_or_default();
+        let keys = response.contents();
         let empty = Vec::new();
         let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);
 
-        tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+        tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
 
         for object in keys {
             let object_path = object.key().expect("response does not contain a key");
@@ -400,7 +437,7 @@ impl RemoteStorage for S3Bucket {
 
     async fn upload(
         &self,
-        from: impl io::AsyncRead + Unpin + Send + Sync + 'static,
+        from: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         from_size_bytes: usize,
         to: &RemotePath,
         metadata: Option<StorageMetadata>,
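With `upload` now taking `impl Stream<Item = std::io::Result<Bytes>>` instead of an `AsyncRead`, callers that still hold a reader need a small adapter. A hedged sketch using `tokio_util::io::ReaderStream`, which yields exactly that item type; the file path and the commented-out call site are made up for illustration:

```rust
use bytes::Bytes;
use futures::stream::Stream;

/// Adapt any AsyncRead (e.g. a file) into the Stream<Item = io::Result<Bytes>>
/// shape that the new `upload` signature expects.
fn reader_to_stream<R>(
    reader: R,
) -> impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static
where
    R: tokio::io::AsyncRead + Send + Sync + 'static,
{
    tokio_util::io::ReaderStream::new(reader)
}

async fn upload_file_example() -> anyhow::Result<()> {
    let file = tokio::fs::File::open("some/layer/file").await?;
    let len = file.metadata().await?.len() as usize;
    let stream = reader_to_stream(file);
    // storage.upload(stream, len, &remote_path, None).await?;  // hypothetical call site
    let _ = (stream, len);
    Ok(())
}
```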
@@ -410,8 +447,8 @@ impl RemoteStorage for S3Bucket {
 
         let started_at = start_measuring_requests(kind);
 
-        let body = Body::wrap_stream(ReaderStream::new(from));
-        let bytes_stream = ByteStream::new(SdkBody::from(body));
+        let body = Body::wrap_stream(from);
+        let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));
 
         let res = self
             .client
@@ -474,7 +511,7 @@ impl RemoteStorage for S3Bucket {
         for path in paths {
             let obj_id = ObjectIdentifier::builder()
                 .set_key(Some(self.relative_path_to_s3_object(path)))
-                .build();
+                .build()?;
             delete_objects.push(obj_id);
         }
 
@@ -485,7 +522,11 @@ impl RemoteStorage for S3Bucket {
             .client
             .delete_objects()
             .bucket(self.bucket_name.clone())
-            .delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
+            .delete(
+                Delete::builder()
+                    .set_objects(Some(chunk.to_vec()))
+                    .build()?,
+            )
             .send()
             .await;
 
@@ -1,6 +1,8 @@
 //! This module provides a wrapper around a real RemoteStorage implementation that
 //! causes the first N attempts at each upload or download operatio to fail. For
 //! testing purposes.
+use bytes::Bytes;
+use futures::stream::Stream;
 use std::collections::hash_map::Entry;
 use std::collections::HashMap;
 use std::sync::Mutex;
@@ -108,7 +110,7 @@ impl RemoteStorage for UnreliableWrapper {
 
     async fn upload(
         &self,
-        data: impl tokio::io::AsyncRead + Unpin + Send + Sync + 'static,
+        data: impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
         // S3 PUT request requires the content length to be specified,
        // otherwise it starts to fail with the concurrent connection count increasing.
         data_size_bytes: usize,
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 
 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
     AzureConfig, Download, GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind,
@@ -180,23 +182,14 @@ async fn azure_delete_objects_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Resu
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
 
     ctx.client.delete_objects(&[path1, path2]).await?;
 
@@ -219,53 +212,56 @@ async fn azure_upload_download_works(ctx: &mut MaybeEnabledAzure) -> anyhow::Res
     let path = RemotePath::new(Utf8Path::new(format!("{}/file", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let data = "remote blob data here".as_bytes();
-    let data_len = data.len() as u64;
+    let orig = bytes::Bytes::from_static("remote blob data here".as_bytes());
 
-    ctx.client
-        .upload(std::io::Cursor::new(data), data.len(), &path, None)
-        .await?;
+    let (data, len) = wrap_stream(orig.clone());
 
-    async fn download_and_compare(mut dl: Download) -> anyhow::Result<Vec<u8>> {
+    ctx.client.upload(data, len, &path, None).await?;
+
+    async fn download_and_compare(dl: Download) -> anyhow::Result<Vec<u8>> {
         let mut buf = Vec::new();
-        tokio::io::copy(&mut dl.download_stream, &mut buf).await?;
+        tokio::io::copy_buf(
+            &mut tokio_util::io::StreamReader::new(dl.download_stream),
+            &mut buf,
+        )
+        .await?;
         Ok(buf)
     }
     // Normal download request
     let dl = ctx.client.download(&path).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);
 
     // Full range (end specified)
     let dl = ctx
         .client
-        .download_byte_range(&path, 0, Some(data_len))
+        .download_byte_range(&path, 0, Some(len as u64))
         .await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);
 
     // partial range (end specified)
     let dl = ctx.client.download_byte_range(&path, 4, Some(10)).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..10]);
+    assert_eq!(&buf, &orig[4..10]);
 
     // partial range (end beyond real end)
     let dl = ctx
         .client
-        .download_byte_range(&path, 8, Some(data_len * 100))
+        .download_byte_range(&path, 8, Some(len as u64 * 100))
         .await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[8..]);
+    assert_eq!(&buf, &orig[8..]);
 
     // Partial range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 4, None).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data[4..]);
+    assert_eq!(&buf, &orig[4..]);
 
     // Full range (end unspecified)
     let dl = ctx.client.download_byte_range(&path, 0, None).await?;
     let buf = download_and_compare(dl).await?;
-    assert_eq!(buf, data);
+    assert_eq!(&buf, &orig);
 
     debug!("Cleanup: deleting file at path {path:?}");
     ctx.client
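The reworked `download_and_compare` goes the opposite direction from the upload path: `tokio_util::io::StreamReader` turns the `Stream<Item = io::Result<Bytes>>` download body back into a buffered reader so `tokio::io::copy_buf` can collect it. A standalone sketch of that conversion, with a fabricated two-chunk stream standing in for a real `Download`:

```rust
use bytes::Bytes;
use futures::stream::Stream;

/// Collect a byte stream (like `Download::download_stream`) into a Vec<u8>.
async fn collect_stream<S>(stream: S) -> std::io::Result<Vec<u8>>
where
    S: Stream<Item = std::io::Result<Bytes>> + Unpin,
{
    // StreamReader turns the stream back into an AsyncBufRead, so copy_buf
    // can drain it without an intermediate per-chunk copy.
    let mut reader = tokio_util::io::StreamReader::new(stream);
    let mut buf = Vec::new();
    tokio::io::copy_buf(&mut reader, &mut buf).await?;
    Ok(buf)
}

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let chunks: Vec<std::io::Result<Bytes>> = vec![
        Ok(Bytes::from_static(b"remote blob ")),
        Ok(Bytes::from_static(b"data here")),
    ];
    let stream = futures::stream::iter(chunks);
    let buf = collect_stream(stream).await?;
    assert_eq!(buf.as_slice(), b"remote blob data here".as_slice());
    Ok(())
}
```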
@@ -281,6 +277,7 @@ fn ensure_logging_ready() {
         utils::logging::init(
             utils::logging::LogFormat::Test,
             utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
         )
         .expect("logging init failed");
     });
@@ -503,11 +500,8 @@ async fn upload_azure_data(
             let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
@@ -588,11 +582,8 @@ async fn upload_simple_azure_data(
             .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, len) = upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>(blob_path)
         });
@@ -621,3 +612,32 @@ async fn upload_simple_azure_data(
     ControlFlow::Continue(uploaded_blobs)
     }
 }
+
+// FIXME: copypasted from test_real_s3, can't remember how to share a module which is not compiled
+// to binary
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
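For reference, the `wrap_stream` helper added above produces a one-shot stream: the whole payload arrives as a single `Bytes` chunk, and the length is returned alongside it because the S3 PUT path needs the content length up front. A small illustrative check of that behaviour (not part of the test suite):

```rust
use bytes::Bytes;
use futures::StreamExt;

#[tokio::main]
async fn main() {
    let content = Bytes::from_static(b"remote blob data1");
    let len = content.len();
    let item: std::io::Result<Bytes> = Ok(content);
    let mut stream = futures::stream::once(futures::future::ready(item));

    // Exactly one chunk comes out, carrying the full payload.
    let first = stream.next().await.unwrap().unwrap();
    assert_eq!(first.len(), len);
    assert!(stream.next().await.is_none());
}
```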
@@ -7,7 +7,9 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 
 use anyhow::Context;
+use bytes::Bytes;
 use camino::Utf8Path;
+use futures::stream::Stream;
 use once_cell::sync::OnceCell;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
@@ -176,23 +178,14 @@ async fn s3_delete_objects_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()>
     let path3 = RemotePath::new(Utf8Path::new(format!("{}/path3", ctx.base_prefix).as_str()))
         .with_context(|| "RemotePath conversion")?;
 
-    let data1 = "remote blob data1".as_bytes();
-    let data1_len = data1.len();
-    let data2 = "remote blob data2".as_bytes();
-    let data2_len = data2.len();
-    let data3 = "remote blob data3".as_bytes();
-    let data3_len = data3.len();
-    ctx.client
-        .upload(std::io::Cursor::new(data1), data1_len, &path1, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data1".as_bytes().into());
+    ctx.client.upload(data, len, &path1, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data2), data2_len, &path2, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data2".as_bytes().into());
+    ctx.client.upload(data, len, &path2, None).await?;
 
-    ctx.client
-        .upload(std::io::Cursor::new(data3), data3_len, &path3, None)
-        .await?;
+    let (data, len) = upload_stream("remote blob data3".as_bytes().into());
+    ctx.client.upload(data, len, &path3, None).await?;
 
     ctx.client.delete_objects(&[path1, path2]).await?;
 
@@ -210,6 +203,7 @@ fn ensure_logging_ready() {
         utils::logging::init(
             utils::logging::LogFormat::Test,
             utils::logging::TracingErrorLayerEnablement::Disabled,
+            utils::logging::Output::Stdout,
         )
         .expect("logging init failed");
     });
@@ -431,11 +425,9 @@ async fn upload_s3_data(
             let blob_path = blob_prefix.join(Utf8Path::new(&format!("blob_{i}")));
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>((blob_prefix, blob_path))
         });
@@ -516,11 +508,9 @@ async fn upload_simple_s3_data(
             .with_context(|| format!("{blob_path:?} to RemotePath conversion"))?;
             debug!("Creating remote item {i} at path {blob_path:?}");
 
-            let data = format!("remote blob data {i}").into_bytes();
-            let data_len = data.len();
-            task_client
-                .upload(std::io::Cursor::new(data), data_len, &blob_path, None)
-                .await?;
+            let (data, data_len) =
+                upload_stream(format!("remote blob data {i}").into_bytes().into());
+            task_client.upload(data, data_len, &blob_path, None).await?;
 
             Ok::<_, anyhow::Error>(blob_path)
         });
@@ -549,3 +539,30 @@ async fn upload_simple_s3_data(
     ControlFlow::Continue(uploaded_blobs)
     }
 }
+
+fn upload_stream(
+    content: std::borrow::Cow<'static, [u8]>,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    use std::borrow::Cow;
+
+    let content = match content {
+        Cow::Borrowed(x) => Bytes::from_static(x),
+        Cow::Owned(vec) => Bytes::from(vec),
+    };
+    wrap_stream(content)
+}
+
+fn wrap_stream(
+    content: bytes::Bytes,
+) -> (
+    impl Stream<Item = std::io::Result<Bytes>> + Send + Sync + 'static,
+    usize,
+) {
+    let len = content.len();
+    let content = futures::future::ready(Ok(content));
+
+    (futures::stream::once(content), len)
+}
libs/utils/scripts/restore_from_wal_initdb.sh (new executable file, 21 lines)
@@ -0,0 +1,21 @@
+#!/bin/bash
+
+# like restore_from_wal.sh, but takes existing initdb.tar.zst
+
+set -euxo pipefail
+
+PG_BIN=$1
+WAL_PATH=$2
+DATA_DIR=$3
+PORT=$4
+echo "port=$PORT" >> "$DATA_DIR"/postgresql.conf
+echo "shared_preload_libraries='\$libdir/neon_rmgr.so'" >> "$DATA_DIR"/postgresql.conf
+REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
+declare -i WAL_SIZE=$REDO_POS+114
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l "$DATA_DIR/logfile.log" stop -m immediate
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
+for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
+dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+rm -f 000000010000000000000001
@@ -1,16 +1,14 @@
|
|||||||
use std::sync::Arc;
|
use tokio_util::task::{task_tracker::TaskTrackerToken, TaskTracker};
|
||||||
|
|
||||||
use tokio::sync::{mpsc, Mutex};
|
|
||||||
|
|
||||||
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
|
/// While a reference is kept around, the associated [`Barrier::wait`] will wait.
|
||||||
///
|
///
|
||||||
/// Can be cloned, moved and kept around in futures as "guard objects".
|
/// Can be cloned, moved and kept around in futures as "guard objects".
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Completion(mpsc::Sender<()>);
|
pub struct Completion(TaskTrackerToken);
|
||||||
|
|
||||||
/// Barrier will wait until all clones of [`Completion`] have been dropped.
|
/// Barrier will wait until all clones of [`Completion`] have been dropped.
|
||||||
#[derive(Clone)]
|
#[derive(Clone)]
|
||||||
pub struct Barrier(Arc<Mutex<mpsc::Receiver<()>>>);
|
pub struct Barrier(TaskTracker);
|
||||||
|
|
||||||
impl Default for Barrier {
|
impl Default for Barrier {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
@@ -21,7 +19,7 @@ impl Default for Barrier {
|
|||||||
|
|
||||||
impl Barrier {
|
impl Barrier {
|
||||||
pub async fn wait(self) {
|
pub async fn wait(self) {
|
||||||
self.0.lock().await.recv().await;
|
self.0.wait().await;
|
||||||
}
|
}
|
||||||
|
|
||||||
pub async fn maybe_wait(barrier: Option<Barrier>) {
|
pub async fn maybe_wait(barrier: Option<Barrier>) {
|
||||||
@@ -33,8 +31,7 @@ impl Barrier {
|
|||||||
|
|
||||||
impl PartialEq for Barrier {
|
impl PartialEq for Barrier {
|
||||||
fn eq(&self, other: &Self) -> bool {
|
fn eq(&self, other: &Self) -> bool {
|
||||||
// we don't use dyn so this is good
|
TaskTracker::ptr_eq(&self.0, &other.0)
|
||||||
Arc::ptr_eq(&self.0, &other.0)
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -42,8 +39,10 @@ impl Eq for Barrier {}
|
|||||||
|
|
||||||
/// Create new Guard and Barrier pair.
|
/// Create new Guard and Barrier pair.
|
||||||
pub fn channel() -> (Completion, Barrier) {
|
pub fn channel() -> (Completion, Barrier) {
|
||||||
let (tx, rx) = mpsc::channel::<()>(1);
|
let tracker = TaskTracker::new();
|
||||||
let rx = Mutex::new(rx);
|
// otherwise wait never exits
|
||||||
let rx = Arc::new(rx);
|
tracker.close();
|
||||||
(Completion(tx), Barrier(rx))
|
|
||||||
|
let token = tracker.token();
|
||||||
|
(Completion(token), Barrier(tracker))
|
||||||
}
|
}
|
||||||
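The rewritten `completion` module leans entirely on `tokio_util::task::TaskTracker`: a `Completion` is a `TaskTrackerToken`, a `Barrier` is the tracker itself, and `close()` is required up front or `wait()` would never return. A minimal sketch of that behaviour using the tokio-util types directly (names and the spawned waiter are illustrative):

```rust
use tokio_util::task::TaskTracker;

#[tokio::main]
async fn main() {
    let tracker = TaskTracker::new();
    // Without close(), wait() never returns even after every token is gone.
    tracker.close();

    let completion = tracker.token();
    let barrier = tracker.clone();

    let waiter = tokio::spawn(async move { barrier.wait().await });

    // The waiter cannot complete while a token is still alive.
    assert!(!waiter.is_finished());

    drop(completion); // dropping the last token releases all waiters
    waiter.await.unwrap();
}
```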
@@ -152,3 +152,16 @@ impl Debug for Generation {
         }
     }
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn generation_gt() {
+        // Important that a None generation compares less than a valid one, during upgrades from
+        // pre-generation systems.
+        assert!(Generation::none() < Generation::new(0));
+        assert!(Generation::none() < Generation::new(1));
+    }
+}
@@ -66,9 +66,17 @@ pub enum TracingErrorLayerEnablement {
     EnableWithRustLogFilter,
 }
 
+/// Where the logging should output to.
+#[derive(Clone, Copy)]
+pub enum Output {
+    Stdout,
+    Stderr,
+}
+
 pub fn init(
     log_format: LogFormat,
     tracing_error_layer_enablement: TracingErrorLayerEnablement,
+    output: Output,
 ) -> anyhow::Result<()> {
     // We fall back to printing all spans at info-level or above if
     // the RUST_LOG environment variable is not set.
@@ -85,7 +93,12 @@ pub fn init(
     let log_layer = tracing_subscriber::fmt::layer()
         .with_target(false)
         .with_ansi(false)
-        .with_writer(std::io::stdout);
+        .with_writer(move || -> Box<dyn std::io::Write> {
+            match output {
+                Output::Stdout => Box::new(std::io::stdout()),
+                Output::Stderr => Box::new(std::io::stderr()),
+            }
+        });
     let log_layer = match log_format {
         LogFormat::Json => log_layer.json().boxed(),
         LogFormat::Plain => log_layer.boxed(),
@@ -1,10 +1,10 @@
 //!
 //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
 //! similar to a lock, but it allows readers to "hold on" to an old value of RCU
-//! without blocking writers, and allows writing a new values without blocking
-//! readers. When you update the new value, the new value is immediately visible
+//! without blocking writers, and allows writing a new value without blocking
+//! readers. When you update the value, the new value is immediately visible
 //! to new readers, but the update waits until all existing readers have
-//! finishe, so that no one sees the old value anymore.
+//! finished, so that on return, no one sees the old value anymore.
 //!
 //! This implementation isn't wait-free; it uses an RwLock that is held for a
 //! short duration when the value is read or updated.
@@ -26,6 +26,7 @@
 //! Increment the value by one, and wait for old readers to finish:
 //!
 //! ```
+//! # async fn dox() {
 //! # let rcu = utils::simple_rcu::Rcu::new(1);
 //! let write_guard = rcu.lock_for_write();
 //!
@@ -36,15 +37,17 @@
 //!
 //! // Concurrent reads and writes are now possible again. Wait for all the readers
 //! // that still observe the old value to finish.
-//! waitlist.wait();
+//! waitlist.wait().await;
+//! # }
 //! ```
 //!
 #![warn(missing_docs)]
 
 use std::ops::Deref;
-use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
 use std::sync::{Arc, Weak};
-use std::sync::{Mutex, RwLock, RwLockWriteGuard};
+use std::sync::{RwLock, RwLockWriteGuard};
+
+use tokio::sync::watch;
 
 ///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -68,22 +71,21 @@ struct RcuCell<V> {
     value: V,
 
     /// A dummy channel. We never send anything to this channel. The point is
-    /// that when the RcuCell is dropped, any cloned Senders will be notified
+    /// that when the RcuCell is dropped, any subscribed Receivers will be notified
     /// that the channel is closed. Updaters can use this to wait out until the
     /// RcuCell has been dropped, i.e. until the old value is no longer in use.
     ///
-    /// We never do anything with the receiver, we just need to hold onto it so
-    /// that the Senders will be notified when it's dropped. But because it's
-    /// not Sync, we need a Mutex on it.
-    watch: (SyncSender<()>, Mutex<Receiver<()>>),
+    /// We never send anything to this, we just need to hold onto it so that the
+    /// Receivers will be notified when it's dropped.
+    watch: watch::Sender<()>,
 }
 
 impl<V> RcuCell<V> {
     fn new(value: V) -> Self {
-        let (watch_sender, watch_receiver) = sync_channel(0);
+        let (watch_sender, _) = watch::channel(());
         RcuCell {
             value,
-            watch: (watch_sender, Mutex::new(watch_receiver)),
+            watch: watch_sender,
         }
     }
 }
@@ -141,10 +143,10 @@ impl<V> Deref for RcuReadGuard<V> {
 ///
 /// Write guard returned by `write`
 ///
-/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
-/// it should only be held for a short duration!
+/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
+/// held for a short duration!
 ///
-/// Calling `store` consumes the guard, making new reads and new writes possible
+/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
 /// again.
 ///
 pub struct RcuWriteGuard<'a, V> {
@@ -179,7 +181,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
         // the watches for any that do.
         self.inner.old_cells.retain(|weak| {
             if let Some(cell) = weak.upgrade() {
-                watches.push(cell.watch.0.clone());
+                watches.push(cell.watch.subscribe());
                 true
             } else {
                 false
@@ -193,20 +195,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
 ///
 /// List of readers who can still see old values.
 ///
-pub struct RcuWaitList(Vec<SyncSender<()>>);
+pub struct RcuWaitList(Vec<watch::Receiver<()>>);
 
 impl RcuWaitList {
     ///
     /// Wait for old readers to finish.
     ///
-    pub fn wait(mut self) {
+    pub async fn wait(mut self) {
         // after all the old_cells are no longer in use, we're done
         for w in self.0.iter_mut() {
             // This will block until the Receiver is closed. That happens when
             // the RcuCell is dropped.
             #[allow(clippy::single_match)]
-            match w.send(()) {
-                Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
+            match w.changed().await {
+                Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
                 Err(_) => {
                     // closed, which means that the cell has been dropped, and
                     // its value is no longer in use
@@ -220,11 +222,10 @@ impl RcuWaitList {
 mod tests {
     use super::*;
     use std::sync::{Arc, Mutex};
-    use std::thread::{sleep, spawn};
     use std::time::Duration;
 
-    #[test]
-    fn two_writers() {
+    #[tokio::test]
+    async fn two_writers() {
         let rcu = Rcu::new(1);
 
         let read1 = rcu.read();
@@ -248,33 +249,35 @@ mod tests {
         assert_eq!(*read1, 1);
 
         let log = Arc::new(Mutex::new(Vec::new()));
-        // Wait for the old readers to finish in separate threads.
+        // Wait for the old readers to finish in separate tasks.
         let log_clone = Arc::clone(&log);
-        let thread2 = spawn(move || {
-            wait2.wait();
+        let task2 = tokio::spawn(async move {
+            wait2.wait().await;
             log_clone.lock().unwrap().push("wait2 done");
         });
         let log_clone = Arc::clone(&log);
-        let thread3 = spawn(move || {
-            wait3.wait();
+        let task3 = tokio::spawn(async move {
+            wait3.wait().await;
             log_clone.lock().unwrap().push("wait3 done");
         });
 
         // without this sleep the test can pass on accident if the writer is slow
-        sleep(Duration::from_millis(500));
+        tokio::time::sleep(Duration::from_millis(100)).await;
 
         // Release first reader. This allows first write to finish, but calling
-        // wait() on the second one would still block.
+        // wait() on the 'task3' would still block.
         log.lock().unwrap().push("dropping read1");
         drop(read1);
-        thread2.join().unwrap();
+        task2.await.unwrap();
 
-        sleep(Duration::from_millis(500));
+        assert!(!task3.is_finished());
+
+        tokio::time::sleep(Duration::from_millis(100)).await;
 
         // Release second reader, and finish second writer.
         log.lock().unwrap().push("dropping read2");
         drop(read2);
-        thread3.join().unwrap();
+        task3.await.unwrap();
 
         assert_eq!(
             log.lock().unwrap().as_slice(),
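The port from `std::sync::mpsc` to `tokio::sync::watch` keeps the same trick: nothing is ever sent on the channel, and the only signal is the sender being dropped, which makes `changed()` return `Err` on every subscribed receiver. A minimal sketch of just that mechanism:

```rust
use tokio::sync::watch;

#[tokio::main]
async fn main() {
    // The sender plays the role of the RcuCell's `watch` field: held, never used.
    let (sender, _) = watch::channel(());
    let mut receiver = sender.subscribe();

    let waiter = tokio::spawn(async move {
        // Err(..) means the Sender was dropped, i.e. the "cell" is gone.
        assert!(receiver.changed().await.is_err());
    });

    drop(sender); // dropping the sender is the only notification ever delivered
    waiter.await.unwrap();
}
```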
@@ -30,18 +30,32 @@ async fn warn_if_stuck<Fut: std::future::Future>(
 
     let mut fut = std::pin::pin!(fut);
 
-    loop {
+    let mut warned = false;
+    let ret = loop {
         match tokio::time::timeout(warn_period, &mut fut).await {
-            Ok(ret) => return ret,
+            Ok(ret) => break ret,
             Err(_) => {
                 tracing::warn!(
                     gate = name,
                     elapsed_ms = started.elapsed().as_millis(),
                     "still waiting, taking longer than expected..."
                 );
+                warned = true;
             }
         }
+    };
+
+    // If we emitted a warning for slowness, also emit a message when we complete, so that
+    // someone debugging a shutdown can know for sure whether we have moved past this operation.
+    if warned {
+        tracing::info!(
+            gate = name,
+            elapsed_ms = started.elapsed().as_millis(),
+            "completed, after taking longer than expected"
+        )
     }
 
+    ret
 }
 
 #[derive(Debug)]
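The reworked `warn_if_stuck` polls the future through `tokio::time::timeout` in a loop, warning on each expiry, and logs once more on completion if it warned at all. A self-contained restatement of the pattern as a runnable function; the function and field names here are illustrative rather than the gate module's own:

```rust
use std::time::Duration;

async fn warn_if_slow<F: std::future::Future>(name: &str, warn_period: Duration, fut: F) -> F::Output {
    let started = std::time::Instant::now();
    let mut fut = std::pin::pin!(fut);
    let mut warned = false;
    let ret = loop {
        // Each timeout expiry produces one warning; a completed poll breaks with the value.
        match tokio::time::timeout(warn_period, &mut fut).await {
            Ok(ret) => break ret,
            Err(_) => {
                tracing::warn!(task = name, elapsed_ms = started.elapsed().as_millis(), "still waiting");
                warned = true;
            }
        }
    };
    if warned {
        // A closing message so whoever is debugging knows the operation did finish.
        tracing::info!(task = name, elapsed_ms = started.elapsed().as_millis(), "completed");
    }
    ret
}

#[tokio::main]
async fn main() {
    warn_if_slow("demo", Duration::from_millis(100), tokio::time::sleep(Duration::from_millis(350))).await;
}
```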
@@ -51,6 +51,7 @@ regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
+serde_path_to_error.workspace = true
 serde_with.workspace = true
 signal-hook.workspace = true
 smallvec = { workspace = true, features = ["write"] }
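`serde_path_to_error` is presumably pulled in for better deserialization diagnostics: it wraps a `Deserializer` and reports the path to the field that failed, not just the error. A hedged sketch of the crate's typical use with serde_json; the struct and the JSON document are made up for illustration:

```rust
use serde::Deserialize;

#[derive(Deserialize, Debug)]
struct IndexLike {
    disk_consistent_lsn: String,
    layer_count: u64,
}

fn main() {
    let bad = r#"{"disk_consistent_lsn": "0/16960E8", "layer_count": "not a number"}"#;
    let mut de = serde_json::Deserializer::from_str(bad);
    let result: Result<IndexLike, _> = serde_path_to_error::deserialize(&mut de);
    match result {
        Ok(v) => println!("{v:?}"),
        // The error's Display output is prefixed with the offending path,
        // e.g. `layer_count: invalid type: string "not a number", expected u64`.
        Err(err) => println!("{err}"),
    }
}
```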
@@ -3,6 +3,7 @@ use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
+use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
@@ -211,7 +212,7 @@ fn bench_sequential(c: &mut Criterion) {
         let i32 = (i as u32) % 100;
         let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
         let layer = PersistentLayerDesc::new_img(
-            TenantId::generate(),
+            TenantShardId::unsharded(TenantId::generate()),
             TimelineId::generate(),
             zero.add(10 * i32)..zero.add(10 * i32 + 1),
             Lsn(i),
@@ -18,3 +18,5 @@ tokio.workspace = true
 utils.workspace = true
 svg_fmt.workspace = true
 workspace_hack.workspace = true
+serde.workspace = true
+serde_json.workspace = true
pageserver/ctl/src/index_part.rs (new file, 38 lines)
@@ -0,0 +1,38 @@
+use std::collections::HashMap;
+
+use anyhow::Context;
+use camino::Utf8PathBuf;
+use pageserver::tenant::remote_timeline_client::index::IndexLayerMetadata;
+use pageserver::tenant::storage_layer::LayerFileName;
+use pageserver::tenant::{metadata::TimelineMetadata, IndexPart};
+use utils::lsn::Lsn;
+
+#[derive(clap::Subcommand)]
+pub(crate) enum IndexPartCmd {
+    Dump { path: Utf8PathBuf },
+}
+
+pub(crate) async fn main(cmd: &IndexPartCmd) -> anyhow::Result<()> {
+    match cmd {
+        IndexPartCmd::Dump { path } => {
+            let bytes = tokio::fs::read(path).await.context("read file")?;
+            let des: IndexPart = IndexPart::from_s3_bytes(&bytes).context("deserialize")?;
+            #[derive(serde::Serialize)]
+            struct Output<'a> {
+                layer_metadata: &'a HashMap<LayerFileName, IndexLayerMetadata>,
+                disk_consistent_lsn: Lsn,
+                timeline_metadata: &'a TimelineMetadata,
+            }
+
+            let output = Output {
+                layer_metadata: &des.layer_metadata,
+                disk_consistent_lsn: des.get_disk_consistent_lsn(),
+                timeline_metadata: &des.metadata,
+            };
+
+            let output = serde_json::to_string_pretty(&output).context("serialize output")?;
+            println!("{output}");
+            Ok(())
+        }
+    }
+}
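The `Dump` command's `Output<'a>` struct shows a handy pattern: deserialize the full `IndexPart`, then serialize only the interesting fields through a small struct that borrows from it. A standalone sketch of the same pattern with stand-in types (these are not the real pageserver types; assumes serde, serde_json and anyhow):

```rust
use std::collections::HashMap;

use serde::Serialize;

#[derive(Serialize)]
struct Full {
    layers: HashMap<String, u64>,
    lsn: u64,
    internal_only: Vec<u8>,
}

// Borrows the fields worth showing; everything else stays out of the dump.
#[derive(Serialize)]
struct Output<'a> {
    layers: &'a HashMap<String, u64>,
    lsn: u64,
}

fn main() -> anyhow::Result<()> {
    let full = Full {
        layers: HashMap::from([("000000..-delta".to_string(), 8192)]),
        lsn: 0x1696_0E8,
        internal_only: vec![1, 2, 3],
    };
    let output = Output { layers: &full.layers, lsn: full.lsn };
    println!("{}", serde_json::to_string_pretty(&output)?);
    Ok(())
}
```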
@@ -1,13 +1,15 @@
 use std::path::{Path, PathBuf};
 
 use anyhow::Result;
-use camino::Utf8Path;
+use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
 use pageserver::tenant::disk_btree::DiskBtreeReader;
 use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
+use pageserver::tenant::storage_layer::{delta_layer, image_layer};
+use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
@@ -20,6 +22,7 @@ use pageserver::{
 };
 use std::fs;
 use utils::bin_ser::BeSer;
+use utils::id::{TenantId, TimelineId};
 
 use crate::layer_map_analyzer::parse_filename;
 
@@ -45,6 +48,13 @@ pub(crate) enum LayerCmd {
         /// The id from list-layer command
         id: usize,
     },
+    RewriteSummary {
+        layer_file_path: Utf8PathBuf,
+        #[clap(long)]
+        new_tenant_id: Option<TenantId>,
+        #[clap(long)]
+        new_timeline_id: Option<TimelineId>,
+    },
 }
 
 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
@@ -100,6 +110,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                     println!("- timeline {}", timeline.file_name().to_string_lossy());
                 }
             }
+            Ok(())
         }
         LayerCmd::ListLayer {
             path,
@@ -128,6 +139,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                 idx += 1;
             }
         }
+            Ok(())
         }
         LayerCmd::DumpLayer {
             path,
@@ -168,7 +180,63 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                 idx += 1;
             }
         }
+            Ok(())
+        }
+        LayerCmd::RewriteSummary {
+            layer_file_path,
+            new_tenant_id,
+            new_timeline_id,
+        } => {
+            pageserver::virtual_file::init(10);
+            pageserver::page_cache::init(100);
+
+            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+
+            macro_rules! rewrite_closure {
+                ($($summary_ty:tt)*) => {{
+                    |summary| $($summary_ty)* {
+                        tenant_id: new_tenant_id.unwrap_or(summary.tenant_id),
+                        timeline_id: new_timeline_id.unwrap_or(summary.timeline_id),
+                        ..summary
+                    }
+                }};
+            }
+
+            let res = ImageLayer::rewrite_summary(
+                layer_file_path,
+                rewrite_closure!(image_layer::Summary),
+                &ctx,
+            )
+            .await;
+            match res {
+                Ok(()) => {
+                    println!("Successfully rewrote summary of image layer {layer_file_path}");
+                    return Ok(());
+                }
+                Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
+                Err(image_layer::RewriteSummaryError::Other(e)) => {
+                    return Err(e);
+                }
+            }
+
+            let res = DeltaLayer::rewrite_summary(
+                layer_file_path,
+                rewrite_closure!(delta_layer::Summary),
+                &ctx,
+            )
+            .await;
+            match res {
+                Ok(()) => {
+                    println!("Successfully rewrote summary of delta layer {layer_file_path}");
+                    return Ok(());
+                }
+                Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
+                Err(delta_layer::RewriteSummaryError::Other(e)) => {
+                    return Err(e);
+                }
+            }
+
+            anyhow::bail!("not an image or delta layer: {layer_file_path}");
         }
     }
-    Ok(())
 }
@@ -5,11 +5,13 @@
 //! Separate, `metadata` subcommand allows to print and update pageserver's metadata file.
 
 mod draw_timeline_dir;
+mod index_part;
 mod layer_map_analyzer;
 mod layers;
 
 use camino::{Utf8Path, Utf8PathBuf};
 use clap::{Parser, Subcommand};
+use index_part::IndexPartCmd;
 use layers::LayerCmd;
 use pageserver::{
     context::{DownloadBehavior, RequestContext},
@@ -38,6 +40,8 @@ struct CliOpts {
 #[derive(Subcommand)]
 enum Commands {
     Metadata(MetadataCmd),
+    #[command(subcommand)]
+    IndexPart(IndexPartCmd),
     PrintLayerFile(PrintLayerFileCmd),
     DrawTimeline {},
     AnalyzeLayerMap(AnalyzeLayerMapCmd),
@@ -83,6 +87,9 @@ async fn main() -> anyhow::Result<()> {
         Commands::Metadata(cmd) => {
             handle_metadata(&cmd)?;
         }
+        Commands::IndexPart(cmd) => {
+            index_part::main(&cmd).await?;
+        }
         Commands::DrawTimeline {} => {
             draw_timeline_dir::main()?;
         }
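Wiring `IndexPartCmd` into `pagectl` relies on clap's nested subcommands: the variant is annotated with `#[command(subcommand)]` and carries its own `Subcommand` enum. A minimal sketch of that shape with illustrative names (not the real pagectl CLI):

```rust
use clap::{Parser, Subcommand};

#[derive(Parser)]
struct Cli {
    #[command(subcommand)]
    command: Commands,
}

#[derive(Subcommand)]
enum Commands {
    /// Operations on an index_part.json file
    #[command(subcommand)]
    IndexPart(IndexPartCmd),
}

#[derive(Subcommand)]
enum IndexPartCmd {
    Dump { path: std::path::PathBuf },
}

fn main() {
    // Invoked as e.g. `tool index-part dump <PATH>`; clap derives the parser from the enums.
    let cli = Cli::parse();
    match cli.command {
        Commands::IndexPart(IndexPartCmd::Dump { path }) => {
            println!("would dump {}", path.display())
        }
    }
}
```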
@@ -103,7 +103,11 @@ fn main() -> anyhow::Result<()> {
     } else {
         TracingErrorLayerEnablement::Disabled
     };
-    logging::init(conf.log_format, tracing_error_layer_enablement)?;
+    logging::init(
+        conf.log_format,
+        tracing_error_layer_enablement,
+        logging::Output::Stdout,
+    )?;
 
     // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
     // disarming this hook on pageserver, because we never tear down tracing.
@@ -366,13 +370,18 @@ fn start_pageserver(
     // Top-level cancellation token for the process
     let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
 
+    pageserver::PAGESERVER_SHUTDOWN_TOKEN
+        .set(shutdown_pageserver.clone())
+        .map_err(|_| ())
+        .expect("cannot be set already");
+
     // Set up remote storage client
     let remote_storage = create_remote_storage_client(conf)?;
 
     // Set up deletion queue
     let (deletion_queue, deletion_workers) = DeletionQueue::new(
         remote_storage.clone(),
-        ControlPlaneClient::new(conf, &shutdown_pageserver),
+        ControlPlaneClient::new(conf, shutdown_pageserver.child_token()),
         conf,
     );
     if let Some(deletion_workers) = deletion_workers {
@@ -398,15 +407,11 @@ fn start_pageserver(
     let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
     let (init_done_tx, init_done_rx) = utils::completion::channel();
 
-    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
-
     let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
 
     let order = pageserver::InitializationOrder {
         initial_tenant_load_remote: Some(init_done_tx),
         initial_tenant_load: Some(init_remote_done_tx),
-        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: Some(init_logical_size_done_tx),
         background_jobs_can_start: background_jobs_barrier.clone(),
     };
 
@@ -420,13 +425,12 @@ fn start_pageserver(
             deletion_queue_client,
         },
         order,
-        shutdown_pageserver.clone(),
+        shutdown_pageserver.child_token(),
     ))?;
     let tenant_manager = Arc::new(tenant_manager);
 
     BACKGROUND_RUNTIME.spawn({
-        let init_done_rx = init_done_rx;
-        let shutdown_pageserver = shutdown_pageserver.clone();
+        let shutdown_pageserver = shutdown_pageserver.child_token();
         let drive_init = async move {
             // NOTE: unlike many futures in pageserver, this one is cancellation-safe
             let guard = scopeguard::guard_on_success((), |_| {
@@ -460,7 +464,7 @@ fn start_pageserver(
             });
 
             let WaitForPhaseResult {
-                timeout_remaining: timeout,
+                timeout_remaining: _timeout,
                 skipped: init_load_skipped,
             } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;
 
@@ -468,26 +472,6 @@ fn start_pageserver(
 
             scopeguard::ScopeGuard::into_inner(guard);
 
-            let guard = scopeguard::guard_on_success((), |_| {
-                tracing::info!("Cancelled before initial logical sizes completed")
-            });
-
-            let logical_sizes_done = std::pin::pin!(async {
-                init_logical_size_done_rx.wait().await;
-                startup_checkpoint(
-                    started_startup_at,
-                    "initial_logical_sizes",
-                    "Initial logical sizes completed",
-                );
-            });
-
-            let WaitForPhaseResult {
-                timeout_remaining: _,
-                skipped: logical_sizes_skipped,
-            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
-
-            scopeguard::ScopeGuard::into_inner(guard);
-
             // allow background jobs to start: we either completed prior stages, or they reached timeout
             // and were skipped. It is important that we do not let them block background jobs indefinitely,
             // because things like consumption metrics for billing are blocked by this barrier.
@@ -510,9 +494,6 @@ fn start_pageserver(
|
|||||||
if let Some(f) = init_load_skipped {
|
if let Some(f) = init_load_skipped {
|
||||||
f.await;
|
f.await;
|
||||||
}
|
}
|
||||||
if let Some(f) = logical_sizes_skipped {
|
|
||||||
f.await;
|
|
||||||
}
|
|
||||||
scopeguard::ScopeGuard::into_inner(guard);
|
scopeguard::ScopeGuard::into_inner(guard);
|
||||||
|
|
||||||
startup_checkpoint(started_startup_at, "complete", "Startup complete");
|
startup_checkpoint(started_startup_at, "complete", "Startup complete");
|
||||||
@@ -540,6 +521,7 @@ fn start_pageserver(
|
|||||||
remote_storage.clone(),
|
remote_storage.clone(),
|
||||||
disk_usage_eviction_state.clone(),
|
disk_usage_eviction_state.clone(),
|
||||||
background_jobs_barrier.clone(),
|
background_jobs_barrier.clone(),
|
||||||
|
shutdown_pageserver.child_token(),
|
||||||
)?;
|
)?;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -560,13 +542,16 @@ fn start_pageserver(
|
|||||||
)
|
)
|
||||||
.context("Failed to initialize router state")?,
|
.context("Failed to initialize router state")?,
|
||||||
);
|
);
|
||||||
|
|
||||||
|
let cancel = shutdown_pageserver.child_token();
|
||||||
|
|
||||||
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
|
let router = http::make_router(router_state, launch_ts, http_auth.clone())?
|
||||||
.build()
|
.build()
|
||||||
.map_err(|err| anyhow!(err))?;
|
.map_err(|err| anyhow!(err))?;
|
||||||
let service = utils::http::RouterService::new(router).unwrap();
|
let service = utils::http::RouterService::new(router).unwrap();
|
||||||
let server = hyper::Server::from_tcp(http_listener)?
|
let server = hyper::Server::from_tcp(http_listener)?
|
||||||
.serve(service)
|
.serve(service)
|
||||||
.with_graceful_shutdown(task_mgr::shutdown_watcher());
|
.with_graceful_shutdown(cancel.clone().cancelled_owned());
|
||||||
|
|
||||||
task_mgr::spawn(
|
task_mgr::spawn(
|
||||||
MGMT_REQUEST_RUNTIME.handle(),
|
MGMT_REQUEST_RUNTIME.handle(),
|
||||||
@@ -575,6 +560,7 @@ fn start_pageserver(
|
|||||||
None,
|
None,
|
||||||
"http endpoint listener",
|
"http endpoint listener",
|
||||||
true,
|
true,
|
||||||
|
cancel,
|
||||||
async {
|
async {
|
||||||
server.await?;
|
server.await?;
|
||||||
Ok(())
|
Ok(())
|
||||||
@@ -583,7 +569,6 @@ fn start_pageserver(
|
|||||||
}
|
}
|
||||||
|
|
||||||
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
|
||||||
let background_jobs_barrier = background_jobs_barrier;
|
|
||||||
let metrics_ctx = RequestContext::todo_child(
|
let metrics_ctx = RequestContext::todo_child(
|
||||||
TaskKind::MetricsCollection,
|
TaskKind::MetricsCollection,
|
||||||
// This task itself shouldn't download anything.
|
// This task itself shouldn't download anything.
|
||||||
@@ -601,6 +586,7 @@ fn start_pageserver(
|
|||||||
None,
|
None,
|
||||||
"consumption metrics collection",
|
"consumption metrics collection",
|
||||||
true,
|
true,
|
||||||
|
shutdown_pageserver.child_token(),
|
||||||
async move {
|
async move {
|
||||||
// first wait until background jobs are cleared to launch.
|
// first wait until background jobs are cleared to launch.
|
||||||
//
|
//
|
||||||
@@ -621,6 +607,7 @@ fn start_pageserver(
|
|||||||
conf.synthetic_size_calculation_interval,
|
conf.synthetic_size_calculation_interval,
|
||||||
conf.id,
|
conf.id,
|
||||||
local_disk_storage,
|
local_disk_storage,
|
||||||
|
cancel,
|
||||||
metrics_ctx,
|
metrics_ctx,
|
||||||
)
|
)
|
||||||
.instrument(info_span!("metrics_collection"))
|
.instrument(info_span!("metrics_collection"))
|
||||||
@@ -648,6 +635,7 @@ fn start_pageserver(
|
|||||||
None,
|
None,
|
||||||
"libpq endpoint listener",
|
"libpq endpoint listener",
|
||||||
true,
|
true,
|
||||||
|
shutdown_pageserver.child_token(),
|
||||||
async move {
|
async move {
|
||||||
page_service::libpq_listener_main(
|
page_service::libpq_listener_main(
|
||||||
conf,
|
conf,
|
||||||
@@ -681,9 +669,8 @@ fn start_pageserver(
|
|||||||
signal.name()
|
signal.name()
|
||||||
);
|
);
|
||||||
|
|
||||||
// This cancels the `shutdown_pageserver` cancellation tree.
|
// This cancels the `shutdown_pageserver` cancellation tree and signals cancellation to
|
||||||
// Right now that tree doesn't reach very far, and `task_mgr` is used instead.
|
// all tasks in the system.
|
||||||
// The plan is to change that over time.
|
|
||||||
shutdown_pageserver.take();
|
shutdown_pageserver.take();
|
||||||
let bg_remote_storage = remote_storage.clone();
|
let bg_remote_storage = remote_storage.clone();
|
||||||
let bg_deletion_queue = deletion_queue.clone();
|
let bg_deletion_queue = deletion_queue.clone();
|
||||||
|
|||||||
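The pageserver.rs hunks above move shutdown signalling onto a `CancellationToken` tree: the root `shutdown_pageserver` token hands a `child_token()` to the deletion queue, the HTTP listener, metrics collection, and the libpq listener, and the HTTP server's graceful shutdown now waits on `cancelled_owned()`. The sketch below is a stand-alone illustration of that tokio-util pattern, not the pageserver wiring; the task names and intervals are made up.

// Illustrative sketch only -- not the pageserver code. One root token,
// child tokens for each task, one cancel() to shut everything down.
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    let shutdown = CancellationToken::new();

    // Each task gets its own child token; cancelling the root cancels them all.
    let worker_cancel = shutdown.child_token();
    let worker = tokio::spawn(async move {
        loop {
            tokio::select! {
                _ = worker_cancel.cancelled() => break,
                _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => {
                    // periodic work would go here
                }
            }
        }
    });

    // An HTTP server can consume an owned `cancelled` future for graceful shutdown,
    // mirroring `.with_graceful_shutdown(cancel.clone().cancelled_owned())` above.
    let http_cancel = shutdown.child_token();
    let _graceful = http_cancel.cancelled_owned();

    // On SIGTERM/SIGINT the whole tree is cancelled at once.
    shutdown.cancel();
    worker.await.unwrap();
}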
@@ -5,6 +5,7 @@
//! See also `settings.md` for better description on every parameter.

use anyhow::{anyhow, bail, ensure, Context, Result};
+use pageserver_api::shard::TenantShardId;
use remote_storage::{RemotePath, RemoteStorageConfig};
use serde::de::IntoDeserializer;
use std::env;
@@ -25,7 +26,7 @@ use toml_edit::{Document, Item};
use camino::{Utf8Path, Utf8PathBuf};
use postgres_backend::AuthType;
use utils::{
-id::{NodeId, TenantId, TimelineId},
+id::{NodeId, TimelineId},
logging::LogFormat,
};

@@ -628,12 +629,13 @@ impl PageServerConf {
self.deletion_prefix().join(format!("header-{VERSION:02x}"))
}

-pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenants_path().join(tenant_id.to_string())
+pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenants_path().join(tenant_shard_id.to_string())
}

-pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
+pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id)
+.join(IGNORED_TENANT_FILE_NAME)
}

/// Points to a place in pageserver's local directory,
@@ -641,47 +643,53 @@ impl PageServerConf {
///
/// Legacy: superseded by tenant_location_config_path. Eventually
/// remove this function.
-pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
+pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
}

-pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id)
+pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id)
.join(TENANT_LOCATION_CONFIG_NAME)
}

-pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
+pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id)
+.join(TIMELINES_SEGMENT_NAME)
}

-pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
-self.timelines_path(tenant_id).join(timeline_id.to_string())
+pub fn timeline_path(
+&self,
+tenant_shard_id: &TenantShardId,
+timeline_id: &TimelineId,
+) -> Utf8PathBuf {
+self.timelines_path(tenant_shard_id)
+.join(timeline_id.to_string())
}

pub fn timeline_uninit_mark_file_path(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
-self.timeline_path(&tenant_id, &timeline_id),
+self.timeline_path(&tenant_shard_id, &timeline_id),
TIMELINE_UNINIT_MARK_SUFFIX,
)
}

pub fn timeline_delete_mark_file_path(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
) -> Utf8PathBuf {
path_with_suffix_extension(
-self.timeline_path(&tenant_id, &timeline_id),
+self.timeline_path(&tenant_shard_id, &timeline_id),
TIMELINE_DELETE_MARK_SUFFIX,
)
}

-pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-self.tenant_path(tenant_id)
+pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+self.tenant_path(tenant_shard_id)
.join(TENANT_DELETED_MARKER_FILE_NAME)
}

@@ -691,20 +699,24 @@ impl PageServerConf {

pub fn trace_path(
&self,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
timeline_id: &TimelineId,
connection_id: &ConnectionId,
) -> Utf8PathBuf {
self.traces_path()
-.join(tenant_id.to_string())
+.join(tenant_shard_id.to_string())
.join(timeline_id.to_string())
.join(connection_id.to_string())
}

/// Points to a place in pageserver's local directory,
/// where certain timeline's metadata file should be located.
-pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
-self.timeline_path(tenant_id, timeline_id)
+pub fn metadata_path(
+&self,
+tenant_shard_id: &TenantShardId,
+timeline_id: &TimelineId,
+) -> Utf8PathBuf {
+self.timeline_path(tenant_shard_id, timeline_id)
.join(METADATA_FILE_NAME)
}

@@ -767,7 +779,7 @@ impl PageServerConf {
builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
}
"tenant_config" => {
-t_conf = Self::parse_toml_tenant_conf(item)?;
+t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
}
"id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
"broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
@@ -841,114 +853,10 @@ impl PageServerConf {
Ok(conf)
}

-// subroutine of parse_and_validate to parse `[tenant_conf]` section
-
-pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
-let mut t_conf: TenantConfOpt = Default::default();
-if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
-t_conf.checkpoint_distance =
-Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
-}
-
-if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
-t_conf.checkpoint_timeout = Some(parse_toml_duration(
-"checkpoint_timeout",
-checkpoint_timeout,
-)?);
-}
-
-if let Some(compaction_target_size) = item.get("compaction_target_size") {
-t_conf.compaction_target_size = Some(parse_toml_u64(
-"compaction_target_size",
-compaction_target_size,
-)?);
-}
-
-if let Some(compaction_period) = item.get("compaction_period") {
-t_conf.compaction_period =
-Some(parse_toml_duration("compaction_period", compaction_period)?);
-}
-
-if let Some(compaction_threshold) = item.get("compaction_threshold") {
-t_conf.compaction_threshold =
-Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
-}
-
-if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
-t_conf.image_creation_threshold = Some(
-parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
-);
-}
-
-if let Some(gc_horizon) = item.get("gc_horizon") {
-t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
-}
-
-if let Some(gc_period) = item.get("gc_period") {
-t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
-}
-
-if let Some(pitr_interval) = item.get("pitr_interval") {
-t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
-}
-if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") {
-t_conf.walreceiver_connect_timeout = Some(parse_toml_duration(
-"walreceiver_connect_timeout",
-walreceiver_connect_timeout,
-)?);
-}
-if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") {
-t_conf.lagging_wal_timeout = Some(parse_toml_duration(
-"lagging_wal_timeout",
-lagging_wal_timeout,
-)?);
-}
-if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
-t_conf.max_lsn_wal_lag =
-Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
-}
-if let Some(trace_read_requests) = item.get("trace_read_requests") {
-t_conf.trace_read_requests =
-Some(trace_read_requests.as_bool().with_context(|| {
-"configure option trace_read_requests is not a bool".to_string()
-})?);
-}
-
-if let Some(eviction_policy) = item.get("eviction_policy") {
-t_conf.eviction_policy = Some(
-deserialize_from_item("eviction_policy", eviction_policy)
-.context("parse eviction_policy")?,
-);
-}
-
-if let Some(item) = item.get("min_resident_size_override") {
-t_conf.min_resident_size_override = Some(
-deserialize_from_item("min_resident_size_override", item)
-.context("parse min_resident_size_override")?,
-);
-}
-
-if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") {
-t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration(
-"evictions_low_residence_duration_metric_threshold",
-item,
-)?);
-}
-
-if let Some(gc_feedback) = item.get("gc_feedback") {
-t_conf.gc_feedback = Some(
-gc_feedback
-.as_bool()
-.with_context(|| "configure option gc_feedback is not a bool".to_string())?,
-);
-}
-
-Ok(t_conf)
-}

#[cfg(test)]
pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
-Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
+let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
+Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
}

pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -1417,6 +1325,37 @@ trace_read_requests = {trace_read_requests}"#,
Ok(())
}

+#[test]
+fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
+let config_string = r#"
+[tenant_config]
+checkpoint_distance = -1 # supposed to be an u64
+"#
+.to_string();
+
+let toml: Document = config_string.parse()?;
+let item = toml.get("tenant_config").unwrap();
+let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err();
+
+let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64";
+assert_eq!(error.to_string(), expected_error_str);
+
+Ok(())
+}
+
+#[test]
+fn parse_override_tenant_config() -> anyhow::Result<()> {
+let config_string = r#"tenant_config={ min_resident_size_override = 400 }"#.to_string();
+
+let toml: Document = config_string.parse()?;
+let item = toml.get("tenant_config").unwrap();
+let conf = TenantConfOpt::try_from(item.to_owned()).unwrap();
+
+assert_eq!(conf.min_resident_size_override, Some(400));
+
+Ok(())
+}

#[test]
fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
let tempdir = tempdir()?;
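The config.rs hunks above re-key every on-disk path helper from `TenantId` to `TenantShardId`, so a shard's directory name comes from the shard id's string form. The following sketch is hypothetical (the `FakeTenantShardId` type and its display format are stand-ins, not the real `pageserver_api` type); it only illustrates why joining `to_string()` onto the tenants directory makes the layout shard-aware.

// Hypothetical stand-in types, for illustration of the path-helper pattern only.
use camino::Utf8PathBuf;
use std::fmt;

struct FakeTenantShardId {
    tenant_id: String, // hex tenant id
    shard_number: u8,
    shard_count: u8,
}

impl fmt::Display for FakeTenantShardId {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        // Assumed format: unsharded tenants print as the bare tenant id,
        // sharded ones get a shard suffix, so each shard gets its own directory.
        if self.shard_count == 0 {
            write!(f, "{}", self.tenant_id)
        } else {
            write!(f, "{}-{:02x}{:02x}", self.tenant_id, self.shard_number, self.shard_count)
        }
    }
}

fn tenant_path(tenants_dir: &Utf8PathBuf, id: &FakeTenantShardId) -> Utf8PathBuf {
    // Same shape as `self.tenants_path().join(tenant_shard_id.to_string())` above.
    tenants_dir.join(id.to_string())
}

fn main() {
    let base = Utf8PathBuf::from("/data/tenants");
    let id = FakeTenantShardId {
        tenant_id: "ad6c1a56f5680419d3a16ff55d97ec3c".to_string(),
        shard_number: 1,
        shard_count: 4,
    };
    // Prints /data/tenants/ad6c1a56f5680419d3a16ff55d97ec3c-0104
    println!("{}", tenant_path(&base, &id));
}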
@@ -3,7 +3,7 @@
use crate::context::{DownloadBehavior, RequestContext};
use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
use crate::tenant::tasks::BackgroundLoopKind;
-use crate::tenant::{mgr, LogicalSizeCalculationCause};
+use crate::tenant::{mgr, LogicalSizeCalculationCause, PageReconstructError, Tenant};
use camino::Utf8PathBuf;
use consumption_metrics::EventType;
use pageserver_api::models::TenantState;
@@ -12,6 +12,7 @@ use std::collections::HashMap;
use std::sync::Arc;
use std::time::{Duration, SystemTime};
use tokio::time::Instant;
+use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::NodeId;

@@ -37,6 +38,7 @@ type RawMetric = (MetricsKey, (EventType, u64));
type Cache = HashMap<MetricsKey, (EventType, u64)>;

/// Main thread that serves metrics collection
+#[allow(clippy::too_many_arguments)]
pub async fn collect_metrics(
metric_collection_endpoint: &Url,
metric_collection_interval: Duration,
@@ -44,6 +46,7 @@ pub async fn collect_metrics(
synthetic_size_calculation_interval: Duration,
node_id: NodeId,
local_disk_storage: Utf8PathBuf,
+cancel: CancellationToken,
ctx: RequestContext,
) -> anyhow::Result<()> {
if _cached_metric_collection_interval != Duration::ZERO {
@@ -62,10 +65,15 @@ pub async fn collect_metrics(
None,
"synthetic size calculation",
false,
+cancel.child_token(),
async move {
-calculate_synthetic_size_worker(synthetic_size_calculation_interval, &worker_ctx)
-.instrument(info_span!("synthetic_size_worker"))
-.await?;
+calculate_synthetic_size_worker(
+synthetic_size_calculation_interval,
+&cancel,
+&worker_ctx,
+)
+.instrument(info_span!("synthetic_size_worker"))
+.await?;
Ok(())
},
);
@@ -241,6 +249,7 @@ async fn reschedule(
/// Caclculate synthetic size for each active tenant
async fn calculate_synthetic_size_worker(
synthetic_size_calculation_interval: Duration,
+cancel: &CancellationToken,
ctx: &RequestContext,
) -> anyhow::Result<()> {
info!("starting calculate_synthetic_size_worker");
@@ -248,8 +257,6 @@ async fn calculate_synthetic_size_worker(
info!("calculate_synthetic_size_worker stopped");
};

-let cause = LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
-
loop {
let started_at = Instant::now();

@@ -261,21 +268,25 @@ async fn calculate_synthetic_size_worker(
}
};

-for (tenant_id, tenant_state) in tenants {
+for (tenant_shard_id, tenant_state) in tenants {
if tenant_state != TenantState::Active {
continue;
}

-if let Ok(tenant) = mgr::get_tenant(tenant_id, true) {
-// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
-// We can put in some prioritization for consumption metrics.
-// Same for the loop that fetches computed metrics.
-// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
-// which turns out is really handy to understand the system.
-if let Err(e) = tenant.calculate_synthetic_size(cause, ctx).await {
-error!("failed to calculate synthetic size for tenant {tenant_id}: {e:#}");
-}
+if !tenant_shard_id.is_zero() {
+// We only send consumption metrics from shard 0, so don't waste time calculating
+// synthetic size on other shards.
+continue;
}

+let Ok(tenant) = mgr::get_tenant(tenant_shard_id, true) else {
+continue;
+};
+
+// there is never any reason to exit calculate_synthetic_size_worker following any
+// return value -- we don't need to care about shutdown because no tenant is found when
+// pageserver is shut down.
+calculate_and_log(&tenant, cancel, ctx).await;
}

crate::tenant::tasks::warn_when_period_overrun(
@@ -286,7 +297,7 @@ async fn calculate_synthetic_size_worker(

let res = tokio::time::timeout_at(
started_at + synthetic_size_calculation_interval,
-task_mgr::shutdown_token().cancelled(),
+cancel.cancelled(),
)
.await;
if res.is_ok() {
@@ -294,3 +305,31 @@ async fn calculate_synthetic_size_worker(
}
}
}

+async fn calculate_and_log(tenant: &Tenant, cancel: &CancellationToken, ctx: &RequestContext) {
+const CAUSE: LogicalSizeCalculationCause =
+LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize;
+
+// TODO should we use concurrent_background_tasks_rate_limit() here, like the other background tasks?
+// We can put in some prioritization for consumption metrics.
+// Same for the loop that fetches computed metrics.
+// By using the same limiter, we centralize metrics collection for "start" and "finished" counters,
+// which turns out is really handy to understand the system.
+let Err(e) = tenant.calculate_synthetic_size(CAUSE, cancel, ctx).await else {
+return;
+};
+
+// this error can be returned if timeline is shutting down, but it does not
+// mean the synthetic size worker should terminate. we do not need any checks
+// in this function because `mgr::get_tenant` will error out after shutdown has
+// progressed to shutting down tenants.
+let shutting_down = matches!(
+e.downcast_ref::<PageReconstructError>(),
+Some(PageReconstructError::Cancelled | PageReconstructError::AncestorStopping(_))
+);
+
+if !shutting_down {
+let tenant_shard_id = tenant.tenant_shard_id();
+error!("failed to calculate synthetic size for tenant {tenant_shard_id}: {e:#}");
+}
+}
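In the consumption-metrics hunks above, the synthetic-size worker now takes a `CancellationToken` and sleeps between iterations with `tokio::time::timeout_at` racing the interval deadline against `cancel.cancelled()`, so shutdown wakes it immediately. A minimal self-contained sketch of that loop shape, with made-up work and interval:

// Minimal sketch, not the pageserver worker: same sleep-or-cancel structure.
use std::time::Duration;
use tokio::time::Instant;
use tokio_util::sync::CancellationToken;

async fn periodic_worker(interval: Duration, cancel: CancellationToken) {
    loop {
        let started_at = Instant::now();

        // ... one iteration of work would go here ...

        // Wait out the rest of the interval, but return early if cancelled.
        let res = tokio::time::timeout_at(started_at + interval, cancel.cancelled()).await;
        if res.is_ok() {
            // Ok(()) means `cancelled()` completed before the deadline: shut down.
            break;
        }
        // Err(Elapsed) means the interval passed: run the next iteration.
    }
}

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let handle = tokio::spawn(periodic_worker(Duration::from_secs(10), cancel.child_token()));
    cancel.cancel();
    handle.await.unwrap();
}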
@@ -1,5 +1,4 @@
-use crate::context::RequestContext;
-use anyhow::Context;
+use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
use chrono::{DateTime, Utc};
use consumption_metrics::EventType;
use futures::stream::StreamExt;
@@ -198,12 +197,12 @@ pub(super) async fn collect_all_metrics(
};

let tenants = futures::stream::iter(tenants).filter_map(|(id, state)| async move {
-if state != TenantState::Active {
+if state != TenantState::Active || !id.is_zero() {
None
} else {
crate::tenant::mgr::get_tenant(id, true)
.ok()
-.map(|tenant| (id, tenant))
+.map(|tenant| (id.tenant_id, tenant))
}
});

@@ -351,14 +350,17 @@ impl TimelineSnapshot {
let last_record_lsn = t.get_last_record_lsn();

let current_exact_logical_size = {
-let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
+let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
-let res = span
-.in_scope(|| t.get_current_logical_size(ctx))
-.context("get_current_logical_size");
-match res? {
+let size = span.in_scope(|| {
+t.get_current_logical_size(
+crate::tenant::timeline::GetLogicalSizePriority::Background,
+ctx,
+)
+});
+match size {
// Only send timeline logical size when it is fully calculated.
-(size, is_exact) if is_exact => Some(size),
-(_, _) => None,
+CurrentLogicalSize::Exact(ref size) => Some(size.into()),
+CurrentLogicalSize::Approximate(_) => None,
}
};
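The metrics hunk above switches from a `(size, is_exact)` tuple to matching on a `CurrentLogicalSize` enum, reporting a timeline's logical size only once the exact value is known. A toy sketch of the same decision, using a stand-in enum rather than the real pageserver type:

// `LogicalSize` here is a hypothetical stand-in for `CurrentLogicalSize`.
enum LogicalSize {
    Exact(u64),
    Approximate(u64),
}

fn reportable_size(size: LogicalSize) -> Option<u64> {
    match size {
        // Only send timeline logical size when it is fully calculated.
        LogicalSize::Exact(v) => Some(v),
        LogicalSize::Approximate(_) => None,
    }
}

fn main() {
    assert_eq!(reportable_size(LogicalSize::Exact(42)), Some(42));
    assert_eq!(reportable_size(LogicalSize::Approximate(40)), None);
}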
@@ -1,16 +1,15 @@
use std::collections::HashMap;

-use pageserver_api::control_api::{
-ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+use pageserver_api::{
+control_api::{
+ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+},
+shard::TenantShardId,
};
use serde::{de::DeserializeOwned, Serialize};
use tokio_util::sync::CancellationToken;
use url::Url;
-use utils::{
-backoff,
-generation::Generation,
-id::{NodeId, TenantId},
-};
+use utils::{backoff, generation::Generation, id::NodeId};

use crate::config::PageServerConf;

@@ -31,17 +30,17 @@ pub enum RetryForeverError {

#[async_trait::async_trait]
pub trait ControlPlaneGenerationsApi {
-async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
+async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
async fn validate(
&self,
-tenants: Vec<(TenantId, Generation)>,
+tenants: Vec<(TenantShardId, Generation)>,
-) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
+) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
}

impl ControlPlaneClient {
/// A None return value indicates that the input `conf` object does not have control
/// plane API enabled.
-pub fn new(conf: &'static PageServerConf, cancel: &CancellationToken) -> Option<Self> {
+pub fn new(conf: &'static PageServerConf, cancel: CancellationToken) -> Option<Self> {
let mut url = match conf.control_plane_api.as_ref() {
Some(u) => u.clone(),
None => return None,
@@ -68,7 +67,7 @@ impl ControlPlaneClient {
http_client: client.build().expect("Failed to construct HTTP client"),
base_url: url,
node_id: conf.id,
-cancel: cancel.clone(),
+cancel,
})
}

@@ -127,7 +126,7 @@ impl ControlPlaneClient {
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
-async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("re-attach")
@@ -154,8 +153,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
/// Block until we get a successful response, or error out if we are shut down
async fn validate(
&self,
-tenants: Vec<(TenantId, Generation)>,
+tenants: Vec<(TenantShardId, Generation)>,
-) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
let re_attach_path = self
.base_url
.join("validate")
@@ -10,11 +10,12 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
use crate::metrics;
use crate::tenant::remote_timeline_client::remote_layer_path;
use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::tenant::remote_timeline_client::LayerFileMetadata;
use crate::virtual_file::MaybeFatalIo;
use crate::virtual_file::VirtualFile;
use anyhow::Context;
use camino::Utf8PathBuf;
-use hex::FromHex;
+use pageserver_api::shard::TenantShardId;
use remote_storage::{GenericRemoteStorage, RemotePath};
use serde::Deserialize;
use serde::Serialize;
@@ -25,7 +26,7 @@ use tracing::Instrument;
use tracing::{self, debug, error};
use utils::crashsafe::path_with_suffix_extension;
use utils::generation::Generation;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;
use utils::lsn::AtomicLsn;
use utils::lsn::Lsn;

@@ -159,11 +160,10 @@ pub struct DeletionQueueClient {
lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
}

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
struct TenantDeletionList {
/// For each Timeline, a list of key fragments to append to the timeline remote path
/// when reconstructing a full key
-#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
timelines: HashMap<TimelineId, Vec<String>>,

/// The generation in which this deletion was emitted: note that this may not be the
@@ -178,43 +178,11 @@ impl TenantDeletionList {
}
}

-/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
-fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error>
-where
-S: serde::Serializer,
-V: Serialize,
-I: AsRef<[u8]>,
-{
-let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
-
-transformed
-.collect::<HashMap<String, &V>>()
-.serialize(serializer)
-}
-
-/// For HashMaps using a FromHex key, where we would like to decode the key
-fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error>
-where
-D: serde::de::Deserializer<'de>,
-V: Deserialize<'de>,
-I: FromHex + std::hash::Hash + Eq,
-{
-let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
-hex_map
-.into_iter()
-.map(|(k, v)| {
-I::from_hex(k)
-.map(|k| (k, v))
-.map_err(|_| serde::de::Error::custom("Invalid hex ID"))
-})
-.collect()
-}
-
/// Files ending with this suffix will be ignored and erased
/// during recovery as startup.
const TEMP_SUFFIX: &str = "tmp";

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
struct DeletionList {
/// Serialization version, for future use
version: u8,
@@ -226,8 +194,7 @@ struct DeletionList {
/// nested HashMaps by TenantTimelineID. Each Tenant only appears once
/// with one unique generation ID: if someone tries to push a second generation
/// ID for the same tenant, we will start a new DeletionList.
-#[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
-tenants: HashMap<TenantId, TenantDeletionList>,
+tenants: HashMap<TenantShardId, TenantDeletionList>,

/// Avoid having to walk `tenants` to calculate the number of keys in
/// the nested deletion lists
@@ -299,7 +266,7 @@ impl DeletionList {
/// deletion list.
fn push(
&mut self,
-tenant: &TenantId,
+tenant: &TenantShardId,
timeline: &TimelineId,
generation: Generation,
objects: &mut Vec<RemotePath>,
@@ -391,7 +358,7 @@ struct TenantLsnState {

#[derive(Default)]
struct VisibleLsnUpdates {
-tenants: HashMap<TenantId, TenantLsnState>,
+tenants: HashMap<TenantShardId, TenantLsnState>,
}

impl VisibleLsnUpdates {
@@ -448,7 +415,7 @@ impl DeletionQueueClient {

pub(crate) fn recover(
&self,
-attached_tenants: HashMap<TenantId, Generation>,
+attached_tenants: HashMap<TenantShardId, Generation>,
) -> Result<(), DeletionQueueError> {
self.do_push(
&self.tx,
@@ -465,7 +432,7 @@ impl DeletionQueueClient {
/// backend will later wake up and notice that the tenant's generation requires validation.
pub(crate) async fn update_remote_consistent_lsn(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
lsn: Lsn,
@@ -476,10 +443,13 @@ impl DeletionQueueClient {
.write()
.expect("Lock should never be poisoned");

-let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
-timelines: HashMap::new(),
-generation: current_generation,
-});
+let tenant_entry = locked
+.tenants
+.entry(tenant_shard_id)
+.or_insert(TenantLsnState {
+timelines: HashMap::new(),
+generation: current_generation,
+});

if tenant_entry.generation != current_generation {
// Generation might have changed if we were detached and then re-attached: in this case,
@@ -506,27 +476,29 @@ impl DeletionQueueClient {
/// generations in `layers` are the generations in which those layers were written.
pub(crate) async fn push_layers(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
-layers: Vec<(LayerFileName, Generation)>,
+layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
if current_generation.is_none() {
debug!("Enqueuing deletions in legacy mode, skipping queue");

let mut layer_paths = Vec::new();
-for (layer, generation) in layers {
+for (layer, meta) in layers {
layer_paths.push(remote_layer_path(
-&tenant_id,
+&tenant_shard_id.tenant_id,
&timeline_id,
+meta.shard,
&layer,
-generation,
+meta.generation,
));
}
self.push_immediate(layer_paths).await?;
return self.flush_immediate().await;
}

-self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
+self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
}

/// When a Tenant has a generation, push_layers is always synchronous because
@@ -536,10 +508,10 @@ impl DeletionQueueClient {
/// support (`<https://github.com/neondatabase/neon/issues/5395>`)
pub(crate) fn push_layers_sync(
&self,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
timeline_id: TimelineId,
current_generation: Generation,
-layers: Vec<(LayerFileName, Generation)>,
+layers: Vec<(LayerFileName, LayerFileMetadata)>,
) -> Result<(), DeletionQueueError> {
metrics::DELETION_QUEUE
.keys_submitted
@@ -547,7 +519,7 @@ impl DeletionQueueClient {
self.do_push(
&self.tx,
ListWriterQueueMessage::Delete(DeletionOp {
-tenant_id,
+tenant_shard_id,
timeline_id,
layers,
generation: current_generation,
@@ -750,6 +722,7 @@ impl DeletionQueue {
mod test {
use camino::Utf8Path;
use hex_literal::hex;
+use pageserver_api::shard::ShardIndex;
use std::{io::ErrorKind, time::Duration};
use tracing::info;

@@ -814,12 +787,12 @@ mod test {
}

fn set_latest_generation(&self, gen: Generation) {
-let tenant_id = self.harness.tenant_id;
+let tenant_shard_id = self.harness.tenant_shard_id;
self.mock_control_plane
.latest_generation
.lock()
.unwrap()
-.insert(tenant_id, gen);
+.insert(tenant_shard_id, gen);
}

/// Returns remote layer file name, suitable for use in assert_remote_files
@@ -828,8 +801,8 @@ mod test {
file_name: LayerFileName,
gen: Generation,
) -> anyhow::Result<String> {
-let tenant_id = self.harness.tenant_id;
-let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+let tenant_shard_id = self.harness.tenant_shard_id;
+let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
std::fs::create_dir_all(&remote_timeline_path)?;
let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
@@ -847,7 +820,7 @@ mod test {

#[derive(Debug, Clone)]
struct MockControlPlane {
-pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantId, Generation>>>,
+pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantShardId, Generation>>>,
}

impl MockControlPlane {
@@ -861,20 +834,20 @@ mod test {
#[async_trait::async_trait]
impl ControlPlaneGenerationsApi for MockControlPlane {
#[allow(clippy::diverging_sub_expression)] // False positive via async_trait
-async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
unimplemented!()
}
async fn validate(
&self,
-tenants: Vec<(TenantId, Generation)>,
+tenants: Vec<(TenantShardId, Generation)>,
-) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
let mut result = HashMap::new();

let latest_generation = self.latest_generation.lock().unwrap();

-for (tenant_id, generation) in tenants {
+for (tenant_shard_id, generation) in tenants {
-if let Some(latest) = latest_generation.get(&tenant_id) {
+if let Some(latest) = latest_generation.get(&tenant_shard_id) {
-result.insert(tenant_id, *latest == generation);
+result.insert(tenant_shard_id, *latest == generation);
}
}

@@ -978,10 +951,10 @@ mod test {
client.recover(HashMap::new())?;

let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-let tenant_id = ctx.harness.tenant_id;
+let tenant_shard_id = ctx.harness.tenant_shard_id;

let content: Vec<u8> = "victim1 contents".into();
-let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();

@@ -989,6 +962,8 @@ mod test {
// we delete, and the generation of the running Tenant.
let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
+let layer_metadata =
+LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

let remote_layer_file_name_1 =
format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
@@ -1009,10 +984,10 @@ mod test {
info!("Pushing");
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
now_generation,
-[(layer_file_name_1.clone(), layer_generation)].to_vec(),
+[(layer_file_name_1.clone(), layer_metadata)].to_vec(),
)
.await?;
assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
@@ -1051,11 +1026,13 @@ mod test {
let stale_generation = latest_generation.previous();
// Generation that our example layer file was written with
let layer_generation = stale_generation.previous();
+let layer_metadata =
+LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

ctx.set_latest_generation(latest_generation);

-let tenant_id = ctx.harness.tenant_id;
+let tenant_shard_id = ctx.harness.tenant_shard_id;
-let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());

// Initial state: a remote layer exists
@@ -1065,10 +1042,10 @@ mod test {
tracing::debug!("Pushing...");
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
stale_generation,
-[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)
.await?;

@@ -1080,10 +1057,10 @@ mod test {
tracing::debug!("Pushing...");
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
latest_generation,
-[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)
.await?;

@@ -1102,14 +1079,16 @@ mod test {
let client = ctx.deletion_queue.new_client();
client.recover(HashMap::new())?;

-let tenant_id = ctx.harness.tenant_id;
+let tenant_shard_id = ctx.harness.tenant_shard_id;

-let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
let deletion_prefix = ctx.harness.conf.deletion_prefix();

let layer_generation = Generation::new(0xdeadbeef);
let now_generation = Generation::new(0xfeedbeef);
+let layer_metadata =
+LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

// Inject a deletion in the generation before generation_now: after restart,
// this deletion should _not_ get executed (only the immediately previous
@@ -1118,10 +1097,10 @@ mod test {
ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
now_generation.previous(),
-[(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+[(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
)
.await?;

@@ -1132,10 +1111,10 @@ mod test {
ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
client
.push_layers(
-tenant_id,
+tenant_shard_id,
TIMELINE_ID,
now_generation,
-[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
+[(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
)
.await?;

@@ -1163,7 +1142,7 @@ mod test {
drop(client);
ctx.restart().await;
let client = ctx.deletion_queue.new_client();
-client.recover(HashMap::from([(tenant_id, now_generation)]))?;
+client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?;

info!("Flush-executing");
client.flush_execute().await?;
@@ -1225,12 +1204,13 @@ pub(crate) mod mock {
match msg {
ListWriterQueueMessage::Delete(op) => {
let mut objects = op.objects;
-for (layer, generation) in op.layers {
+for (layer, meta) in op.layers {
objects.push(remote_layer_path(
-&op.tenant_id,
+&op.tenant_shard_id.tenant_id,
&op.timeline_id,
+meta.shard,
&layer,
-generation,
+meta.generation,
));
}

@@ -1310,4 +1290,34 @@ pub(crate) mod mock {
}
}
}

+/// Test round-trip serialization/deserialization, and test stability of the format
+/// vs. a static expected string for the serialized version.
+#[test]
+fn deletion_list_serialization() -> anyhow::Result<()> {
+let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c"
+.to_string()
+.parse::<TenantShardId>()?;
+let timeline_id = "be322c834ed9e709e63b5c9698691910"
+.to_string()
+.parse::<TimelineId>()?;
+let generation = Generation::new(123);
+
+let object =
+RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?;
+let mut objects = [object].to_vec();
+
+let mut example = DeletionList::new(1);
+example.push(&tenant_id, &timeline_id, generation, &mut objects);
+
+let encoded = serde_json::to_string(&example)?;
+
+let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string();
+assert_eq!(encoded, expected);
+
+let decoded = serde_json::from_str::<DeletionList>(&encoded)?;
+assert_eq!(example, decoded);
+
+Ok(())
+}
}
@@ -19,6 +19,7 @@ use std::collections::HashMap;
 use std::fs::create_dir_all;
 use std::time::Duration;
 
+use pageserver_api::shard::TenantShardId;
 use regex::Regex;
 use remote_storage::RemotePath;
 use tokio_util::sync::CancellationToken;
@@ -26,13 +27,13 @@ use tracing::debug;
 use tracing::info;
 use tracing::warn;
 use utils::generation::Generation;
-use utils::id::TenantId;
 use utils::id::TimelineId;
 
 use crate::config::PageServerConf;
 use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::remote_timeline_client::LayerFileMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::virtual_file::on_fatal_io_error;
 use crate::virtual_file::MaybeFatalIo;
@@ -53,22 +54,22 @@ const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);
 
 #[derive(Debug)]
 pub(super) struct DeletionOp {
-pub(super) tenant_id: TenantId,
+pub(super) tenant_shard_id: TenantShardId,
 pub(super) timeline_id: TimelineId,
 // `layers` and `objects` are both just lists of objects. `layers` is used if you do not
 // have a config object handy to project it to a remote key, and need the consuming worker
 // to do it for you.
-pub(super) layers: Vec<(LayerFileName, Generation)>,
+pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
 pub(super) objects: Vec<RemotePath>,
 
-/// The _current_ generation of the Tenant attachment in which we are enqueuing
+/// The _current_ generation of the Tenant shard attachment in which we are enqueuing
 /// this deletion.
 pub(super) generation: Generation,
 }
 
 #[derive(Debug)]
 pub(super) struct RecoverOp {
-pub(super) attached_tenants: HashMap<TenantId, Generation>,
+pub(super) attached_tenants: HashMap<TenantShardId, Generation>,
 }
 
 #[derive(Debug)]
@@ -205,7 +206,7 @@ impl ListWriter {
 
 async fn recover(
 &mut self,
-attached_tenants: HashMap<TenantId, Generation>,
+attached_tenants: HashMap<TenantShardId, Generation>,
 ) -> Result<(), anyhow::Error> {
 debug!(
 "recovering with {} attached tenants",
@@ -308,10 +309,21 @@ impl ListWriter {
 // generation was issued to another node in the interval while we restarted,
 // then we may treat deletion lists from the previous generation as if they
 // belong to our currently attached generation, and proceed to validate & execute.
-for (tenant_id, tenant_list) in &mut deletion_list.tenants {
-if let Some(attached_gen) = attached_tenants.get(tenant_id) {
+for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
+if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
 if attached_gen.previous() == tenant_list.generation {
+info!(
+seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+shard_id=%tenant_shard_id.shard_slug(),
+old_gen=?tenant_list.generation, new_gen=?attached_gen,
+"Updating gen on recovered list");
 tenant_list.generation = *attached_gen;
+} else {
+info!(
+seq=%s, tenant_id=%tenant_shard_id.tenant_id,
+shard_id=%tenant_shard_id.shard_slug(),
+old_gen=?tenant_list.generation, new_gen=?attached_gen,
+"Encountered stale generation on recovered list");
 }
 }
 }
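Note on the hunk above: the whole recovery rule is the `attached_gen.previous() == tenant_list.generation` comparison, now keyed by tenant shard rather than tenant. A standalone sketch of that rule, not part of the patch; `Generation` here is a simplified stand-in for `utils::generation::Generation`:

    // Simplified stand-in newtype; the real Generation lives in utils::generation.
    #[derive(Clone, Copy, PartialEq, Eq, Debug)]
    struct Generation(u32);

    impl Generation {
        fn previous(self) -> Generation {
            Generation(self.0.saturating_sub(1))
        }
    }

    /// A recovered deletion list is adopted into the current attachment generation
    /// only if it was written by the immediately preceding generation; anything
    /// older is logged as stale and left untouched.
    fn should_adopt(attached: Generation, list: Generation) -> bool {
        attached.previous() == list
    }

    fn main() {
        assert!(should_adopt(Generation(0xfeed_beef), Generation(0xfeed_beee)));
        assert!(!should_adopt(Generation(0xfeed_beef), Generation(0xdead_beef)));
    }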
@@ -387,25 +399,26 @@ impl ListWriter {
 );
 
 let mut layer_paths = Vec::new();
-for (layer, generation) in op.layers {
+for (layer, meta) in op.layers {
 layer_paths.push(remote_layer_path(
-&op.tenant_id,
+&op.tenant_shard_id.tenant_id,
 &op.timeline_id,
+meta.shard,
 &layer,
-generation,
+meta.generation,
 ));
 }
 layer_paths.extend(op.objects);
 
 if !self.pending.push(
-&op.tenant_id,
+&op.tenant_shard_id,
 &op.timeline_id,
 op.generation,
 &mut layer_paths,
 ) {
 self.flush().await;
 let retry_succeeded = self.pending.push(
-&op.tenant_id,
+&op.tenant_shard_id,
 &op.timeline_id,
 op.generation,
 &mut layer_paths,

@@ -178,7 +178,14 @@ where
 .unwrap_or(false);
 
 if valid && *validated_generation == tenant_lsn_state.generation {
-for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+for (timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+tracing::debug!(
+%tenant_id,
+%timeline_id,
+current = %pending_lsn.result_slot.load(),
+projected = %pending_lsn.projected,
+"advancing validated remote_consistent_lsn",
+);
 pending_lsn.result_slot.store(pending_lsn.projected);
 }
 } else {

@@ -42,7 +42,6 @@
 // reading these fields. We use the Debug impl for semi-structured logging, though.
 
 use std::{
-collections::HashMap,
 sync::Arc,
 time::{Duration, SystemTime},
 };
@@ -88,6 +87,7 @@ pub fn launch_disk_usage_global_eviction_task(
 storage: GenericRemoteStorage,
 state: Arc<State>,
 background_jobs_barrier: completion::Barrier,
+cancel: CancellationToken,
 ) -> anyhow::Result<()> {
 let Some(task_config) = &conf.disk_usage_based_eviction else {
 info!("disk usage based eviction task not configured");
@@ -103,6 +103,7 @@ pub fn launch_disk_usage_global_eviction_task(
 None,
 "disk usage based eviction",
 false,
+cancel,
 async move {
 let cancel = task_mgr::shutdown_token();
 
@@ -125,7 +126,7 @@ pub fn launch_disk_usage_global_eviction_task(
 async fn disk_usage_eviction_task(
 state: &State,
 task_config: &DiskUsageEvictionTaskConfig,
-_storage: &GenericRemoteStorage,
+storage: &GenericRemoteStorage,
 tenants_dir: &Utf8Path,
 cancel: CancellationToken,
 ) {
@@ -149,8 +150,14 @@ async fn disk_usage_eviction_task(
 let start = Instant::now();
 
 async {
-let res =
-disk_usage_eviction_task_iteration(state, task_config, tenants_dir, &cancel).await;
+let res = disk_usage_eviction_task_iteration(
+state,
+task_config,
+storage,
+tenants_dir,
+&cancel,
+)
+.await;
 
 match res {
 Ok(()) => {}
@@ -181,12 +188,13 @@ pub trait Usage: Clone + Copy + std::fmt::Debug {
 async fn disk_usage_eviction_task_iteration(
 state: &State,
 task_config: &DiskUsageEvictionTaskConfig,
+storage: &GenericRemoteStorage,
 tenants_dir: &Utf8Path,
 cancel: &CancellationToken,
 ) -> anyhow::Result<()> {
 let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
 .context("get filesystem-level disk usage before evictions")?;
-let res = disk_usage_eviction_task_iteration_impl(state, usage_pre, cancel).await;
+let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
 match res {
 Ok(outcome) => {
 debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -268,8 +276,9 @@ struct LayerCount {
 count: usize,
 }
 
-pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
+pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 state: &State,
+_storage: &GenericRemoteStorage,
 usage_pre: U,
 cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
@@ -310,7 +319,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 .unwrap()
 .as_micros(),
 partition,
-desc.tenant_id,
+desc.tenant_shard_id,
 desc.timeline_id,
 candidate.layer,
 );
@@ -321,16 +330,16 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 // Walk through the list of candidates, until we have accumulated enough layers to get
 // us back under the pressure threshold. 'usage_planned' is updated so that it tracks
 // how much disk space would be used after evicting all the layers up to the current
-// point in the list. The layers are collected in 'batched', grouped per timeline.
+// point in the list.
 //
 // If we get far enough in the list that we start to evict layers that are below
 // the tenant's min-resident-size threshold, print a warning, and memorize the disk
 // usage at that point, in 'usage_planned_min_resident_size_respecting'.
-let mut batched: HashMap<_, Vec<_>> = HashMap::new();
 let mut warned = None;
 let mut usage_planned = usage_pre;
-let mut max_batch_size = 0;
+let mut evicted_amount = 0;
-for (i, (partition, candidate)) in candidates.into_iter().enumerate() {
+for (i, (partition, candidate)) in candidates.iter().enumerate() {
 if !usage_planned.has_pressure() {
 debug!(
 no_candidates_evicted = i,
@@ -339,25 +348,13 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 break;
 }
 
-if partition == MinResidentSizePartition::Below && warned.is_none() {
+if partition == &MinResidentSizePartition::Below && warned.is_none() {
 warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
 warned = Some(usage_planned);
 }
 
 usage_planned.add_available_bytes(candidate.layer.layer_desc().file_size);
+evicted_amount += 1;
-// FIXME: batching makes no sense anymore because of no layermap locking, should just spawn
-// tasks to evict all seen layers until we have evicted enough
 
-let batch = batched.entry(TimelineKey(candidate.timeline)).or_default();
 
-// semaphore will later be used to limit eviction concurrency, and we can express at
-// most u32 number of permits. unlikely we would have u32::MAX layers to be evicted,
-// but fail gracefully by not making batches larger.
-if batch.len() < u32::MAX as usize {
-batch.push(candidate.layer);
-max_batch_size = max_batch_size.max(batch.len());
-}
 }
 
 let usage_planned = match warned {
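The loop above is only the planning pass: it decides how many candidates will be evicted, while the actual evictions happen in phase 2 in the next hunk. A compressed, self-contained sketch of that planning idea; the `Usage` struct and its fields here are invented stand-ins for the crate's `Usage` trait, while `has_pressure`, `add_available_bytes` and `evicted_amount` mirror the names used in the hunks above:

    /// Invented stand-in for the Usage trait: tracks projected disk usage.
    #[derive(Clone, Copy, Debug)]
    struct Usage {
        used_bytes: u64,
        pressure_threshold_bytes: u64,
    }

    impl Usage {
        fn has_pressure(&self) -> bool {
            self.used_bytes > self.pressure_threshold_bytes
        }
        fn add_available_bytes(&mut self, freed: u64) {
            self.used_bytes = self.used_bytes.saturating_sub(freed);
        }
    }

    /// Walk candidates in eviction order, stopping once projected usage is back
    /// under the threshold; returns how many layers phase 2 should evict.
    fn plan_evictions(mut usage_planned: Usage, candidate_sizes: &[u64]) -> usize {
        let mut evicted_amount = 0;
        for size in candidate_sizes {
            if !usage_planned.has_pressure() {
                break;
            }
            usage_planned.add_available_bytes(*size);
            evicted_amount += 1;
        }
        evicted_amount
    }

    fn main() {
        let usage = Usage { used_bytes: 100, pressure_threshold_bytes: 80 };
        // 20 bytes over the threshold: the first two 15-byte layers are enough.
        assert_eq!(plan_evictions(usage, &[15, 15, 15]), 2);
    }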
@@ -372,100 +369,79 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
 };
 debug!(?usage_planned, "usage planned");
 
-// phase2: evict victims batched by timeline
+// phase2: evict layers
 
 let mut js = tokio::task::JoinSet::new();
+let limit = 1000;
 
-// ratelimit to 1k files or any higher max batch size
-let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));
+let mut evicted = candidates.into_iter().take(evicted_amount).fuse();
+let mut consumed_all = false;
 
-for (timeline, batch) in batched {
-let tenant_id = timeline.tenant_id;
-let timeline_id = timeline.timeline_id;
-let batch_size =
-u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
+// After the evictions, `usage_assumed` is the post-eviction usage,
+// according to internal accounting.
+let mut usage_assumed = usage_pre;
+let mut evictions_failed = LayerCount::default();
 
-// I dislike naming of `available_permits` but it means current total amount of permits
-// because permits can be added
-assert!(batch_size as usize <= limit.available_permits());
+let evict_layers = async move {
+loop {
+let next = if js.len() >= limit || consumed_all {
+js.join_next().await
+} else if !js.is_empty() {
+// opportunistically consume ready result, one per each new evicted
+futures::future::FutureExt::now_or_never(js.join_next()).and_then(|x| x)
+} else {
+None
+};
 
-debug!(%timeline_id, "evicting batch for timeline");
+if let Some(next) = next {
+match next {
-let evict = {
-let limit = limit.clone();
-let cancel = cancel.clone();
-async move {
-let mut evicted_bytes = 0;
-let mut evictions_failed = LayerCount::default();
+Ok(Ok(file_size)) => {
+usage_assumed.add_available_bytes(file_size);
 
-let Ok(_permit) = limit.acquire_many_owned(batch_size).await else {
-// semaphore closing means cancelled
-return (evicted_bytes, evictions_failed);
-};
 
-let results = timeline.evict_layers(&batch).await;
 
-match results {
-Ok(results) => {
-assert_eq!(results.len(), batch.len());
-for (result, layer) in results.into_iter().zip(batch.iter()) {
-let file_size = layer.layer_desc().file_size;
-match result {
-Some(Ok(())) => {
-evicted_bytes += file_size;
-}
-Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-evictions_failed.file_sizes += file_size;
-evictions_failed.count += 1;
-}
-None => {
-assert!(cancel.is_cancelled());
-}
-}
-}
 }
-Err(e) => {
-warn!("failed to evict batch: {:#}", e);
+Ok(Err((file_size, EvictionError::NotFound | EvictionError::Downloaded))) => {
+evictions_failed.file_sizes += file_size;
+evictions_failed.count += 1;
 }
+Err(je) if je.is_cancelled() => unreachable!("not used"),
+Err(je) if je.is_panic() => { /* already logged */ }
+Err(je) => tracing::error!("unknown JoinError: {je:?}"),
 }
-(evicted_bytes, evictions_failed)
 }
-}
-.instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
 
-js.spawn(evict);
+if consumed_all && js.is_empty() {
+break;
-// spwaning multiple thousands of these is essentially blocking, so give already spawned a
-// chance of making progress
-tokio::task::yield_now().await;
-}
 
-let join_all = async move {
-// After the evictions, `usage_assumed` is the post-eviction usage,
-// according to internal accounting.
-let mut usage_assumed = usage_pre;
-let mut evictions_failed = LayerCount::default();
 
-while let Some(res) = js.join_next().await {
-match res {
-Ok((evicted_bytes, failed)) => {
-usage_assumed.add_available_bytes(evicted_bytes);
-evictions_failed.file_sizes += failed.file_sizes;
-evictions_failed.count += failed.count;
-}
-Err(je) if je.is_cancelled() => unreachable!("not used"),
-Err(je) if je.is_panic() => { /* already logged */ }
-Err(je) => tracing::error!("unknown JoinError: {je:?}"),
 }
 
+// calling again when consumed_all is fine as evicted is fused.
+let Some((_partition, candidate)) = evicted.next() else {
+consumed_all = true;
+continue;
+};
 
+js.spawn(async move {
+let rtc = candidate.timeline.remote_client.as_ref().expect(
+"holding the witness, all timelines must have a remote timeline client",
+);
+let file_size = candidate.layer.layer_desc().file_size;
+candidate
+.layer
+.evict_and_wait(rtc)
+.await
+.map(|()| file_size)
+.map_err(|e| (file_size, e))
+});
 
+tokio::task::yield_now().await;
 }
 
 (usage_assumed, evictions_failed)
 };
 
 let (usage_assumed, evictions_failed) = tokio::select! {
-tuple = join_all => { tuple },
+tuple = evict_layers => { tuple },
 _ = cancel.cancelled() => {
-// close the semaphore to stop any pending acquires
-limit.close();
+// dropping joinset will abort all pending evict_and_waits and that is fine, our
+// requests will still stand
 return Ok(IterationOutcome::Cancelled);
 }
 };
@@ -572,7 +548,7 @@ async fn collect_eviction_candidates(
 continue;
 }
 let info = tl.get_local_layers_for_disk_usage_eviction().await;
-debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
 tenant_candidates.extend(
 info.resident_layers
 .into_iter()

@@ -84,7 +84,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 get:
 description: Get tenant status
 responses:
@@ -181,7 +180,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 get:
 description: Get timelines for tenant
 responses:
@@ -232,7 +230,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: timeline_id
 in: path
 required: true
@@ -338,7 +335,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: timeline_id
 in: path
 required: true
@@ -401,7 +397,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: timeline_id
 in: path
 required: true
@@ -469,7 +464,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: timeline_id
 in: path
 required: true
@@ -523,7 +517,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 post:
 description: |
 Schedules attach operation to happen in the background for the given tenant.
@@ -624,6 +617,98 @@ paths:
 $ref: "#/components/schemas/ServiceUnavailableError"
 
 
+/v1/tenant/{tenant_id}/location_config:
+parameters:
+- name: tenant_id
+in: path
+required: true
+schema:
+type: string
+- name: flush_ms
+in: query
+required: false
+schema:
+type: integer
+put:
+description: |
+Configures a _tenant location_, that is how a particular pageserver handles
+a particular tenant. This includes _attached_ tenants, i.e. those ingesting WAL
+and page service requests, and _secondary_ tenants, i.e. those which are just keeping
+a warm cache in anticipation of transitioning to attached state in the future.
+
+This is a declarative, idempotent API: there are not separate endpoints
+for different tenant location configurations. Rather, this single endpoint accepts
+a description of the desired location configuration, and makes whatever changes
+are required to reach that state.
+
+In imperative terms, this API is used to attach and detach tenants, and
+to transition tenants to and from secondary mode.
+
+This is a synchronous API: there is no 202 response. State transitions should always
+be fast (milliseconds), with the exception of requests setting `flush_ms`, in which case
+the caller controls the runtime of the request.
+
+In some state transitions, it makes sense to flush dirty data to remote storage: this includes transitions
+to AttachedStale and Detached. Flushing is never necessary for correctness, but is an
+important optimization when doing migrations. The `flush_ms` parameter controls whether
+flushing should be attempted, and how much time is allowed for flushing. If the time limit expires,
+the requested transition will continue without waiting for any outstanding data to flush. Callers
+should use a duration which is substantially less than their HTTP client's request
+timeout. It is safe to supply flush_ms irrespective of the request body: in state transitions
+where flushing doesn't make sense, the server will ignore it.
+
+It is safe to retry requests, but if one receives a 409 or 503 response, it is not
+useful to retry aggressively: there is probably an existing request still ongoing.
+requestBody:
+required: false
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/TenantLocationConfigRequest"
+responses:
+"200":
+description: Tenant is now in requested state
+"503":
+description: Tenant's state cannot be changed right now. Wait a few seconds and retry.
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/Error"
+"401":
+description: Unauthorized Error
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/UnauthorizedError"
+"403":
+description: Forbidden Error
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/ForbiddenError"
+"409":
+description: |
+The tenant is already known to Pageserver in some way,
+and hence this `/attach` call has been rejected.
+
+Some examples of how this can happen:
+- tenant was created on this pageserver
+- tenant attachment was started by an earlier call to `/attach`.
+
+Callers should poll the tenant status's `attachment_status` field,
+like for status 202. See the longer description for `POST /attach`
+for details.
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/ConflictError"
+"500":
+description: Generic operation error
+content:
+application/json:
+schema:
+$ref: "#/components/schemas/Error"
+
 /v1/tenant/{tenant_id}/detach:
 parameters:
 - name: tenant_id
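As a concrete illustration of the request bodies this endpoint accepts, a hypothetical caller might build something like the following; all values are invented, the field names come from the TenantLocationConfigRequest and SecondaryConfig schemas added further down in this patch, and serde_json is used only because that crate already appears elsewhere in the patch:

    use serde_json::json;

    fn main() {
        // Hypothetical attach request: generation is mandatory for attached modes.
        let attach_body = json!({
            "tenant_id": "ad6c1a56f5680419d3a16ff55d97ec3c",
            "mode": "AttachedSingle",
            "generation": 123,
        });

        // Hypothetical secondary request: no generation, just a warm-cache hint.
        let secondary_body = json!({
            "tenant_id": "ad6c1a56f5680419d3a16ff55d97ec3c",
            "mode": "Secondary",
            "secondary_conf": { "warm": true },
        });

        // Either body would be PUT to /v1/tenant/{tenant_id}/location_config,
        // optionally with ?flush_ms=... when flushing before the transition matters.
        println!("{attach_body}\n{secondary_body}");
    }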
@@ -631,7 +716,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: detach_ignored
 in: query
 required: false
@@ -691,7 +775,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 post:
 description: |
 Remove tenant data (including all corresponding timelines) from pageserver's memory.
@@ -740,7 +823,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 post:
 description: |
 Schedules an operation that attempts to load a tenant from the local disk and
@@ -797,7 +879,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 get:
 description: |
 Calculate tenant's synthetic size
@@ -840,7 +921,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 - name: inputs_only
 in: query
 required: false
@@ -910,7 +990,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 post:
 description: |
 Create a timeline. Returns new timeline id on success.\
@@ -935,6 +1014,9 @@ paths:
 format: hex
 pg_version:
 type: integer
+existing_initdb_timeline_id:
+type: string
+format: hex
 responses:
 "201":
 description: TimelineInfo
@@ -1041,7 +1123,6 @@ paths:
 application/json:
 schema:
 type: string
-format: hex
 "400":
 description: Malformed tenant create request
 content:
@@ -1138,7 +1219,6 @@ paths:
 required: true
 schema:
 type: string
-format: hex
 get:
 description: |
 Returns tenant's config description: specific config overrides a tenant has
@@ -1244,7 +1324,6 @@ components:
 properties:
 new_tenant_id:
 type: string
-format: hex
 generation:
 type: integer
 description: Attachment generation number.
@@ -1273,7 +1352,30 @@ components:
 properties:
 tenant_id:
 type: string
-format: hex
+TenantLocationConfigRequest:
+type: object
+required:
+- tenant_id
+properties:
+tenant_id:
+type: string
+mode:
+type: string
+enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
+description: Mode of functionality that this pageserver will run in for this tenant.
+generation:
+type: integer
+description: Attachment generation number, mandatory when `mode` is an attached state
+secondary_conf:
+$ref: '#/components/schemas/SecondaryConfig'
+tenant_conf:
+$ref: '#/components/schemas/TenantConfig'
+SecondaryConfig:
+type: object
+properties:
+warm:
+type: boolean
+description: Whether to poll remote storage for layers to download. If false, secondary locations don't download anything.
 TenantConfig:
 type: object
 properties:
@@ -1325,7 +1427,6 @@ components:
 format: hex
 tenant_id:
 type: string
-format: hex
 last_record_lsn:
 type: string
 format: hex

@@ -4,8 +4,10 @@
|
|||||||
use std::collections::HashMap;
|
use std::collections::HashMap;
|
||||||
use std::str::FromStr;
|
use std::str::FromStr;
|
||||||
use std::sync::Arc;
|
use std::sync::Arc;
|
||||||
|
use std::time::Duration;
|
||||||
|
|
||||||
use anyhow::{anyhow, Context, Result};
|
use anyhow::{anyhow, Context, Result};
|
||||||
|
use enumset::EnumSet;
|
||||||
use futures::TryFutureExt;
|
use futures::TryFutureExt;
|
||||||
use humantime::format_rfc3339;
|
use humantime::format_rfc3339;
|
||||||
use hyper::header;
|
use hyper::header;
|
||||||
@@ -42,6 +44,7 @@ use crate::tenant::mgr::{
|
|||||||
};
|
};
|
||||||
use crate::tenant::size::ModelInputs;
|
use crate::tenant::size::ModelInputs;
|
||||||
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
use crate::tenant::storage_layer::LayerAccessStatsReset;
|
||||||
|
use crate::tenant::timeline::CompactFlags;
|
||||||
use crate::tenant::timeline::Timeline;
|
use crate::tenant::timeline::Timeline;
|
||||||
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, TenantSharedResources};
|
||||||
use crate::{config::PageServerConf, tenant::mgr};
|
use crate::{config::PageServerConf, tenant::mgr};
|
||||||
@@ -133,11 +136,6 @@ impl From<PageReconstructError> for ApiError {
|
|||||||
fn from(pre: PageReconstructError) -> ApiError {
|
fn from(pre: PageReconstructError) -> ApiError {
|
||||||
match pre {
|
match pre {
|
||||||
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
|
PageReconstructError::Other(pre) => ApiError::InternalServerError(pre),
|
||||||
PageReconstructError::NeedsDownload(_, _) => {
|
|
||||||
// This shouldn't happen, because we use a RequestContext that requests to
|
|
||||||
// download any missing layer files on-demand.
|
|
||||||
ApiError::InternalServerError(anyhow::anyhow!("need to download remote layer file"))
|
|
||||||
}
|
|
||||||
PageReconstructError::Cancelled => {
|
PageReconstructError::Cancelled => {
|
||||||
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
|
||||||
}
|
}
|
||||||
@@ -316,6 +314,7 @@ async fn build_timeline_info_common(
|
|||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<TimelineInfo> {
|
) -> anyhow::Result<TimelineInfo> {
|
||||||
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
|
||||||
|
let initdb_lsn = timeline.initdb_lsn;
|
||||||
let last_record_lsn = timeline.get_last_record_lsn();
|
let last_record_lsn = timeline.get_last_record_lsn();
|
||||||
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
|
||||||
let guard = timeline.last_received_wal.lock().unwrap();
|
let guard = timeline.last_received_wal.lock().unwrap();
|
||||||
@@ -335,13 +334,8 @@ async fn build_timeline_info_common(
|
|||||||
Lsn(0) => None,
|
Lsn(0) => None,
|
||||||
lsn @ Lsn(_) => Some(lsn),
|
lsn @ Lsn(_) => Some(lsn),
|
||||||
};
|
};
|
||||||
let current_logical_size = match timeline.get_current_logical_size(ctx) {
|
let current_logical_size =
|
||||||
Ok((size, _)) => Some(size),
|
timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
|
||||||
Err(err) => {
|
|
||||||
error!("Timeline info creation failed to get current logical size: {err:?}");
|
|
||||||
None
|
|
||||||
}
|
|
||||||
};
|
|
||||||
let current_physical_size = Some(timeline.layer_size_sum().await);
|
let current_physical_size = Some(timeline.layer_size_sum().await);
|
||||||
let state = timeline.current_state();
|
let state = timeline.current_state();
|
||||||
let remote_consistent_lsn_projected = timeline
|
let remote_consistent_lsn_projected = timeline
|
||||||
@@ -354,17 +348,22 @@ async fn build_timeline_info_common(
|
|||||||
let walreceiver_status = timeline.walreceiver_status();
|
let walreceiver_status = timeline.walreceiver_status();
|
||||||
|
|
||||||
let info = TimelineInfo {
|
let info = TimelineInfo {
|
||||||
tenant_id: timeline.tenant_id,
|
tenant_id: timeline.tenant_shard_id,
|
||||||
timeline_id: timeline.timeline_id,
|
timeline_id: timeline.timeline_id,
|
||||||
ancestor_timeline_id,
|
ancestor_timeline_id,
|
||||||
ancestor_lsn,
|
ancestor_lsn,
|
||||||
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
disk_consistent_lsn: timeline.get_disk_consistent_lsn(),
|
||||||
remote_consistent_lsn: remote_consistent_lsn_projected,
|
remote_consistent_lsn: remote_consistent_lsn_projected,
|
||||||
remote_consistent_lsn_visible,
|
remote_consistent_lsn_visible,
|
||||||
|
initdb_lsn,
|
||||||
last_record_lsn,
|
last_record_lsn,
|
||||||
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
prev_record_lsn: Some(timeline.get_prev_record_lsn()),
|
||||||
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
|
||||||
current_logical_size,
|
current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
|
||||||
|
current_logical_size_is_accurate: match current_logical_size.accuracy() {
|
||||||
|
tenant::timeline::logical_size::Accuracy::Approximate => false,
|
||||||
|
tenant::timeline::logical_size::Accuracy::Exact => true,
|
||||||
|
},
|
||||||
current_physical_size,
|
current_physical_size,
|
||||||
current_logical_size_non_incremental: None,
|
current_logical_size_non_incremental: None,
|
||||||
timeline_dir_layer_file_size_sum: None,
|
timeline_dir_layer_file_size_sum: None,
|
||||||
@@ -437,6 +436,7 @@ async fn timeline_create_handler(
|
|||||||
request_data.ancestor_timeline_id.map(TimelineId::from),
|
request_data.ancestor_timeline_id.map(TimelineId::from),
|
||||||
request_data.ancestor_start_lsn,
|
request_data.ancestor_start_lsn,
|
||||||
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
|
||||||
|
request_data.existing_initdb_timeline_id,
|
||||||
state.broker_client.clone(),
|
state.broker_client.clone(),
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
@@ -476,15 +476,15 @@ async fn timeline_list_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let include_non_incremental_logical_size: Option<bool> =
|
let include_non_incremental_logical_size: Option<bool> =
|
||||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let response_data = async {
|
let response_data = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||||
let timelines = tenant.list_timelines();
|
let timelines = tenant.list_timelines();
|
||||||
|
|
||||||
let mut response_data = Vec::with_capacity(timelines.len());
|
let mut response_data = Vec::with_capacity(timelines.len());
|
||||||
@@ -503,7 +503,9 @@ async fn timeline_list_handler(
|
|||||||
}
|
}
|
||||||
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
|
Ok::<Vec<TimelineInfo>, ApiError>(response_data)
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_list", %tenant_id))
|
.instrument(info_span!("timeline_list",
|
||||||
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
|
shard_id = %tenant_shard_id.shard_slug()))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, response_data)
|
json_response(StatusCode::OK, response_data)
|
||||||
@@ -513,17 +515,17 @@ async fn timeline_detail_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let include_non_incremental_logical_size: Option<bool> =
|
let include_non_incremental_logical_size: Option<bool> =
|
||||||
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
parse_query_param(&request, "include-non-incremental-logical-size")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
// Logical size calculation needs downloading.
|
// Logical size calculation needs downloading.
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
|
|
||||||
let timeline_info = async {
|
let timeline_info = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||||
|
|
||||||
let timeline = tenant
|
let timeline = tenant
|
||||||
.get_timeline(timeline_id, false)
|
.get_timeline(timeline_id, false)
|
||||||
@@ -540,7 +542,10 @@ async fn timeline_detail_handler(
|
|||||||
|
|
||||||
Ok::<_, ApiError>(timeline_info)
|
Ok::<_, ApiError>(timeline_info)
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_detail", %tenant_id, %timeline_id))
|
.instrument(info_span!("timeline_detail",
|
||||||
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
|
shard_id = %tenant_shard_id.shard_slug(),
|
||||||
|
%timeline_id))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, timeline_info)
|
json_response(StatusCode::OK, timeline_info)
|
||||||
@@ -548,10 +553,17 @@ async fn timeline_detail_handler(
|
|||||||
|
|
||||||
async fn get_lsn_by_timestamp_handler(
|
async fn get_lsn_by_timestamp_handler(
|
||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
if !tenant_shard_id.is_zero() {
|
||||||
|
// Requires SLRU contents, which are only stored on shard zero
|
||||||
|
return Err(ApiError::BadRequest(anyhow!(
|
||||||
|
"Size calculations are only available on shard zero"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
let version: Option<u8> = parse_query_param(&request, "version")?;
|
let version: Option<u8> = parse_query_param(&request, "version")?;
|
||||||
|
|
||||||
@@ -563,8 +575,10 @@ async fn get_lsn_by_timestamp_handler(
|
|||||||
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
let timestamp_pg = postgres_ffi::to_pg_timestamp(timestamp);
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let result = timeline.find_lsn_for_timestamp(timestamp_pg, &ctx).await?;
|
let result = timeline
|
||||||
|
.find_lsn_for_timestamp(timestamp_pg, &cancel, &ctx)
|
||||||
|
.await?;
|
||||||
|
|
||||||
if version.unwrap_or(0) > 1 {
|
if version.unwrap_or(0) > 1 {
|
||||||
#[derive(serde::Serialize)]
|
#[derive(serde::Serialize)]
|
||||||
@@ -596,8 +610,15 @@ async fn get_timestamp_of_lsn_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
if !tenant_shard_id.is_zero() {
|
||||||
|
// Requires SLRU contents, which are only stored on shard zero
|
||||||
|
return Err(ApiError::BadRequest(anyhow!(
|
||||||
|
"Size calculations are only available on shard zero"
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
|
|
||||||
@@ -607,7 +628,7 @@ async fn get_timestamp_of_lsn_handler(
|
|||||||
.map_err(ApiError::BadRequest)?;
|
.map_err(ApiError::BadRequest)?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
let result = timeline.get_timestamp_for_lsn(lsn, &ctx).await?;
|
||||||
|
|
||||||
match result {
|
match result {
|
||||||
@@ -703,6 +724,26 @@ async fn tenant_detach_handler(
|
|||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async fn tenant_reset_handler(
|
||||||
|
request: Request<Body>,
|
||||||
|
_cancel: CancellationToken,
|
||||||
|
) -> Result<Response<Body>, ApiError> {
|
||||||
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
|
||||||
|
|
||||||
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
|
let state = get_state(&request);
|
||||||
|
state
|
||||||
|
.tenant_manager
|
||||||
|
.reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
|
||||||
|
.await
|
||||||
|
.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
|
json_response(StatusCode::OK, ())
|
||||||
|
}
|
||||||
|
|
||||||
async fn tenant_load_handler(
|
async fn tenant_load_handler(
|
||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
@@ -779,11 +820,11 @@ async fn tenant_status(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let tenant_info = async {
|
let tenant_info = async {
|
||||||
let tenant = mgr::get_tenant(tenant_id, false)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||||
|
|
||||||
// Calculate total physical size of all timelines
|
// Calculate total physical size of all timelines
|
||||||
let mut current_physical_size = 0;
|
let mut current_physical_size = 0;
|
||||||
@@ -793,13 +834,15 @@ async fn tenant_status(
|
|||||||
|
|
||||||
let state = tenant.current_state();
|
let state = tenant.current_state();
|
||||||
Result::<_, ApiError>::Ok(TenantInfo {
|
Result::<_, ApiError>::Ok(TenantInfo {
|
||||||
id: tenant_id,
|
id: tenant_shard_id,
|
||||||
state: state.clone(),
|
state: state.clone(),
|
||||||
current_physical_size: Some(current_physical_size),
|
current_physical_size: Some(current_physical_size),
|
||||||
attachment_status: state.attachment_status(),
|
attachment_status: state.attachment_status(),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
.instrument(info_span!("tenant_status_handler", %tenant_id))
|
.instrument(info_span!("tenant_status_handler",
|
||||||
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
|
shard_id = %tenant_shard_id.shard_slug()))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, tenant_info)
|
json_response(StatusCode::OK, tenant_info)
|
||||||
@@ -818,7 +861,7 @@ async fn tenant_delete_handler(
|
|||||||
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
|
mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
|
||||||
.instrument(info_span!("tenant_delete_handler",
|
.instrument(info_span!("tenant_delete_handler",
|
||||||
tenant_id = %tenant_shard_id.tenant_id,
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
shard = tenant_shard_id.shard_slug()
|
shard = %tenant_shard_id.shard_slug()
|
||||||
))
|
))
|
||||||
.await?;
|
.await?;
|
||||||
|
|
||||||
@@ -840,22 +883,29 @@ async fn tenant_delete_handler(
|
|||||||
/// without modifying anything anyway.
|
/// without modifying anything anyway.
|
||||||
async fn tenant_size_handler(
|
async fn tenant_size_handler(
|
||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
|
||||||
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
|
||||||
let headers = request.headers();
|
let headers = request.headers();
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||||
|
|
||||||
|
if !tenant_shard_id.is_zero() {
|
||||||
|
return Err(ApiError::BadRequest(anyhow!(
|
||||||
|
"Size calculations are only available on shard zero"
|
||||||
|
)));
|
||||||
|
}
|
||||||
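For readers unfamiliar with the sharding types: `is_zero()` is expected to report whether this is shard zero of the tenant, the only shard allowed to run tenant-wide size calculations. A minimal sketch of such a helper, assuming the shard identity carries a numeric shard index (the real `TenantShardId` lives in `pageserver_api` and is only approximated here):

// Sketch only: a simplified stand-in for the shard identity used above,
// assuming it exposes a numeric shard index. Size calculations aggregate
// tenant-wide data, so the handler accepts them only on shard zero.
#[derive(Clone, Copy, Debug)]
struct ShardIndex {
    shard_number: u8,
    shard_count: u8,
}

impl ShardIndex {
    /// True for the zeroth shard, which owns tenant-wide operations
    /// such as synthetic size calculation.
    fn is_zero(&self) -> bool {
        self.shard_number == 0
    }
}

fn main() {
    let shard = ShardIndex { shard_number: 0, shard_count: 4 };
    println!("shard {} of {}", shard.shard_number, shard.shard_count);
    assert!(shard.is_zero());
}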
|
|
||||||
// this can be a long operation
|
// this can be a long operation
|
||||||
let inputs = tenant
|
let inputs = tenant
|
||||||
.gather_size_inputs(
|
.gather_size_inputs(
|
||||||
retention_period,
|
retention_period,
|
||||||
LogicalSizeCalculationCause::TenantSizeHandler,
|
LogicalSizeCalculationCause::TenantSizeHandler,
|
||||||
|
&cancel,
|
||||||
&ctx,
|
&ctx,
|
||||||
)
|
)
|
||||||
.await
|
.await
|
||||||
@@ -900,7 +950,7 @@ async fn tenant_size_handler(
|
|||||||
json_response(
|
json_response(
|
||||||
StatusCode::OK,
|
StatusCode::OK,
|
||||||
TenantHistorySize {
|
TenantHistorySize {
|
||||||
id: tenant_id,
|
id: tenant_shard_id.tenant_id,
|
||||||
size: sizes.as_ref().map(|x| x.total_size),
|
size: sizes.as_ref().map(|x| x.total_size),
|
||||||
segment_sizes: sizes.map(|x| x.segments),
|
segment_sizes: sizes.map(|x| x.segments),
|
||||||
inputs,
|
inputs,
|
||||||
@@ -912,14 +962,14 @@ async fn layer_map_info_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let reset: LayerAccessStatsReset =
|
let reset: LayerAccessStatsReset =
|
||||||
parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
|
parse_query_param(&request, "reset")?.unwrap_or(LayerAccessStatsReset::NoReset);
|
||||||
|
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let layer_map_info = timeline.layer_map_info(reset).await;
|
let layer_map_info = timeline.layer_map_info(reset).await;
|
||||||
|
|
||||||
json_response(StatusCode::OK, layer_map_info)
|
json_response(StatusCode::OK, layer_map_info)
|
||||||
@@ -929,13 +979,12 @@ async fn layer_download_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let downloaded = timeline
|
let downloaded = timeline
|
||||||
.download_layer(layer_file_name)
|
.download_layer(layer_file_name)
|
||||||
.await
|
.await
|
||||||
@@ -946,7 +995,7 @@ async fn layer_download_handler(
|
|||||||
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
|
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
|
||||||
None => json_response(
|
None => json_response(
|
||||||
StatusCode::BAD_REQUEST,
|
StatusCode::BAD_REQUEST,
|
||||||
format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
|
format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -955,12 +1004,12 @@ async fn evict_timeline_layer_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
let layer_file_name = get_request_param(&request, "layer_file_name")?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let evicted = timeline
|
let evicted = timeline
|
||||||
.evict_layer(layer_file_name)
|
.evict_layer(layer_file_name)
|
||||||
.await
|
.await
|
||||||
@@ -971,7 +1020,7 @@ async fn evict_timeline_layer_handler(
|
|||||||
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
|
Some(false) => json_response(StatusCode::NOT_MODIFIED, ()),
|
||||||
None => json_response(
|
None => json_response(
|
||||||
StatusCode::BAD_REQUEST,
|
StatusCode::BAD_REQUEST,
|
||||||
format!("Layer {tenant_id}/{timeline_id}/{layer_file_name} not found"),
|
format!("Layer {tenant_shard_id}/{timeline_id}/{layer_file_name} not found"),
|
||||||
),
|
),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1103,10 +1152,10 @@ async fn get_tenant_config_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let tenant = mgr::get_tenant(tenant_id, false)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, false)?;
|
||||||
|
|
||||||
let response = HashMap::from([
|
let response = HashMap::from([
|
||||||
(
|
(
|
||||||
@@ -1152,6 +1201,7 @@ async fn put_tenant_location_config_handler(
|
|||||||
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
|
|
||||||
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
|
let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
|
||||||
|
let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
|
||||||
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
|
||||||
@@ -1165,7 +1215,7 @@ async fn put_tenant_location_config_handler(
|
|||||||
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
|
||||||
.instrument(info_span!("tenant_detach",
|
.instrument(info_span!("tenant_detach",
|
||||||
tenant_id = %tenant_shard_id.tenant_id,
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
shard = tenant_shard_id.shard_slug()
|
shard = %tenant_shard_id.shard_slug()
|
||||||
))
|
))
|
||||||
.await
|
.await
|
||||||
{
|
{
|
||||||
@@ -1184,7 +1234,7 @@ async fn put_tenant_location_config_handler(
|
|||||||
|
|
||||||
state
|
state
|
||||||
.tenant_manager
|
.tenant_manager
|
||||||
.upsert_location(tenant_shard_id, location_conf, &ctx)
|
.upsert_location(tenant_shard_id, location_conf, flush, &ctx)
|
||||||
.await
|
.await
|
||||||
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
// TODO: badrequest assumes the caller was asking for something unreasonable, but in
|
||||||
// principle we might have hit something like concurrent API calls to the same tenant,
|
// principle we might have hit something like concurrent API calls to the same tenant,
|
||||||
@@ -1199,9 +1249,9 @@ async fn handle_tenant_break(
|
|||||||
r: Request<Body>,
|
r: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&r, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&r, "tenant_shard_id")?;
|
||||||
|
|
||||||
let tenant = crate::tenant::mgr::get_tenant(tenant_id, true)
|
let tenant = crate::tenant::mgr::get_tenant(tenant_shard_id, true)
|
||||||
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
.map_err(|_| ApiError::Conflict(String::from("no active tenant found")))?;
|
||||||
|
|
||||||
tenant.set_broken("broken from test".to_owned()).await;
|
tenant.set_broken("broken from test".to_owned()).await;
|
||||||
@@ -1240,16 +1290,17 @@ async fn failpoints_handler(
|
|||||||
// Run GC immediately on given timeline.
|
// Run GC immediately on given timeline.
|
||||||
async fn timeline_gc_handler(
|
async fn timeline_gc_handler(
|
||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
let gc_req: TimelineGcRequest = json_request(&mut request).await?;
|
||||||
|
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let wait_task_done = mgr::immediate_gc(tenant_id, timeline_id, gc_req, &ctx).await?;
|
let wait_task_done =
|
||||||
|
mgr::immediate_gc(tenant_shard_id, timeline_id, gc_req, cancel, &ctx).await?;
|
||||||
let gc_result = wait_task_done
|
let gc_result = wait_task_done
|
||||||
.await
|
.await
|
||||||
.context("wait for gc task")
|
.context("wait for gc task")
|
||||||
@@ -1264,20 +1315,24 @@ async fn timeline_compact_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
let mut flags = EnumSet::empty();
|
||||||
|
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
|
||||||
|
flags |= CompactFlags::ForceRepartition;
|
||||||
|
}
|
||||||
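The flag handling added above relies on the `enumset` crate. A reduced, self-contained sketch of the same pattern, assuming a hypothetical flag enum that derives `EnumSetType` in place of the pageserver's `CompactFlags`:

// Sketch only: building a set of compaction flags from an optional query
// parameter, mirroring the handler above. Assumes the `enumset` crate and a
// hypothetical Flag enum standing in for CompactFlags.
use enumset::{EnumSet, EnumSetType};

#[derive(EnumSetType, Debug)]
enum Flag {
    ForceRepartition,
}

fn flags_from_query(force_repartition: Option<bool>) -> EnumSet<Flag> {
    let mut flags = EnumSet::empty();
    if force_repartition == Some(true) {
        flags |= Flag::ForceRepartition;
    }
    flags
}

fn main() {
    assert!(flags_from_query(Some(true)).contains(Flag::ForceRepartition));
    assert!(flags_from_query(None).is_empty());
}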
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
timeline
|
timeline
|
||||||
.compact(&cancel, &ctx)
|
.compact(&cancel, flags, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
.instrument(info_span!("manual_compaction", %tenant_id, %timeline_id))
|
.instrument(info_span!("manual_compaction", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1286,24 +1341,29 @@ async fn timeline_checkpoint_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
|
let mut flags = EnumSet::empty();
|
||||||
|
if Some(true) == parse_query_param::<_, bool>(&request, "force_repartition")? {
|
||||||
|
flags |= CompactFlags::ForceRepartition;
|
||||||
|
}
|
||||||
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
timeline
|
timeline
|
||||||
.freeze_and_flush()
|
.freeze_and_flush()
|
||||||
.await
|
.await
|
||||||
.map_err(ApiError::InternalServerError)?;
|
.map_err(ApiError::InternalServerError)?;
|
||||||
timeline
|
timeline
|
||||||
.compact(&cancel, &ctx)
|
.compact(&cancel, flags, &ctx)
|
||||||
.await
|
.await
|
||||||
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
.map_err(|e| ApiError::InternalServerError(e.into()))?;
|
||||||
|
|
||||||
json_response(StatusCode::OK, ())
|
json_response(StatusCode::OK, ())
|
||||||
}
|
}
|
||||||
.instrument(info_span!("manual_checkpoint", %tenant_id, %timeline_id))
|
.instrument(info_span!("manual_checkpoint", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1311,12 +1371,12 @@ async fn timeline_download_remote_layers_handler_post(
|
|||||||
mut request: Request<Body>,
|
mut request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
|
let body: DownloadRemoteLayersTaskSpawnRequest = json_request(&mut request).await?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
match timeline.spawn_download_all_remote_layers(body).await {
|
match timeline.spawn_download_all_remote_layers(body).await {
|
||||||
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
Ok(st) => json_response(StatusCode::ACCEPTED, st),
|
||||||
Err(st) => json_response(StatusCode::CONFLICT, st),
|
Err(st) => json_response(StatusCode::CONFLICT, st),
|
||||||
@@ -1327,11 +1387,11 @@ async fn timeline_download_remote_layers_handler_get(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
|
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let info = timeline
|
let info = timeline
|
||||||
.get_download_all_remote_layers_task_info()
|
.get_download_all_remote_layers_task_info()
|
||||||
.context("task never started since last pageserver process start")
|
.context("task never started since last pageserver process start")
|
||||||
@@ -1377,9 +1437,9 @@ async fn getpage_at_lsn_handler(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
struct Key(crate::repository::Key);
|
struct Key(crate::repository::Key);
|
||||||
|
|
||||||
@@ -1398,7 +1458,7 @@ async fn getpage_at_lsn_handler(
|
|||||||
|
|
||||||
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
|
|
||||||
let page = timeline.get(key.0, lsn, &ctx).await?;
|
let page = timeline.get(key.0, lsn, &ctx).await?;
|
||||||
|
|
||||||
@@ -1410,7 +1470,7 @@ async fn getpage_at_lsn_handler(
|
|||||||
.unwrap(),
|
.unwrap(),
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_get", %tenant_id, %timeline_id))
|
.instrument(info_span!("timeline_get", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1418,9 +1478,9 @@ async fn timeline_collect_keyspace(
|
|||||||
request: Request<Body>,
|
request: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
_cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
|
let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
|
||||||
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
|
||||||
check_permission(&request, Some(tenant_id))?;
|
check_permission(&request, Some(tenant_shard_id.tenant_id))?;
|
||||||
|
|
||||||
struct Partitioning {
|
struct Partitioning {
|
||||||
keys: crate::keyspace::KeySpace,
|
keys: crate::keyspace::KeySpace,
|
||||||
@@ -1489,7 +1549,7 @@ async fn timeline_collect_keyspace(
|
|||||||
|
|
||||||
async {
|
async {
|
||||||
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
|
||||||
let timeline = active_timeline_of_active_tenant(tenant_id, timeline_id).await?;
|
let timeline = active_timeline_of_active_tenant(tenant_shard_id, timeline_id).await?;
|
||||||
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
let at_lsn = at_lsn.unwrap_or_else(|| timeline.get_last_record_lsn());
|
||||||
let keys = timeline
|
let keys = timeline
|
||||||
.collect_keyspace(at_lsn, &ctx)
|
.collect_keyspace(at_lsn, &ctx)
|
||||||
@@ -1498,15 +1558,15 @@ async fn timeline_collect_keyspace(
|
|||||||
|
|
||||||
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
json_response(StatusCode::OK, Partitioning { keys, at_lsn })
|
||||||
}
|
}
|
||||||
.instrument(info_span!("timeline_collect_keyspace", %tenant_id, %timeline_id))
|
.instrument(info_span!("timeline_collect_keyspace", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), %timeline_id))
|
||||||
.await
|
.await
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn active_timeline_of_active_tenant(
|
async fn active_timeline_of_active_tenant(
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
) -> Result<Arc<Timeline>, ApiError> {
|
) -> Result<Arc<Timeline>, ApiError> {
|
||||||
let tenant = mgr::get_tenant(tenant_id, true)?;
|
let tenant = mgr::get_tenant(tenant_shard_id, true)?;
|
||||||
tenant
|
tenant
|
||||||
.get_timeline(timeline_id, true)
|
.get_timeline(timeline_id, true)
|
||||||
.map_err(|e| ApiError::NotFound(e.into()))
|
.map_err(|e| ApiError::NotFound(e.into()))
|
||||||
@@ -1528,7 +1588,7 @@ async fn always_panic_handler(
|
|||||||
|
|
||||||
async fn disk_usage_eviction_run(
|
async fn disk_usage_eviction_run(
|
||||||
mut r: Request<Body>,
|
mut r: Request<Body>,
|
||||||
_cancel: CancellationToken,
|
cancel: CancellationToken,
|
||||||
) -> Result<Response<Body>, ApiError> {
|
) -> Result<Response<Body>, ApiError> {
|
||||||
check_permission(&r, None)?;
|
check_permission(&r, None)?;
|
||||||
|
|
||||||
@@ -1565,48 +1625,26 @@ async fn disk_usage_eviction_run(
|
|||||||
freed_bytes: 0,
|
freed_bytes: 0,
|
||||||
};
|
};
|
||||||
|
|
||||||
let (tx, rx) = tokio::sync::oneshot::channel();
|
|
||||||
|
|
||||||
let state = get_state(&r);
|
let state = get_state(&r);
|
||||||
|
|
||||||
if state.remote_storage.as_ref().is_none() {
|
let Some(storage) = state.remote_storage.as_ref() else {
|
||||||
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
return Err(ApiError::InternalServerError(anyhow::anyhow!(
|
||||||
"remote storage not configured, cannot run eviction iteration"
|
"remote storage not configured, cannot run eviction iteration"
|
||||||
)));
|
)));
|
||||||
}
|
};
|
||||||
|
|
||||||
let state = state.disk_usage_eviction_state.clone();
|
let state = state.disk_usage_eviction_state.clone();
|
||||||
|
|
||||||
let cancel = CancellationToken::new();
|
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
||||||
let child_cancel = cancel.clone();
|
&state, storage, usage, &cancel,
|
||||||
let _g = cancel.drop_guard();
|
)
|
||||||
|
.await;
|
||||||
|
|
||||||
crate::task_mgr::spawn(
|
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
|
||||||
crate::task_mgr::BACKGROUND_RUNTIME.handle(),
|
|
||||||
TaskKind::DiskUsageEviction,
|
|
||||||
None,
|
|
||||||
None,
|
|
||||||
"ondemand disk usage eviction",
|
|
||||||
false,
|
|
||||||
async move {
|
|
||||||
let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
|
|
||||||
&state,
|
|
||||||
usage,
|
|
||||||
&child_cancel,
|
|
||||||
)
|
|
||||||
.await;
|
|
||||||
|
|
||||||
info!(?res, "disk_usage_eviction_task_iteration_impl finished");
|
let res = res.map_err(ApiError::InternalServerError)?;
|
||||||
|
|
||||||
let _ = tx.send(res);
|
json_response(StatusCode::OK, res)
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
.in_current_span(),
|
|
||||||
);
|
|
||||||
|
|
||||||
let response = rx.await.unwrap().map_err(ApiError::InternalServerError)?;
|
|
||||||
|
|
||||||
json_response(StatusCode::OK, response)
|
|
||||||
}
|
}
|
||||||
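For contrast with the rewrite above, this is roughly the shape of the pattern that was removed: spawn a worker task and shuttle its result back over a oneshot channel. Names and the Ok(42) payload are hypothetical; the point is that the handler blocked on the receiver anyway, so awaiting the iteration inline with the request's cancellation token does the same job with less machinery.

// Sketch only: the rough shape of the removed spawn-plus-oneshot pattern.
use tokio::sync::oneshot;

async fn run_spawned() -> Result<u64, String> {
    let (tx, rx) = oneshot::channel();

    tokio::spawn(async move {
        // Stand-in for disk_usage_eviction_task_iteration_impl(...).await
        let res: Result<u64, String> = Ok(42);
        let _ = tx.send(res);
    });

    // The handler waits for the result here anyway, which is why awaiting the
    // work directly (as in the new code) behaves the same.
    rx.await.expect("worker dropped the sender")
}

#[tokio::main]
async fn main() {
    assert_eq!(run_spawned().await, Ok(42));
}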
|
|
||||||
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
|
||||||
@@ -1675,8 +1713,24 @@ where
|
|||||||
let token_cloned = token.clone();
|
let token_cloned = token.clone();
|
||||||
let result = handler(r, token).await;
|
let result = handler(r, token).await;
|
||||||
if token_cloned.is_cancelled() {
|
if token_cloned.is_cancelled() {
|
||||||
info!("Cancelled request finished");
|
// dropguard has executed: we will never turn this result into a response.
|
||||||
|
//
|
||||||
|
// at least temporarily do {:?} logging; these failures are rare enough but
|
||||||
|
// could hide difficult errors.
|
||||||
|
match &result {
|
||||||
|
Ok(response) => {
|
||||||
|
let status = response.status();
|
||||||
|
info!(%status, "Cancelled request finished successfully")
|
||||||
|
}
|
||||||
|
Err(e) => error!("Cancelled request finished with an error: {e:?}"),
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
// the only logging for cancelled panicked request handlers is the tracing_panic_hook,
|
||||||
|
// which should suffice.
|
||||||
|
//
|
||||||
|
// there is still a chance to lose the result due to race between
|
||||||
|
// returning from here and the actual connection closing happening
|
||||||
|
// before outer task gets to execute. leaving that up for #5815.
|
||||||
result
|
result
|
||||||
}
|
}
|
||||||
.in_current_span(),
|
.in_current_span(),
|
||||||
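A reduced sketch of the wrapper pattern shown in this hunk, with hypothetical handler and error types: the token is cloned before being moved into the handler so that, once the handler returns, the wrapper can still tell whether the client went away and log the result that will never become a response.

// Sketch only: cancellation-aware result logging around a request handler.
use tokio_util::sync::CancellationToken;

async fn run_handler<F, Fut>(handler: F, token: CancellationToken) -> Result<u16, String>
where
    F: FnOnce(CancellationToken) -> Fut,
    Fut: std::future::Future<Output = Result<u16, String>>,
{
    let token_cloned = token.clone();
    let result = handler(token).await;
    if token_cloned.is_cancelled() {
        // The connection is gone; this result will never reach the client,
        // so record it in the logs instead.
        match &result {
            Ok(status) => println!("Cancelled request finished successfully, status {status}"),
            Err(e) => eprintln!("Cancelled request finished with an error: {e:?}"),
        }
    }
    result
}

#[tokio::main]
async fn main() {
    let token = CancellationToken::new();
    token.cancel(); // simulate the client going away mid-request
    let res = run_handler(|_token| async { Ok(200) }, token).await;
    assert_eq!(res, Ok(200));
}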
@@ -1767,23 +1821,25 @@ pub fn make_router(
|
|||||||
})
|
})
|
||||||
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
.get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
|
||||||
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
.post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
|
||||||
.get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
|
.get("/v1/tenant/:tenant_shard_id", |r| {
|
||||||
|
api_handler(r, tenant_status)
|
||||||
|
})
|
||||||
.delete("/v1/tenant/:tenant_shard_id", |r| {
|
.delete("/v1/tenant/:tenant_shard_id", |r| {
|
||||||
api_handler(r, tenant_delete_handler)
|
api_handler(r, tenant_delete_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/synthetic_size", |r| {
|
.get("/v1/tenant/:tenant_shard_id/synthetic_size", |r| {
|
||||||
api_handler(r, tenant_size_handler)
|
api_handler(r, tenant_size_handler)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/config", |r| {
|
.put("/v1/tenant/config", |r| {
|
||||||
api_handler(r, update_tenant_config_handler)
|
api_handler(r, update_tenant_config_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/config", |r| {
|
.get("/v1/tenant/:tenant_shard_id/config", |r| {
|
||||||
api_handler(r, get_tenant_config_handler)
|
api_handler(r, get_tenant_config_handler)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
.put("/v1/tenant/:tenant_shard_id/location_config", |r| {
|
||||||
api_handler(r, put_tenant_location_config_handler)
|
api_handler(r, put_tenant_location_config_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline", |r| {
|
.get("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
||||||
api_handler(r, timeline_list_handler)
|
api_handler(r, timeline_list_handler)
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
.post("/v1/tenant/:tenant_shard_id/timeline", |r| {
|
||||||
@@ -1795,53 +1851,59 @@ pub fn make_router(
|
|||||||
.post("/v1/tenant/:tenant_id/detach", |r| {
|
.post("/v1/tenant/:tenant_id/detach", |r| {
|
||||||
api_handler(r, tenant_detach_handler)
|
api_handler(r, tenant_detach_handler)
|
||||||
})
|
})
|
||||||
|
.post("/v1/tenant/:tenant_shard_id/reset", |r| {
|
||||||
|
api_handler(r, tenant_reset_handler)
|
||||||
|
})
|
||||||
.post("/v1/tenant/:tenant_id/load", |r| {
|
.post("/v1/tenant/:tenant_id/load", |r| {
|
||||||
api_handler(r, tenant_load_handler)
|
api_handler(r, tenant_load_handler)
|
||||||
})
|
})
|
||||||
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
.post("/v1/tenant/:tenant_id/ignore", |r| {
|
||||||
api_handler(r, tenant_ignore_handler)
|
api_handler(r, tenant_ignore_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id", |r| {
|
.get("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||||
api_handler(r, timeline_detail_handler)
|
api_handler(r, timeline_detail_handler)
|
||||||
})
|
})
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_lsn_by_timestamp",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_lsn_by_timestamp",
|
||||||
|r| api_handler(r, get_lsn_by_timestamp_handler),
|
|r| api_handler(r, get_lsn_by_timestamp_handler),
|
||||||
)
|
)
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/get_timestamp_of_lsn",
|
||||||
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
|r| api_handler(r, get_timestamp_of_lsn_handler),
|
||||||
)
|
)
|
||||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc", |r| {
|
|
||||||
api_handler(r, timeline_gc_handler)
|
|
||||||
})
|
|
||||||
.put("/v1/tenant/:tenant_id/timeline/:timeline_id/compact", |r| {
|
|
||||||
testing_api_handler("run timeline compaction", r, timeline_compact_handler)
|
|
||||||
})
|
|
||||||
.put(
|
.put(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/checkpoint",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/do_gc",
|
||||||
|
|r| api_handler(r, timeline_gc_handler),
|
||||||
|
)
|
||||||
|
.put(
|
||||||
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/compact",
|
||||||
|
|r| testing_api_handler("run timeline compaction", r, timeline_compact_handler),
|
||||||
|
)
|
||||||
|
.put(
|
||||||
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/checkpoint",
|
||||||
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
|
|r| testing_api_handler("run timeline checkpoint", r, timeline_checkpoint_handler),
|
||||||
)
|
)
|
||||||
.post(
|
.post(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
|
||||||
|r| api_handler(r, timeline_download_remote_layers_handler_post),
|
|r| api_handler(r, timeline_download_remote_layers_handler_post),
|
||||||
)
|
)
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/download_remote_layers",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/download_remote_layers",
|
||||||
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|
|r| api_handler(r, timeline_download_remote_layers_handler_get),
|
||||||
)
|
)
|
||||||
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
.delete("/v1/tenant/:tenant_shard_id/timeline/:timeline_id", |r| {
|
||||||
api_handler(r, timeline_delete_handler)
|
api_handler(r, timeline_delete_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/layer", |r| {
|
|
||||||
api_handler(r, layer_map_info_handler)
|
|
||||||
})
|
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer",
|
||||||
|
|r| api_handler(r, layer_map_info_handler),
|
||||||
|
)
|
||||||
|
.get(
|
||||||
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||||
|r| api_handler(r, layer_download_handler),
|
|r| api_handler(r, layer_download_handler),
|
||||||
)
|
)
|
||||||
.delete(
|
.delete(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/layer/:layer_file_name",
|
||||||
|r| api_handler(r, evict_timeline_layer_handler),
|
|r| api_handler(r, evict_timeline_layer_handler),
|
||||||
)
|
)
|
||||||
.put("/v1/disk_usage_eviction/run", |r| {
|
.put("/v1/disk_usage_eviction/run", |r| {
|
||||||
@@ -1850,18 +1912,19 @@ pub fn make_router(
|
|||||||
.put("/v1/deletion_queue/flush", |r| {
|
.put("/v1/deletion_queue/flush", |r| {
|
||||||
api_handler(r, deletion_queue_flush)
|
api_handler(r, deletion_queue_flush)
|
||||||
})
|
})
|
||||||
.put("/v1/tenant/:tenant_id/break", |r| {
|
.put("/v1/tenant/:tenant_shard_id/break", |r| {
|
||||||
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
testing_api_handler("set tenant state to broken", r, handle_tenant_break)
|
||||||
})
|
})
|
||||||
.get("/v1/panic", |r| api_handler(r, always_panic_handler))
|
.get("/v1/panic", |r| api_handler(r, always_panic_handler))
|
||||||
.post("/v1/tracing/event", |r| {
|
.post("/v1/tracing/event", |r| {
|
||||||
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
|
testing_api_handler("emit a tracing event", r, post_tracing_event_handler)
|
||||||
})
|
})
|
||||||
.get("/v1/tenant/:tenant_id/timeline/:timeline_id/getpage", |r| {
|
|
||||||
testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler)
|
|
||||||
})
|
|
||||||
.get(
|
.get(
|
||||||
"/v1/tenant/:tenant_id/timeline/:timeline_id/keyspace",
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/getpage",
|
||||||
|
|r| testing_api_handler("getpage@lsn", r, getpage_at_lsn_handler),
|
||||||
|
)
|
||||||
|
.get(
|
||||||
|
"/v1/tenant/:tenant_shard_id/timeline/:timeline_id/keyspace",
|
||||||
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
|
|r| testing_api_handler("read out the keyspace", r, timeline_collect_keyspace),
|
||||||
)
|
)
|
||||||
.any(handler_404))
|
.any(handler_404))
|
||||||
|
|||||||
@@ -2,19 +2,27 @@
|
|||||||
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
//! Import data and WAL from a PostgreSQL data directory and WAL segments into
|
||||||
//! a neon Timeline.
|
//! a neon Timeline.
|
||||||
//!
|
//!
|
||||||
|
use std::io::SeekFrom;
|
||||||
use std::path::{Path, PathBuf};
|
use std::path::{Path, PathBuf};
|
||||||
|
|
||||||
use anyhow::{bail, ensure, Context, Result};
|
use anyhow::{bail, ensure, Context, Result};
|
||||||
|
use async_compression::tokio::bufread::ZstdDecoder;
|
||||||
|
use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
|
||||||
use bytes::Bytes;
|
use bytes::Bytes;
|
||||||
use camino::Utf8Path;
|
use camino::Utf8Path;
|
||||||
use futures::StreamExt;
|
use futures::StreamExt;
|
||||||
use tokio::io::{AsyncRead, AsyncReadExt};
|
use nix::NixPath;
|
||||||
|
use tokio::fs::{File, OpenOptions};
|
||||||
|
use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncSeekExt, AsyncWriteExt};
|
||||||
use tokio_tar::Archive;
|
use tokio_tar::Archive;
|
||||||
|
use tokio_tar::Builder;
|
||||||
|
use tokio_tar::HeaderMode;
|
||||||
use tracing::*;
|
use tracing::*;
|
||||||
use walkdir::WalkDir;
|
use walkdir::WalkDir;
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::pgdatadir_mapping::*;
|
use crate::pgdatadir_mapping::*;
|
||||||
|
use crate::tenant::remote_timeline_client::INITDB_PATH;
|
||||||
use crate::tenant::Timeline;
|
use crate::tenant::Timeline;
|
||||||
use crate::walingest::WalIngest;
|
use crate::walingest::WalIngest;
|
||||||
use crate::walrecord::DecodedWALRecord;
|
use crate::walrecord::DecodedWALRecord;
|
||||||
@@ -33,7 +41,9 @@ use utils::lsn::Lsn;
|
|||||||
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
|
pub fn get_lsn_from_controlfile(path: &Utf8Path) -> Result<Lsn> {
|
||||||
// Read control file to extract the LSN
|
// Read control file to extract the LSN
|
||||||
let controlfile_path = path.join("global").join("pg_control");
|
let controlfile_path = path.join("global").join("pg_control");
|
||||||
let controlfile = ControlFileData::decode(&std::fs::read(controlfile_path)?)?;
|
let controlfile_buf = std::fs::read(&controlfile_path)
|
||||||
|
.with_context(|| format!("reading controlfile: {controlfile_path}"))?;
|
||||||
|
let controlfile = ControlFileData::decode(&controlfile_buf)?;
|
||||||
let lsn = controlfile.checkPoint;
|
let lsn = controlfile.checkPoint;
|
||||||
|
|
||||||
Ok(Lsn(lsn))
|
Ok(Lsn(lsn))
|
||||||
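A small illustration of why the change above wraps the read in `with_context`: a bare `?` surfaces only the underlying I/O error, while the added context names the file that could not be read. The path here is hypothetical.

// Sketch only: the difference the added context makes.
use anyhow::{Context, Result};
use std::path::Path;

fn read_controlfile(path: &Path) -> Result<Vec<u8>> {
    std::fs::read(path).with_context(|| format!("reading controlfile: {}", path.display()))
}

fn main() {
    let err = read_controlfile(Path::new("/nonexistent/global/pg_control")).unwrap_err();
    // Alternate Display prints the added context followed by the io::Error cause.
    println!("{err:#}");
}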
@@ -618,3 +628,65 @@ async fn read_all_bytes(reader: &mut (impl AsyncRead + Unpin)) -> Result<Bytes>
|
|||||||
reader.read_to_end(&mut buf).await?;
|
reader.read_to_end(&mut buf).await?;
|
||||||
Ok(Bytes::from(buf))
|
Ok(Bytes::from(buf))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub async fn create_tar_zst(pgdata_path: &Utf8Path, tmp_path: &Utf8Path) -> Result<(File, u64)> {
|
||||||
|
let file = OpenOptions::new()
|
||||||
|
.create(true)
|
||||||
|
.truncate(true)
|
||||||
|
.read(true)
|
||||||
|
.write(true)
|
||||||
|
.open(&tmp_path)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("tempfile creation {tmp_path}"))?;
|
||||||
|
|
||||||
|
let mut paths = Vec::new();
|
||||||
|
for entry in WalkDir::new(pgdata_path) {
|
||||||
|
let entry = entry?;
|
||||||
|
let metadata = entry.metadata().expect("error getting dir entry metadata");
|
||||||
|
// Also allow directories so that we also get empty directories
|
||||||
|
if !(metadata.is_file() || metadata.is_dir()) {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let path = entry.into_path();
|
||||||
|
paths.push(path);
|
||||||
|
}
|
||||||
|
// Do a sort to get a more consistent listing
|
||||||
|
paths.sort_unstable();
|
||||||
|
let zstd = ZstdEncoder::with_quality_and_params(
|
||||||
|
file,
|
||||||
|
Level::Default,
|
||||||
|
&[CParameter::enable_long_distance_matching(true)],
|
||||||
|
);
|
||||||
|
let mut builder = Builder::new(zstd);
|
||||||
|
// Use reproducible header mode
|
||||||
|
builder.mode(HeaderMode::Deterministic);
|
||||||
|
for path in paths {
|
||||||
|
let rel_path = path.strip_prefix(pgdata_path)?;
|
||||||
|
if rel_path.is_empty() {
|
||||||
|
// The top directory should not be compressed,
|
||||||
|
// the tar crate doesn't like that
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
builder.append_path_with_name(&path, rel_path).await?;
|
||||||
|
}
|
||||||
|
let mut zstd = builder.into_inner().await?;
|
||||||
|
zstd.shutdown().await?;
|
||||||
|
let mut compressed = zstd.into_inner();
|
||||||
|
let compressed_len = compressed.metadata().await?.len();
|
||||||
|
const INITDB_TAR_ZST_WARN_LIMIT: u64 = 2 * 1024 * 1024;
|
||||||
|
if compressed_len > INITDB_TAR_ZST_WARN_LIMIT {
|
||||||
|
warn!("compressed {INITDB_PATH} size of {compressed_len} is above limit {INITDB_TAR_ZST_WARN_LIMIT}.");
|
||||||
|
}
|
||||||
|
compressed.seek(SeekFrom::Start(0)).await?;
|
||||||
|
Ok((compressed, compressed_len))
|
||||||
|
}
|
||||||
|
|
||||||
|
pub async fn extract_tar_zst(
|
||||||
|
pgdata_path: &Utf8Path,
|
||||||
|
tar_zst: impl AsyncBufRead + Unpin,
|
||||||
|
) -> Result<()> {
|
||||||
|
let tar = Box::pin(ZstdDecoder::new(tar_zst));
|
||||||
|
let mut archive = Archive::new(tar);
|
||||||
|
archive.unpack(pgdata_path).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
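A hedged usage sketch of the two helpers added above: build the compressed archive, then feed the returned (already rewound) file back through the extractor via a buffered reader. The module path and directory names are assumptions.

// Sketch only: round-tripping a data directory through the helpers above.
// create_tar_zst returns the temp file already seeked back to the start, so a
// tokio BufReader over it satisfies extract_tar_zst's AsyncBufRead bound.
use anyhow::Result;
use camino::Utf8Path;
use tokio::io::BufReader;

// Assumed to be the module defined above; the real path inside the
// pageserver crate may differ.
use crate::import_datadir::{create_tar_zst, extract_tar_zst};

async fn roundtrip(pgdata: &Utf8Path, scratch: &Utf8Path) -> Result<()> {
    let tmp_tar = scratch.join("initdb.tar.zst");
    let (archive, compressed_len) = create_tar_zst(pgdata, &tmp_tar).await?;
    println!("compressed pgdata to {compressed_len} bytes");

    let restore_dir = scratch.join("restored");
    tokio::fs::create_dir_all(&restore_dir).await?;
    extract_tar_zst(&restore_dir, BufReader::new(archive)).await?;
    Ok(())
}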
|
|||||||
@@ -49,11 +49,22 @@ pub const DELTA_FILE_MAGIC: u16 = 0x5A61;
|
|||||||
|
|
||||||
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
static ZERO_PAGE: bytes::Bytes = bytes::Bytes::from_static(&[0u8; 8192]);
|
||||||
|
|
||||||
|
/// The main cancellation token for the process.
|
||||||
|
///
|
||||||
|
/// Should only ever be used to create child tokens.
|
||||||
|
pub static PAGESERVER_SHUTDOWN_TOKEN: std::sync::OnceLock<tokio_util::sync::CancellationToken> =
|
||||||
|
std::sync::OnceLock::new();
|
||||||
|
|
||||||
pub use crate::metrics::preinitialize_metrics;
|
pub use crate::metrics::preinitialize_metrics;
|
||||||
|
|
||||||
#[tracing::instrument(skip_all, fields(%exit_code))]
|
#[tracing::instrument(skip_all, fields(%exit_code))]
|
||||||
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
|
pub async fn shutdown_pageserver(deletion_queue: Option<DeletionQueue>, exit_code: i32) {
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
|
|
||||||
|
if let Some(token) = PAGESERVER_SHUTDOWN_TOKEN.get() {
|
||||||
|
token.cancel();
|
||||||
|
}
|
||||||
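A minimal sketch of the intended lifecycle of the shutdown token introduced above, with a local static standing in for `PAGESERVER_SHUTDOWN_TOKEN`: install the root token at startup, hand out only child tokens, and cancel the root during shutdown so every child observes it.

// Sketch only: root token installed once, subsystems hold child tokens,
// cancelling the root propagates to all children.
use std::sync::OnceLock;
use tokio_util::sync::CancellationToken;

static SHUTDOWN_TOKEN: OnceLock<CancellationToken> = OnceLock::new();

#[tokio::main]
async fn main() {
    // Startup: install the root token exactly once.
    let _ = SHUTDOWN_TOKEN.set(CancellationToken::new());

    // A subsystem takes a child token instead of the root itself.
    let child = SHUTDOWN_TOKEN.get().unwrap().child_token();
    let worker = tokio::spawn(async move {
        child.cancelled().await;
        "stopped cleanly"
    });

    // Shutdown: cancel the root, which cancels every child.
    if let Some(token) = SHUTDOWN_TOKEN.get() {
        token.cancel();
    }
    assert_eq!(worker.await.unwrap(), "stopped cleanly");
}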
|
|
||||||
// Shut down the libpq endpoint task. This prevents new connections from
|
// Shut down the libpq endpoint task. This prevents new connections from
|
||||||
// being accepted.
|
// being accepted.
|
||||||
timed(
|
timed(
|
||||||
@@ -186,13 +197,6 @@ pub struct InitializationOrder {
|
|||||||
/// Each initial tenant load task carries this until completion.
|
/// Each initial tenant load task carries this until completion.
|
||||||
pub initial_tenant_load: Option<utils::completion::Completion>,
|
pub initial_tenant_load: Option<utils::completion::Completion>,
|
||||||
|
|
||||||
/// Barrier for when we can start initial logical size calculations.
|
|
||||||
pub initial_logical_size_can_start: utils::completion::Barrier,
|
|
||||||
|
|
||||||
/// Each timeline owns a clone of this to be consumed on the initial logical size calculation
|
|
||||||
/// attempt. It is important to drop this once the attempt has completed.
|
|
||||||
pub initial_logical_size_attempt: Option<utils::completion::Completion>,
|
|
||||||
|
|
||||||
/// Barrier for when we can start any background jobs.
|
/// Barrier for when we can start any background jobs.
|
||||||
///
|
///
|
||||||
/// This can be broken up later on, but right now there is just one class of a background job.
|
/// This can be broken up later on, but right now there is just one class of a background job.
|
||||||
@@ -212,7 +216,7 @@ async fn timed<Fut: std::future::Future>(
|
|||||||
match tokio::time::timeout(warn_at, &mut fut).await {
|
match tokio::time::timeout(warn_at, &mut fut).await {
|
||||||
Ok(ret) => {
|
Ok(ret) => {
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
task = name,
|
stage = name,
|
||||||
elapsed_ms = started.elapsed().as_millis(),
|
elapsed_ms = started.elapsed().as_millis(),
|
||||||
"completed"
|
"completed"
|
||||||
);
|
);
|
||||||
@@ -220,7 +224,7 @@ async fn timed<Fut: std::future::Future>(
|
|||||||
}
|
}
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
tracing::info!(
|
tracing::info!(
|
||||||
task = name,
|
stage = name,
|
||||||
elapsed_ms = started.elapsed().as_millis(),
|
elapsed_ms = started.elapsed().as_millis(),
|
||||||
"still waiting, taking longer than expected..."
|
"still waiting, taking longer than expected..."
|
||||||
);
|
);
|
||||||
@@ -229,7 +233,7 @@ async fn timed<Fut: std::future::Future>(
|
|||||||
|
|
||||||
// this has a global allowed_errors
|
// this has a global allowed_errors
|
||||||
tracing::warn!(
|
tracing::warn!(
|
||||||
task = name,
|
stage = name,
|
||||||
elapsed_ms = started.elapsed().as_millis(),
|
elapsed_ms = started.elapsed().as_millis(),
|
||||||
"completed, took longer than expected"
|
"completed, took longer than expected"
|
||||||
);
|
);
|
||||||
|
|||||||
@@ -7,6 +7,7 @@ use metrics::{
|
|||||||
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
|
||||||
};
|
};
|
||||||
use once_cell::sync::Lazy;
|
use once_cell::sync::Lazy;
|
||||||
|
use pageserver_api::shard::TenantShardId;
|
||||||
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
use strum::{EnumCount, IntoEnumIterator, VariantNames};
|
||||||
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
use strum_macros::{EnumVariantNames, IntoStaticStr};
|
||||||
use utils::id::{TenantId, TimelineId};
|
use utils::id::{TenantId, TimelineId};
|
||||||
@@ -284,6 +285,63 @@ pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheS
|
|||||||
},
|
},
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) mod page_cache_eviction_metrics {
|
||||||
|
use std::num::NonZeroUsize;
|
||||||
|
|
||||||
|
use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
#[derive(Clone, Copy)]
|
||||||
|
pub(crate) enum Outcome {
|
||||||
|
FoundSlotUnused { iters: NonZeroUsize },
|
||||||
|
FoundSlotEvicted { iters: NonZeroUsize },
|
||||||
|
ItersExceeded { iters: NonZeroUsize },
|
||||||
|
}
|
||||||
|
|
||||||
|
static ITERS_TOTAL_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"pageserver_page_cache_find_victim_iters_total",
|
||||||
|
"Counter for the number of iterations in the find_victim loop",
|
||||||
|
&["outcome"],
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric")
|
||||||
|
});
|
||||||
|
|
||||||
|
static CALLS_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"pageserver_page_cache_find_victim_calls",
|
||||||
|
"Incremented at the end of each find_victim() call.\
|
||||||
|
Filter by outcome to get e.g., eviction rate.",
|
||||||
|
&["outcome"]
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
pub(crate) fn observe(outcome: Outcome) {
|
||||||
|
macro_rules! dry {
|
||||||
|
($label:literal, $iters:expr) => {{
|
||||||
|
static LABEL: &'static str = $label;
|
||||||
|
static ITERS_TOTAL: Lazy<IntCounter> =
|
||||||
|
Lazy::new(|| ITERS_TOTAL_VEC.with_label_values(&[LABEL]));
|
||||||
|
static CALLS: Lazy<IntCounter> =
|
||||||
|
Lazy::new(|| CALLS_VEC.with_label_values(&[LABEL]));
|
||||||
|
ITERS_TOTAL.inc_by(($iters.get()) as u64);
|
||||||
|
CALLS.inc();
|
||||||
|
}};
|
||||||
|
}
|
||||||
|
match outcome {
|
||||||
|
Outcome::FoundSlotUnused { iters } => dry!("found_empty", iters),
|
||||||
|
Outcome::FoundSlotEvicted { iters } => {
|
||||||
|
dry!("found_evicted", iters)
|
||||||
|
}
|
||||||
|
Outcome::ItersExceeded { iters } => {
|
||||||
|
dry!("err_iters_exceeded", iters);
|
||||||
|
super::page_cache_errors_inc(super::PageCacheErrorKind::EvictIterLimit);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
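The `dry!` macro above exists so that each outcome label gets a lazily initialised child counter instead of paying a `with_label_values` lookup on every observation. A reduced sketch of that caching pattern, assuming the same workspace `metrics` wrapper crate and `once_cell` used by the surrounding code:

// Sketch only: resolve the labelled child once, then just increment it.
use metrics::{register_int_counter_vec, IntCounter, IntCounterVec};
use once_cell::sync::Lazy;

static OUTCOMES_VEC: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "example_find_victim_outcomes_total",
        "Example counter for illustration only",
        &["outcome"]
    )
    .expect("failed to define a metric")
});

fn record_found_empty(iters: u64) {
    // The child counter for this label is looked up only on first use.
    static FOUND_EMPTY: Lazy<IntCounter> =
        Lazy::new(|| OUTCOMES_VEC.with_label_values(&["found_empty"]));
    FOUND_EMPTY.inc_by(iters);
}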
|
|
||||||
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::new(|| {
|
||||||
register_histogram!(
|
register_histogram!(
|
||||||
"pageserver_page_cache_acquire_pinned_slot_seconds",
|
"pageserver_page_cache_acquire_pinned_slot_seconds",
|
||||||
@@ -293,14 +351,6 @@ pub(crate) static PAGE_CACHE_ACQUIRE_PINNED_SLOT_TIME: Lazy<Histogram> = Lazy::n
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
pub(crate) static PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL: Lazy<IntCounter> = Lazy::new(|| {
|
|
||||||
register_int_counter!(
|
|
||||||
"pageserver_page_cache_find_victim_iters_total",
|
|
||||||
"Counter for the number of iterations in the find_victim loop",
|
|
||||||
)
|
|
||||||
.expect("failed to define a metric")
|
|
||||||
});
|
|
||||||
|
|
||||||
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
|
static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
|
||||||
register_int_counter_vec!(
|
register_int_counter_vec!(
|
||||||
"page_cache_errors_total",
|
"page_cache_errors_total",
|
||||||
@@ -402,6 +452,129 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
|||||||
.expect("failed to define current logical size metric")
|
.expect("failed to define current logical size metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) mod initial_logical_size {
|
||||||
|
use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
|
||||||
|
use once_cell::sync::Lazy;
|
||||||
|
|
||||||
|
pub(crate) struct StartCalculation(IntCounterVec);
|
||||||
|
pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
|
||||||
|
StartCalculation(
|
||||||
|
register_int_counter_vec!(
|
||||||
|
"pageserver_initial_logical_size_start_calculation",
|
||||||
|
"Incremented each time we start an initial logical size calculation attempt. \
|
||||||
|
The `circumstances` label provides some additional details.",
|
||||||
|
&["attempt", "circumstances"]
|
||||||
|
)
|
||||||
|
.unwrap(),
|
||||||
|
)
|
||||||
|
});
|
||||||
|
|
||||||
|
struct DropCalculation {
|
||||||
|
first: IntCounter,
|
||||||
|
retry: IntCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
|
||||||
|
let vec = register_int_counter_vec!(
|
||||||
|
"pageserver_initial_logical_size_drop_calculation",
|
||||||
|
"Incremented each time we abort a started size calculation attmpt.",
|
||||||
|
&["attempt"]
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
DropCalculation {
|
||||||
|
first: vec.with_label_values(&["first"]),
|
||||||
|
retry: vec.with_label_values(&["retry"]),
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
pub(crate) struct Calculated {
|
||||||
|
pub(crate) births: IntCounter,
|
||||||
|
pub(crate) deaths: IntCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
|
||||||
|
births: register_int_counter!(
|
||||||
|
"pageserver_initial_logical_size_finish_calculation",
|
||||||
|
"Incremented every time we finish calculation of initial logical size. \
|
||||||
|
If everything is working well, this should happen at most once per Timeline object."
|
||||||
|
)
|
||||||
|
.unwrap(),
|
||||||
|
deaths: register_int_counter!(
|
||||||
|
"pageserver_initial_logical_size_drop_finished_calculation",
|
||||||
|
"Incremented when we drop a finished initial logical size calculation result. \
|
||||||
|
Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
|
||||||
|
)
|
||||||
|
.unwrap(),
|
||||||
|
});
|
||||||
|
|
||||||
|
pub(crate) struct OngoingCalculationGuard {
|
||||||
|
inc_drop_calculation: Option<IntCounter>,
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(strum_macros::IntoStaticStr)]
|
||||||
|
pub(crate) enum StartCircumstances {
|
||||||
|
EmptyInitial,
|
||||||
|
SkippedConcurrencyLimiter,
|
||||||
|
AfterBackgroundTasksRateLimit,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl StartCalculation {
|
||||||
|
pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||||
|
let circumstances_label: &'static str = circumstances.into();
|
||||||
|
self.0.with_label_values(&["first", circumstances_label]);
|
||||||
|
OngoingCalculationGuard {
|
||||||
|
inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
|
||||||
|
let circumstances_label: &'static str = circumstances.into();
|
||||||
|
self.0.with_label_values(&["retry", circumstances_label]);
|
||||||
|
OngoingCalculationGuard {
|
||||||
|
inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for OngoingCalculationGuard {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
if let Some(counter) = self.inc_drop_calculation.take() {
|
||||||
|
counter.inc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl OngoingCalculationGuard {
|
||||||
|
pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
|
||||||
|
drop(self.inc_drop_calculation.take());
|
||||||
|
CALCULATED.births.inc();
|
||||||
|
FinishedCalculationGuard {
|
||||||
|
inc_on_drop: CALCULATED.deaths.clone(),
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) struct FinishedCalculationGuard {
|
||||||
|
inc_on_drop: IntCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
impl Drop for FinishedCalculationGuard {
|
||||||
|
fn drop(&mut self) {
|
||||||
|
self.inc_on_drop.inc();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// context: https://github.com/neondatabase/neon/issues/5963
|
||||||
|
pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
|
||||||
|
Lazy::new(|| {
|
||||||
|
register_int_counter!(
|
||||||
|
"pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
|
||||||
|
"Counter for the following event: walreceiver calls \
|
||||||
|
Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
|
||||||
register_uint_gauge_vec!(
|
register_uint_gauge_vec!(
|
||||||
"pageserver_tenant_states_count",
|
"pageserver_tenant_states_count",
|
||||||
@@ -477,7 +650,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy<IntCounterVec> = Lazy::new(||
|
|||||||
"pageserver_evictions_with_low_residence_duration",
|
"pageserver_evictions_with_low_residence_duration",
|
||||||
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
|
"If a layer is evicted that was resident for less than `low_threshold`, it is counted to this counter. \
|
||||||
Residence duration is determined using the `residence_duration_data_source`.",
|
Residence duration is determined using the `residence_duration_data_source`.",
|
||||||
&["tenant_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
|
&["tenant_id", "shard_id", "timeline_id", "residence_duration_data_source", "low_threshold_secs"]
|
||||||
)
|
)
|
||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
@@ -541,10 +714,16 @@ impl EvictionsWithLowResidenceDurationBuilder {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn build(&self, tenant_id: &str, timeline_id: &str) -> EvictionsWithLowResidenceDuration {
|
fn build(
|
||||||
|
&self,
|
||||||
|
tenant_id: &str,
|
||||||
|
shard_id: &str,
|
||||||
|
timeline_id: &str,
|
||||||
|
) -> EvictionsWithLowResidenceDuration {
|
||||||
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
|
let counter = EVICTIONS_WITH_LOW_RESIDENCE_DURATION
|
||||||
.get_metric_with_label_values(&[
|
.get_metric_with_label_values(&[
|
||||||
tenant_id,
|
tenant_id,
|
||||||
|
shard_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
self.data_source,
|
self.data_source,
|
||||||
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
|
&EvictionsWithLowResidenceDuration::threshold_label_value(self.threshold),
|
||||||
@@ -575,21 +754,24 @@ impl EvictionsWithLowResidenceDuration {
|
|||||||
pub fn change_threshold(
|
pub fn change_threshold(
|
||||||
&mut self,
|
&mut self,
|
||||||
tenant_id: &str,
|
tenant_id: &str,
|
||||||
|
shard_id: &str,
|
||||||
timeline_id: &str,
|
timeline_id: &str,
|
||||||
new_threshold: Duration,
|
new_threshold: Duration,
|
||||||
) {
|
) {
|
||||||
if new_threshold == self.threshold {
|
if new_threshold == self.threshold {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
let mut with_new =
|
let mut with_new = EvictionsWithLowResidenceDurationBuilder::new(
|
||||||
EvictionsWithLowResidenceDurationBuilder::new(self.data_source, new_threshold)
|
self.data_source,
|
||||||
.build(tenant_id, timeline_id);
|
new_threshold,
|
||||||
|
)
|
||||||
|
.build(tenant_id, shard_id, timeline_id);
|
||||||
std::mem::swap(self, &mut with_new);
|
std::mem::swap(self, &mut with_new);
|
||||||
with_new.remove(tenant_id, timeline_id);
|
with_new.remove(tenant_id, shard_id, timeline_id);
|
||||||
}
|
}
|
||||||
|
|
||||||
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
|
// This could be a `Drop` impl, but, we need the `tenant_id` and `timeline_id`.
|
||||||
fn remove(&mut self, tenant_id: &str, timeline_id: &str) {
|
fn remove(&mut self, tenant_id: &str, shard_id: &str, timeline_id: &str) {
|
||||||
let Some(_counter) = self.counter.take() else {
|
let Some(_counter) = self.counter.take() else {
|
||||||
return;
|
return;
|
||||||
};
|
};
|
||||||
@@ -598,6 +780,7 @@ impl EvictionsWithLowResidenceDuration {
|
|||||||
|
|
||||||
let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
|
let removed = EVICTIONS_WITH_LOW_RESIDENCE_DURATION.remove_label_values(&[
|
||||||
tenant_id,
|
tenant_id,
|
||||||
|
shard_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
self.data_source,
|
self.data_source,
|
||||||
&threshold,
|
&threshold,
|
||||||
@@ -638,7 +821,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
|||||||
///
|
///
|
||||||
/// Operations:
|
/// Operations:
|
||||||
/// - open ([`std::fs::OpenOptions::open`])
|
/// - open ([`std::fs::OpenOptions::open`])
|
||||||
/// - close (dropping [`std::fs::File`])
|
/// - close (dropping [`crate::virtual_file::VirtualFile`])
|
||||||
/// - close-by-replace (close by replacement algorithm)
|
/// - close-by-replace (close by replacement algorithm)
|
||||||
/// - read (`read_at`)
|
/// - read (`read_at`)
|
||||||
/// - write (`write_at`)
|
/// - write (`write_at`)
|
||||||
@@ -650,6 +833,7 @@ const STORAGE_IO_TIME_BUCKETS: &[f64] = &[
|
|||||||
)]
|
)]
|
||||||
pub(crate) enum StorageIoOperation {
|
pub(crate) enum StorageIoOperation {
|
||||||
Open,
|
Open,
|
||||||
|
OpenAfterReplace,
|
||||||
Close,
|
Close,
|
||||||
CloseByReplace,
|
CloseByReplace,
|
||||||
Read,
|
Read,
|
||||||
@@ -663,6 +847,7 @@ impl StorageIoOperation {
|
|||||||
pub fn as_str(&self) -> &'static str {
|
pub fn as_str(&self) -> &'static str {
|
||||||
match self {
|
match self {
|
||||||
StorageIoOperation::Open => "open",
|
StorageIoOperation::Open => "open",
|
||||||
|
StorageIoOperation::OpenAfterReplace => "open-after-replace",
|
||||||
StorageIoOperation::Close => "close",
|
StorageIoOperation::Close => "close",
|
||||||
StorageIoOperation::CloseByReplace => "close-by-replace",
|
StorageIoOperation::CloseByReplace => "close-by-replace",
|
||||||
StorageIoOperation::Read => "read",
|
StorageIoOperation::Read => "read",
|
||||||
@@ -717,6 +902,25 @@ pub(crate) static STORAGE_IO_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
|
|||||||
.expect("failed to define a metric")
|
.expect("failed to define a metric")
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) mod virtual_file_descriptor_cache {
|
||||||
|
use super::*;
|
||||||
|
|
||||||
|
pub(crate) static SIZE_MAX: Lazy<UIntGauge> = Lazy::new(|| {
|
||||||
|
register_uint_gauge!(
|
||||||
|
"pageserver_virtual_file_descriptor_cache_size_max",
|
||||||
|
"Maximum number of open file descriptors in the cache."
|
||||||
|
)
|
||||||
|
.unwrap()
|
||||||
|
});
|
||||||
|
|
||||||
|
// SIZE_CURRENT: derive it like so:
|
||||||
|
// ```
|
||||||
|
// sum (pageserver_io_operations_seconds_count{operation=~"^(open|open-after-replace)$")
|
||||||
|
// -ignoring(operation)
|
||||||
|
// sum(pageserver_io_operations_seconds_count{operation=~"^(close|close-by-replace)$"}
|
||||||
|
// ```
|
||||||
|
}
|
||||||
|
|
||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
struct GlobalAndPerTimelineHistogram {
|
struct GlobalAndPerTimelineHistogram {
|
||||||
global: Histogram,
|
global: Histogram,
|
||||||
@@ -1043,6 +1247,30 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) struct WalIngestMetrics {
|
||||||
|
pub(crate) records_received: IntCounter,
|
||||||
|
pub(crate) records_committed: IntCounter,
|
||||||
|
pub(crate) records_filtered: IntCounter,
|
||||||
|
}
|
||||||
|
|
||||||
|
pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
|
||||||
|
records_received: register_int_counter!(
|
||||||
|
"pageserver_wal_ingest_records_received",
|
||||||
|
"Number of WAL records received from safekeepers"
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
records_committed: register_int_counter!(
|
||||||
|
"pageserver_wal_ingest_records_committed",
|
||||||
|
"Number of WAL records which resulted in writes to pageserver storage"
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
records_filtered: register_int_counter!(
|
||||||
|
"pageserver_wal_ingest_records_filtered",
|
||||||
|
"Number of WAL records filtered out due to sharding"
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric"),
|
||||||
|
});
|
||||||
|
|
||||||
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
|
||||||
pub enum RemoteOpKind {
|
pub enum RemoteOpKind {
|
||||||
Upload,
|
Upload,
|
||||||
@@ -1252,9 +1480,20 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
|
|||||||
.unwrap()
|
.unwrap()
|
||||||
});
|
});
|
||||||
|
|
||||||
|
pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
|
||||||
|
register_histogram!(
|
||||||
|
"pageserver_wal_redo_process_launch_duration",
|
||||||
|
"Histogram of the duration of successful WalRedoProcess::launch calls",
|
||||||
|
redo_histogram_time_buckets!(),
|
||||||
|
)
|
||||||
|
.expect("failed to define a metric")
|
||||||
|
});
|
||||||
|
|
||||||
pub(crate) struct WalRedoProcessCounters {
|
pub(crate) struct WalRedoProcessCounters {
|
||||||
pub(crate) started: IntCounter,
|
pub(crate) started: IntCounter,
|
||||||
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
|
pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
|
||||||
|
pub(crate) active_stderr_logger_tasks_started: IntCounter,
|
||||||
|
pub(crate) active_stderr_logger_tasks_finished: IntCounter,
|
||||||
}
|
}
|
||||||
|
|
||||||
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
#[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
|
||||||
@@ -1278,6 +1517,19 @@ impl Default for WalRedoProcessCounters {
|
|||||||
&["cause"],
|
&["cause"],
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
|
let active_stderr_logger_tasks_started = register_int_counter!(
|
||||||
|
"pageserver_walredo_stderr_logger_tasks_started_total",
|
||||||
|
"Number of active walredo stderr logger tasks that have started",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
|
let active_stderr_logger_tasks_finished = register_int_counter!(
|
||||||
|
"pageserver_walredo_stderr_logger_tasks_finished_total",
|
||||||
|
"Number of active walredo stderr logger tasks that have finished",
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
|
||||||
Self {
|
Self {
|
||||||
started,
|
started,
|
||||||
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
|
killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
|
||||||
@@ -1285,6 +1537,8 @@ impl Default for WalRedoProcessCounters {
|
|||||||
let cause_str: &'static str = cause.into();
|
let cause_str: &'static str = cause.into();
|
||||||
killed.with_label_values(&[cause_str])
|
killed.with_label_values(&[cause_str])
|
||||||
})),
|
})),
|
||||||
|
active_stderr_logger_tasks_started,
|
||||||
|
active_stderr_logger_tasks_finished,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -1359,6 +1613,7 @@ impl StorageTimeMetrics {
|
|||||||
#[derive(Debug)]
|
#[derive(Debug)]
|
||||||
pub struct TimelineMetrics {
|
pub struct TimelineMetrics {
|
||||||
tenant_id: String,
|
tenant_id: String,
|
||||||
|
shard_id: String,
|
||||||
timeline_id: String,
|
timeline_id: String,
|
||||||
pub flush_time_histo: StorageTimeMetrics,
|
pub flush_time_histo: StorageTimeMetrics,
|
||||||
pub compact_time_histo: StorageTimeMetrics,
|
pub compact_time_histo: StorageTimeMetrics,
|
||||||
@@ -1379,11 +1634,12 @@ pub struct TimelineMetrics {
|
|||||||
|
|
||||||
impl TimelineMetrics {
|
impl TimelineMetrics {
|
||||||
pub fn new(
|
pub fn new(
|
||||||
tenant_id: &TenantId,
|
tenant_shard_id: &TenantShardId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
evictions_with_low_residence_duration_builder: EvictionsWithLowResidenceDurationBuilder,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let tenant_id = tenant_id.to_string();
|
let tenant_id = tenant_shard_id.tenant_id.to_string();
|
||||||
|
let shard_id = format!("{}", tenant_shard_id.shard_slug());
|
||||||
let timeline_id = timeline_id.to_string();
|
let timeline_id = timeline_id.to_string();
|
||||||
let flush_time_histo =
|
let flush_time_histo =
|
||||||
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
StorageTimeMetrics::new(StorageTimeOperation::LayerFlush, &tenant_id, &timeline_id);
|
||||||
@@ -1420,11 +1676,12 @@ impl TimelineMetrics {
|
|||||||
let evictions = EVICTIONS
|
let evictions = EVICTIONS
|
||||||
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
.get_metric_with_label_values(&[&tenant_id, &timeline_id])
|
||||||
.unwrap();
|
.unwrap();
|
||||||
let evictions_with_low_residence_duration =
|
let evictions_with_low_residence_duration = evictions_with_low_residence_duration_builder
|
||||||
evictions_with_low_residence_duration_builder.build(&tenant_id, &timeline_id);
|
.build(&tenant_id, &shard_id, &timeline_id);
|
||||||
|
|
||||||
TimelineMetrics {
|
TimelineMetrics {
|
||||||
tenant_id,
|
tenant_id,
|
||||||
|
shard_id,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
flush_time_histo,
|
flush_time_histo,
|
||||||
compact_time_histo,
|
compact_time_histo,
|
||||||
@@ -1470,6 +1727,7 @@ impl Drop for TimelineMetrics {
|
|||||||
fn drop(&mut self) {
|
fn drop(&mut self) {
|
||||||
let tenant_id = &self.tenant_id;
|
let tenant_id = &self.tenant_id;
|
||||||
let timeline_id = &self.timeline_id;
|
let timeline_id = &self.timeline_id;
|
||||||
|
let shard_id = &self.shard_id;
|
||||||
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
let _ = LAST_RECORD_LSN.remove_label_values(&[tenant_id, timeline_id]);
|
||||||
{
|
{
|
||||||
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
|
||||||
@@ -1483,7 +1741,7 @@ impl Drop for TimelineMetrics {
|
|||||||
self.evictions_with_low_residence_duration
|
self.evictions_with_low_residence_duration
|
||||||
.write()
|
.write()
|
||||||
.unwrap()
|
.unwrap()
|
||||||
.remove(tenant_id, timeline_id);
|
.remove(tenant_id, shard_id, timeline_id);
|
||||||
|
|
||||||
// The following metrics are born outside of the TimelineMetrics lifecycle but still
|
// The following metrics are born outside of the TimelineMetrics lifecycle but still
|
||||||
// removed at the end of it. The idea is to have the metrics outlive the
|
// removed at the end of it. The idea is to have the metrics outlive the
|
||||||
@@ -1571,9 +1829,9 @@ pub struct RemoteTimelineClientMetrics {
|
|||||||
}
|
}
|
||||||
|
|
||||||
impl RemoteTimelineClientMetrics {
|
impl RemoteTimelineClientMetrics {
|
||||||
pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
|
pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
|
||||||
RemoteTimelineClientMetrics {
|
RemoteTimelineClientMetrics {
|
||||||
tenant_id: tenant_id.to_string(),
|
tenant_id: tenant_shard_id.tenant_id.to_string(),
|
||||||
timeline_id: timeline_id.to_string(),
|
timeline_id: timeline_id.to_string(),
|
||||||
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
calls_unfinished_gauge: Mutex::new(HashMap::default()),
|
||||||
bytes_started_counter: Mutex::new(HashMap::default()),
|
bytes_started_counter: Mutex::new(HashMap::default()),
|
||||||
@@ -1944,6 +2202,8 @@ pub fn preinitialize_metrics() {
|
|||||||
// Tenant manager stats
|
// Tenant manager stats
|
||||||
Lazy::force(&TENANT_MANAGER);
|
Lazy::force(&TENANT_MANAGER);
|
||||||
|
|
||||||
|
Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
|
||||||
|
|
||||||
// countervecs
|
// countervecs
|
||||||
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
[&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
@@ -1961,6 +2221,7 @@ pub fn preinitialize_metrics() {
|
|||||||
&WAL_REDO_TIME,
|
&WAL_REDO_TIME,
|
||||||
&WAL_REDO_RECORDS_HISTOGRAM,
|
&WAL_REDO_RECORDS_HISTOGRAM,
|
||||||
&WAL_REDO_BYTES_HISTOGRAM,
|
&WAL_REDO_BYTES_HISTOGRAM,
|
||||||
|
&WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
|
||||||
]
|
]
|
||||||
.into_iter()
|
.into_iter()
|
||||||
.for_each(|h| {
|
.for_each(|h| {
|
||||||
|
|||||||
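The recurring change in the hunks above is threading a new `shard_id` label (rendered via `TenantShardId::shard_slug()`) through the per-timeline metric families and removing the label set again when the timeline's metrics are dropped. Below is a minimal, self-contained sketch of that label-vector pattern using the `prometheus` and `once_cell` crates; the metric name, help text, and label values are illustrative stand-ins, not the pageserver's real ones.

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

// A counter vector keyed by tenant, shard and timeline, mirroring the
// ["tenant_id", "shard_id", "timeline_id"] label sets introduced above.
static EVICTIONS_DEMO: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "demo_evictions_total",
        "Number of layer evictions (illustrative metric).",
        &["tenant_id", "shard_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

fn record_eviction(tenant_id: &str, shard_id: &str, timeline_id: &str) {
    // Label values must be passed in the same order as they were declared.
    EVICTIONS_DEMO
        .with_label_values(&[tenant_id, shard_id, timeline_id])
        .inc();
}

fn main() {
    record_eviction("tenant-a", "0001", "timeline-b");

    // When a timeline goes away, its label combination should be removed so
    // stale series do not linger, as the Drop impl for TimelineMetrics does.
    let _ = EVICTIONS_DEMO.remove_label_values(&["tenant-a", "0001", "timeline-b"]);
}
```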
@@ -28,7 +28,7 @@
 //! Page cache maps from a cache key to a buffer slot.
 //! The cache key uniquely identifies the piece of data that is being cached.
 //!
-//! The cache key for **materialized pages** is [`TenantId`], [`TimelineId`], [`Key`], and [`Lsn`].
+//! The cache key for **materialized pages** is [`TenantShardId`], [`TimelineId`], [`Key`], and [`Lsn`].
 //! Use [`PageCache::memorize_materialized_page`] and [`PageCache::lookup_materialized_page`] for fill & access.
 //!
 //! The cache key for **immutable file** pages is [`FileId`] and a block number.
@@ -83,12 +83,14 @@ use std::{
 
 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use pageserver_api::shard::TenantShardId;
+use utils::{id::TimelineId, lsn::Lsn};
 
-use crate::{context::RequestContext, metrics::PageCacheSizeMetrics, repository::Key};
+use crate::{
+    context::RequestContext,
+    metrics::{page_cache_eviction_metrics, PageCacheSizeMetrics},
+    repository::Key,
+};
 
 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -150,7 +152,13 @@ enum CacheKey {
 
 #[derive(Debug, PartialEq, Eq, Hash, Clone)]
 struct MaterializedPageHashKey {
-    tenant_id: TenantId,
+    /// Why is this TenantShardId rather than TenantId?
+    ///
+    /// Usually, the materialized value of a page@lsn is identical on any shard in the same tenant. However, this
+    /// this not the case for certain internally-generated pages (e.g. relation sizes). In future, we may make this
+    /// key smaller by omitting the shard, if we ensure that reads to such pages always skip the cache, or are
+    /// special-cased in some other way.
+    tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
     key: Key,
 }
@@ -374,7 +382,7 @@ impl PageCache {
     /// returned page.
     pub async fn lookup_materialized_page(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key: &Key,
         lsn: Lsn,
@@ -391,7 +399,7 @@ impl PageCache {
 
         let mut cache_key = CacheKey::MaterializedPage {
             hash_key: MaterializedPageHashKey {
-                tenant_id,
+                tenant_shard_id,
                 timeline_id,
                 key: *key,
             },
@@ -432,7 +440,7 @@ impl PageCache {
     ///
     pub async fn memorize_materialized_page(
         &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key: Key,
         lsn: Lsn,
@@ -440,7 +448,7 @@ impl PageCache {
     ) -> anyhow::Result<()> {
         let cache_key = CacheKey::MaterializedPage {
             hash_key: MaterializedPageHashKey {
-                tenant_id,
+                tenant_shard_id,
                 timeline_id,
                 key,
             },
@@ -897,8 +905,10 @@ impl PageCache {
             // Note that just yielding to tokio during iteration without such
             // priority boosting is likely counter-productive. We'd just give more opportunities
             // for B to bump usage count, further starving A.
-            crate::metrics::page_cache_errors_inc(
-                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+            page_cache_eviction_metrics::observe(
+                page_cache_eviction_metrics::Outcome::ItersExceeded {
+                    iters: iters.try_into().unwrap(),
+                },
             );
             anyhow::bail!("exceeded evict iter limit");
         }
@@ -909,8 +919,18 @@ impl PageCache {
                 // remove mapping for old buffer
                 self.remove_mapping(old_key);
                 inner.key = None;
+                page_cache_eviction_metrics::observe(
+                    page_cache_eviction_metrics::Outcome::FoundSlotEvicted {
+                        iters: iters.try_into().unwrap(),
+                    },
+                );
+            } else {
+                page_cache_eviction_metrics::observe(
+                    page_cache_eviction_metrics::Outcome::FoundSlotUnused {
+                        iters: iters.try_into().unwrap(),
+                    },
+                );
             }
-            crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
             return Ok((slot_idx, inner));
         }
     }
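The eviction path above now records a structured outcome (slot found unused, slot evicted, or iteration limit exceeded) via `page_cache_eviction_metrics::observe` instead of bumping a generic error counter. A rough sketch of how such an outcome-observing helper can be built on a labeled counter follows; the enum, metric name, and label values here are assumptions for illustration and are not the actual `page_cache_eviction_metrics` module.

```rust
use once_cell::sync::Lazy;
use prometheus::{register_int_counter_vec, IntCounterVec};

/// Outcome of one find-victim pass, mirroring the shape used above.
pub enum Outcome {
    FoundSlotUnused { iters: u32 },
    FoundSlotEvicted { iters: u32 },
    ItersExceeded { iters: u32 },
}

static FIND_VICTIM_OUTCOMES: Lazy<IntCounterVec> = Lazy::new(|| {
    register_int_counter_vec!(
        "demo_page_cache_find_victim_outcomes_total",
        "Outcomes of find-victim passes (illustrative metric).",
        &["outcome"]
    )
    .unwrap()
});

pub fn observe(outcome: Outcome) {
    // Collapse the enum into a label value; the iteration count could also be
    // fed into a histogram if its distribution matters.
    let label = match outcome {
        Outcome::FoundSlotUnused { .. } => "found_slot_unused",
        Outcome::FoundSlotEvicted { .. } => "found_slot_evicted",
        Outcome::ItersExceeded { .. } => "iters_exceeded",
    };
    FIND_VICTIM_OUTCOMES.with_label_values(&[label]).inc();
}

fn main() {
    observe(Outcome::FoundSlotUnused { iters: 1 });
    observe(Outcome::ItersExceeded { iters: 400 });
}
```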
@@ -53,21 +53,23 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
+use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
+use crate::tenant::mgr::ShardSelector;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;
 
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;
 
-// How long we may block waiting for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
+// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
 // is not yet in state [`TenantState::Active`].
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
 
 /// Read the end of a tar archive.
 ///
@@ -164,6 +166,7 @@ pub async fn libpq_listener_main(
             None,
             "serving compute connection task",
             false,
+            cancel.child_token(),
             page_service_conn_main(
                 conf,
                 broker_client.clone(),
@@ -399,18 +402,25 @@ impl PageServerHandler {
     {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
-        // Make request tracer if needed
+        // Note that since one connection may contain getpage requests that target different
+        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
+        // that we look up here may not be the one that serves all the actual requests: we will double
+        // check the mapping of key->shard later before calling into Timeline for getpage requests.
         let tenant = mgr::get_active_tenant_with_timeout(
             tenant_id,
+            ShardSelector::First,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
         .await?;
+
+        // Make request tracer if needed
         let mut tracer = if tenant.get_trace_read_requests() {
             let connection_id = ConnectionId::generate();
-            let path = tenant
-                .conf
-                .trace_path(&tenant_id, &timeline_id, &connection_id);
+            let path =
+                tenant
+                    .conf
+                    .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
             Some(Tracer::new(path))
         } else {
             None
@@ -562,6 +572,7 @@ impl PageServerHandler {
         info!("creating new timeline");
         let tenant = get_active_tenant_with_timeout(
             tenant_id,
+            ShardSelector::Zero,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
@@ -624,7 +635,7 @@ impl PageServerHandler {
         debug_assert_current_span_has_tenant_and_timeline_id();
 
         let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
         let last_record_lsn = timeline.get_last_record_lsn();
         if last_record_lsn != start_lsn {
@@ -803,9 +814,49 @@ impl PageServerHandler {
         }
         */
 
-        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-            .await?;
+        let key = rel_block_to_key(req.rel, req.blkno);
+        let page = if timeline.get_shard_identity().is_key_local(&key) {
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        } else {
+            // The Tenant shard we looked up at connection start does not hold this particular
+            // key: look for other shards in this tenant. This scenario occurs if a pageserver
+            // has multiple shards for the same tenant.
+            //
+            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
+            let timeline = match self
+                .get_active_tenant_timeline(
+                    timeline.tenant_shard_id.tenant_id,
+                    timeline.timeline_id,
+                    ShardSelector::Page(key),
+                )
+                .await
+            {
+                Ok(t) => t,
+                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
+                    // We already know this tenant exists in general, because we resolved it at
+                    // start of connection. Getting a NotFound here indicates that the shard containing
+                    // the requested page is not present on this node.
+
+                    // TODO: this should be some kind of structured error that the client will understand,
+                    // so that it can block until its config is updated: this error is expected in the case
+                    // that the Tenant's shards' placements are being updated and the client hasn't been
+                    // informed yet.
+                    //
+                    // https://github.com/neondatabase/neon/issues/6038
+                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
+                }
+                Err(e) => return Err(e.into()),
+            };
+
+            // Take a GateGuard for the duration of this request. If we were using our main Timeline object,
+            // the GateGuard was already held over the whole connection.
+            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        };
 
         Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
             page,
@@ -834,7 +885,7 @@ impl PageServerHandler {
 
         // check that the timeline exists
         let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
         let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
         if let Some(lsn) = lsn {
@@ -940,9 +991,11 @@ impl PageServerHandler {
         &self,
         tenant_id: TenantId,
         timeline_id: TimelineId,
+        selector: ShardSelector,
     ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
         let tenant = get_active_tenant_with_timeout(
             tenant_id,
+            selector,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
@@ -1116,7 +1169,7 @@ where
 
         self.check_permission(Some(tenant_id))?;
         let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
             .await?;
 
         let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1303,6 +1356,7 @@ where
 
         let tenant = get_active_tenant_with_timeout(
             tenant_id,
+            ShardSelector::Zero,
             ACTIVE_TENANT_TIMEOUT,
             &task_mgr::shutdown_token(),
         )
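The getpage hunk above first asks the connection's shard whether it owns the requested key and only re-resolves the tenant with `ShardSelector::Page(key)` when it does not. The sketch below isolates that routing decision; the `DemoShardIdentity` type and its striping arithmetic are deliberate simplifications (the real `ShardIdentity` also takes the relation identity into account), so treat this as an illustration of the control flow only.

```rust
/// Illustrative shard identity: `count` shards, pages striped in fixed-size runs.
#[derive(Clone, Copy)]
struct DemoShardIdentity {
    number: u32,
    count: u32,
    stripe_size: u32, // in pages
}

impl DemoShardIdentity {
    fn shard_of_block(&self, blkno: u32) -> u32 {
        if self.count <= 1 {
            0
        } else {
            (blkno / self.stripe_size) % self.count
        }
    }

    fn is_key_local(&self, blkno: u32) -> bool {
        self.shard_of_block(blkno) == self.number
    }
}

fn route_getpage(identity: DemoShardIdentity, blkno: u32) -> &'static str {
    if identity.is_key_local(blkno) {
        // Serve from the timeline that was resolved at connection start.
        "serve locally"
    } else {
        // Re-resolve: another shard of this tenant (possibly on this node) owns the page.
        "look up ShardSelector::Page(key)"
    }
}

fn main() {
    let shard1 = DemoShardIdentity { number: 1, count: 4, stripe_size: 32768 };
    assert_eq!(route_getpage(shard1, 40_000), "serve locally"); // 40000/32768 = 1, 1 % 4 == 1
    assert_eq!(route_getpage(shard1, 10), "look up ShardSelector::Page(key)"); // stripe 0 -> shard 0
}
```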
@@ -13,6 +13,7 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Buf, Bytes};
+use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -21,6 +22,7 @@ use serde::{Deserialize, Serialize};
 use std::collections::{hash_map, HashMap, HashSet};
 use std::ops::ControlFlow;
 use std::ops::Range;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, trace, warn};
 use utils::bin_ser::DeserializeError;
 use utils::{bin_ser::BeSer, lsn::Lsn};
@@ -281,6 +283,10 @@ impl Timeline {
     }
 
     /// Get a list of all existing relations in given tablespace and database.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn list_rels(
         &self,
         spcnode: Oid,
@@ -365,6 +371,7 @@ impl Timeline {
     pub async fn find_lsn_for_timestamp(
         &self,
         search_timestamp: TimestampTz,
+        cancel: &CancellationToken,
         ctx: &RequestContext,
     ) -> Result<LsnForTimestamp, PageReconstructError> {
         let gc_cutoff_lsn_guard = self.get_latest_gc_cutoff_lsn();
@@ -383,6 +390,9 @@ impl Timeline {
         let mut found_smaller = false;
         let mut found_larger = false;
         while low < high {
+            if cancel.is_cancelled() {
+                return Err(PageReconstructError::Cancelled);
+            }
             // cannot overflow, high and low are both smaller than u64::MAX / 2
             let mid = (high + low) / 2;
 
@@ -625,6 +635,10 @@ impl Timeline {
     ///
     /// Only relation blocks are counted currently. That excludes metadata,
     /// SLRUs, twophase files etc.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
     pub async fn get_current_logical_size_non_incremental(
         &self,
         lsn: Lsn,
@@ -808,10 +822,7 @@ impl<'a> DatadirModification<'a> {
         self.put(DBDIR_KEY, Value::Image(buf.into()));
 
         // Create AuxFilesDirectory
-        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-            files: HashMap::new(),
-        })?;
-        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+        self.init_aux_dir()?;
 
         let buf = TwoPhaseDirectory::ser(&TwoPhaseDirectory {
             xids: HashSet::new(),
@@ -919,10 +930,7 @@ impl<'a> DatadirModification<'a> {
             self.put(DBDIR_KEY, Value::Image(buf.into()));
 
             // Create AuxFilesDirectory as well
-            let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
-                files: HashMap::new(),
-            })?;
-            self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+            self.init_aux_dir()?;
         }
         if r.is_none() {
             // Create RelDirectory
@@ -1247,6 +1255,14 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    pub fn init_aux_dir(&mut self) -> anyhow::Result<()> {
+        let buf = AuxFilesDirectory::ser(&AuxFilesDirectory {
+            files: HashMap::new(),
+        })?;
+        self.put(AUX_FILES_KEY, Value::Image(Bytes::from(buf)));
+        Ok(())
+    }
+
     pub async fn put_file(
         &mut self,
         path: &str,
@@ -1309,7 +1325,7 @@ impl<'a> DatadirModification<'a> {
         // Flush relation and SLRU data blocks, keep metadata.
         let mut retained_pending_updates = HashMap::new();
         for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(key) || is_slru_block_key(key) {
+            if is_rel_block_key(&key) || is_slru_block_key(key) {
                 // This bails out on first error without modifying pending_updates.
                 // That's Ok, cf this function's doc comment.
                 writer.put(key, self.lsn, &value, ctx).await?;
@@ -1354,6 +1370,10 @@ impl<'a> DatadirModification<'a> {
         Ok(())
     }
 
+    pub(crate) fn is_empty(&self) -> bool {
+        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
+    }
+
     // Internal helper functions to batch the modifications
 
     async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
@@ -1565,7 +1585,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
     }
 }
 
-fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
     Key {
         field1: 0x00,
         field2: rel.spcnode,
@@ -1749,6 +1769,13 @@ const AUX_FILES_KEY: Key = Key {
 // Reverse mappings for a few Keys.
 // These are needed by WAL redo manager.
 
+// AUX_FILES currently stores only data for logical replication (slots etc), and
+// we don't preserve these on a branch because safekeepers can't follow timeline
+// switch (and generally it likely should be optional), so ignore these.
+pub fn is_inherited_key(key: Key) -> bool {
+    key != AUX_FILES_KEY
+}
+
 pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     Ok(match key.field1 {
         0x00 => (
@@ -1764,10 +1791,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
     })
 }
 
-fn is_rel_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
-}
-
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
     key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }
@@ -138,6 +138,14 @@ pub struct GcResult {
 
     #[serde(serialize_with = "serialize_duration_as_millis")]
     pub elapsed: Duration,
+
+    /// The layers which were garbage collected.
+    ///
+    /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
+    /// dropped in tests.
+    #[cfg(feature = "testing")]
+    #[serde(skip)]
+    pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
 }
 
 // helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
@@ -158,5 +166,11 @@ impl AddAssign for GcResult {
         self.layers_removed += other.layers_removed;
 
         self.elapsed += other.elapsed;
+
+        #[cfg(feature = "testing")]
+        {
+            let mut other = other;
+            self.doomed_layers.append(&mut other.doomed_layers);
+        }
     }
 }
@@ -42,6 +42,7 @@ use std::sync::atomic::{AtomicU64, Ordering};
 use std::sync::{Arc, Mutex};
 
 use futures::FutureExt;
+use pageserver_api::shard::TenantShardId;
 use tokio::runtime::Runtime;
 use tokio::task::JoinHandle;
 use tokio::task_local;
@@ -51,7 +52,7 @@ use tracing::{debug, error, info, warn};
 
 use once_cell::sync::Lazy;
 
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;
 
 use crate::shutdown_pageserver;
 
@@ -317,7 +318,7 @@ struct PageServerTask {
 
     /// Tasks may optionally be launched for a particular tenant/timeline, enabling
     /// later cancelling tasks for that tenant/timeline in [`shutdown_tasks`]
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
 
     mutable: Mutex<MutableTaskState>,
@@ -326,26 +327,28 @@ struct PageServerTask {
 /// Launch a new task
 /// Note: if shutdown_process_on_error is set to true failure
 ///   of the task will lead to shutdown of entire process
+#[allow(clippy::too_many_arguments)]
 pub fn spawn<F>(
     runtime: &tokio::runtime::Handle,
     kind: TaskKind,
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
     name: &str,
     shutdown_process_on_error: bool,
+    cancel: CancellationToken,
     future: F,
 ) -> PageserverTaskId
 where
     F: Future<Output = anyhow::Result<()>> + Send + 'static,
 {
-    let cancel = CancellationToken::new();
+    // let cancel = CancellationToken::new();
     let task_id = NEXT_TASK_ID.fetch_add(1, Ordering::Relaxed);
     let task = Arc::new(PageServerTask {
         task_id: PageserverTaskId(task_id),
         kind,
         name: name.to_string(),
         cancel: cancel.clone(),
-        tenant_id,
+        tenant_shard_id,
        timeline_id,
         mutable: Mutex::new(MutableTaskState { join_handle: None }),
     });
@@ -424,28 +427,28 @@ async fn task_finish(
         Ok(Err(err)) => {
             if shutdown_process_on_error {
                 error!(
-                    "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                    task_name, task.tenant_id, task.timeline_id, err
+                    "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                    task_name, task.tenant_shard_id, task.timeline_id, err
                 );
                 shutdown_process = true;
             } else {
                 error!(
-                    "Task '{}' tenant_id: {:?}, timeline_id: {:?} exited with error: {:?}",
-                    task_name, task.tenant_id, task.timeline_id, err
+                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} exited with error: {:?}",
+                    task_name, task.tenant_shard_id, task.timeline_id, err
                 );
             }
         }
         Err(err) => {
             if shutdown_process_on_error {
                 error!(
-                    "Shutting down: task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                    task_name, task.tenant_id, task.timeline_id, err
+                    "Shutting down: task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                    task_name, task.tenant_shard_id, task.timeline_id, err
                );
                 shutdown_process = true;
             } else {
                 error!(
-                    "Task '{}' tenant_id: {:?}, timeline_id: {:?} panicked: {:?}",
-                    task_name, task.tenant_id, task.timeline_id, err
+                    "Task '{}' tenant_shard_id: {:?}, timeline_id: {:?} panicked: {:?}",
+                    task_name, task.tenant_shard_id, task.timeline_id, err
                 );
             }
         }
@@ -467,11 +470,11 @@ async fn task_finish(
 ///
 /// Or to shut down all tasks for given timeline:
 ///
-///   shutdown_tasks(None, Some(tenant_id), Some(timeline_id))
+///   shutdown_tasks(None, Some(tenant_shard_id), Some(timeline_id))
 ///
 pub async fn shutdown_tasks(
     kind: Option<TaskKind>,
-    tenant_id: Option<TenantId>,
+    tenant_shard_id: Option<TenantShardId>,
     timeline_id: Option<TimelineId>,
 ) {
     let mut victim_tasks = Vec::new();
@@ -480,35 +483,35 @@ pub async fn shutdown_tasks(
         let tasks = TASKS.lock().unwrap();
         for task in tasks.values() {
             if (kind.is_none() || Some(task.kind) == kind)
-                && (tenant_id.is_none() || task.tenant_id == tenant_id)
+                && (tenant_shard_id.is_none() || task.tenant_shard_id == tenant_shard_id)
                 && (timeline_id.is_none() || task.timeline_id == timeline_id)
             {
                 task.cancel.cancel();
                 victim_tasks.push((
                     Arc::clone(task),
                     task.kind,
-                    task.tenant_id,
+                    task.tenant_shard_id,
                     task.timeline_id,
                 ));
             }
         }
    }
 
-    let log_all = kind.is_none() && tenant_id.is_none() && timeline_id.is_none();
+    let log_all = kind.is_none() && tenant_shard_id.is_none() && timeline_id.is_none();
 
-    for (task, task_kind, tenant_id, timeline_id) in victim_tasks {
+    for (task, task_kind, tenant_shard_id, timeline_id) in victim_tasks {
         let join_handle = {
             let mut task_mut = task.mutable.lock().unwrap();
             task_mut.join_handle.take()
         };
         if let Some(mut join_handle) = join_handle {
             if log_all {
-                if tenant_id.is_none() {
+                if tenant_shard_id.is_none() {
                     // there are quite few of these
                     info!(name = task.name, kind = ?task_kind, "stopping global task");
                 } else {
                     // warn to catch these in tests; there shouldn't be any
-                    warn!(name = task.name, tenant_id = ?tenant_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
+                    warn!(name = task.name, tenant_shard_id = ?tenant_shard_id, timeline_id = ?timeline_id, kind = ?task_kind, "stopping left-over");
                 }
             }
             if tokio::time::timeout(std::time::Duration::from_secs(1), &mut join_handle)
@@ -517,12 +520,13 @@ pub async fn shutdown_tasks(
             {
                 // allow some time to elapse before logging to cut down the number of log
                 // lines.
-                info!("waiting for {} to shut down", task.name);
+                info!("waiting for task {} to shut down", task.name);
                 // we never handled this return value, but:
                 // - we don't deschedule which would lead to is_cancelled
                 // - panics are already logged (is_panicked)
                 // - task errors are already logged in the wrapper
                 let _ = join_handle.await;
+                info!("task {} completed", task.name);
             }
         } else {
             // Possibly one of:
@@ -556,9 +560,14 @@ pub async fn shutdown_watcher() {
/// cancelled. It can however be moved to other tasks, such as `tokio::task::spawn_blocking` or
 /// `tokio::task::JoinSet::spawn`.
 pub fn shutdown_token() -> CancellationToken {
-    SHUTDOWN_TOKEN
-        .try_with(|t| t.clone())
-        .expect("shutdown_token() called in an unexpected task or thread")
+    let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
+
+    if cfg!(test) {
+        res.unwrap_or_default()
+    } else {
+        // tests need to call the same paths which need to use get the shutdown token
+        res.expect("shutdown_token() called in an unexpected task or thread")
+    }
 }
 
 /// Has the current task been requested to shut down?
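The `shutdown_token()` change above makes the task-local lookup fall back to a fresh, never-cancelled token under `cfg!(test)`. Below is a self-contained sketch of the same pattern, assuming `tokio` (with the `macros` and `rt` features) and `tokio-util`; apart from the function body quoted from the diff, the surrounding names and the `main` harness are illustrative.

```rust
use tokio_util::sync::CancellationToken;

tokio::task_local! {
    // Task-local shutdown token, set up when a managed task is spawned.
    static SHUTDOWN_TOKEN: CancellationToken;
}

/// Returns the current task's shutdown token. Outside a managed task this
/// panics in production builds, but in `cfg!(test)` it falls back to a fresh
/// (never-cancelled) token so test helpers can reuse production code paths.
pub fn shutdown_token() -> CancellationToken {
    let res = SHUTDOWN_TOKEN.try_with(|t| t.clone());
    if cfg!(test) {
        res.unwrap_or_default()
    } else {
        res.expect("shutdown_token() called in an unexpected task or thread")
    }
}

#[tokio::main]
async fn main() {
    let token = CancellationToken::new();
    // Run some work with the token scoped to the current task.
    SHUTDOWN_TOKEN
        .scope(token.clone(), async {
            let t = shutdown_token();
            assert!(!t.is_cancelled());
        })
        .await;
    token.cancel();
}
```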
(File diff suppressed because it is too large.)
@@ -8,9 +8,12 @@
|
|||||||
//! We cannot use global or default config instead, because wrong settings
|
//! We cannot use global or default config instead, because wrong settings
|
||||||
//! may lead to a data loss.
|
//! may lead to a data loss.
|
||||||
//!
|
//!
|
||||||
use anyhow::Context;
|
use anyhow::bail;
|
||||||
use pageserver_api::models;
|
use pageserver_api::models;
|
||||||
|
use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
|
||||||
|
use serde::de::IntoDeserializer;
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
|
use serde_json::Value;
|
||||||
use std::num::NonZeroU64;
|
use std::num::NonZeroU64;
|
||||||
use std::time::Duration;
|
use std::time::Duration;
|
||||||
use utils::generation::Generation;
|
use utils::generation::Generation;
|
||||||
@@ -88,6 +91,14 @@ pub(crate) struct LocationConf {
|
|||||||
/// The location-specific part of the configuration, describes the operating
|
/// The location-specific part of the configuration, describes the operating
|
||||||
/// mode of this pageserver for this tenant.
|
/// mode of this pageserver for this tenant.
|
||||||
pub(crate) mode: LocationMode,
|
pub(crate) mode: LocationMode,
|
||||||
|
|
||||||
|
/// The detailed shard identity. This structure is already scoped within
|
||||||
|
/// a TenantShardId, but we need the full ShardIdentity to enable calculating
|
||||||
|
/// key->shard mappings.
|
||||||
|
#[serde(default = "ShardIdentity::unsharded")]
|
||||||
|
#[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
|
||||||
|
pub(crate) shard: ShardIdentity,
|
||||||
|
|
||||||
/// The pan-cluster tenant configuration, the same on all locations
|
/// The pan-cluster tenant configuration, the same on all locations
|
||||||
pub(crate) tenant_conf: TenantConfOpt,
|
pub(crate) tenant_conf: TenantConfOpt,
|
||||||
}
|
}
|
||||||
@@ -160,6 +171,8 @@ impl LocationConf {
|
|||||||
generation,
|
generation,
|
||||||
attach_mode: AttachmentMode::Single,
|
attach_mode: AttachmentMode::Single,
|
||||||
}),
|
}),
|
||||||
|
// Legacy configuration loads are always from tenants created before sharding existed.
|
||||||
|
shard: ShardIdentity::unsharded(),
|
||||||
tenant_conf,
|
tenant_conf,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -187,6 +200,7 @@ impl LocationConf {
|
|||||||
|
|
||||||
fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
|
fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
|
||||||
conf.generation
|
conf.generation
|
||||||
|
.map(Generation::new)
|
||||||
.ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
|
.ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -226,7 +240,21 @@ impl LocationConf {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
Ok(Self { mode, tenant_conf })
|
let shard = if conf.shard_count == 0 {
|
||||||
|
ShardIdentity::unsharded()
|
||||||
|
} else {
|
||||||
|
ShardIdentity::new(
|
||||||
|
ShardNumber(conf.shard_number),
|
||||||
|
ShardCount(conf.shard_count),
|
||||||
|
ShardStripeSize(conf.shard_stripe_size),
|
||||||
|
)?
|
||||||
|
};
|
||||||
|
|
||||||
|
Ok(Self {
|
||||||
|
shard,
|
||||||
|
mode,
|
||||||
|
tenant_conf,
|
||||||
|
})
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -241,6 +269,7 @@ impl Default for LocationConf {
|
|||||||
attach_mode: AttachmentMode::Single,
|
attach_mode: AttachmentMode::Single,
|
||||||
}),
|
}),
|
||||||
tenant_conf: TenantConfOpt::default(),
|
tenant_conf: TenantConfOpt::default(),
|
||||||
|
shard: ShardIdentity::unsharded(),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -494,105 +523,49 @@ impl Default for TenantConf {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Helper function to standardize the error messages we produce on bad durations
|
|
||||||
//
|
|
||||||
// Intended to be used with anyhow's `with_context`, e.g.:
|
|
||||||
//
|
|
||||||
// let value = result.with_context(bad_duration("name", &value))?;
|
|
||||||
//
|
|
||||||
fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
|
|
||||||
move || format!("Cannot parse `{field_name}` duration {value:?}")
|
|
||||||
}
|
|
||||||
|
|
||||||
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
|
||||||
type Error = anyhow::Error;
|
type Error = anyhow::Error;
|
||||||
|
|
||||||
fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
|
     fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
-        let mut tenant_conf = TenantConfOpt::default();
-
-        if let Some(gc_period) = &request_data.gc_period {
-            tenant_conf.gc_period = Some(
-                humantime::parse_duration(gc_period)
-                    .with_context(bad_duration("gc_period", gc_period))?,
-            );
-        }
-        tenant_conf.gc_horizon = request_data.gc_horizon;
-        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
-
-        if let Some(pitr_interval) = &request_data.pitr_interval {
-            tenant_conf.pitr_interval = Some(
-                humantime::parse_duration(pitr_interval)
-                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
-            );
-        }
-
-        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
-            tenant_conf.walreceiver_connect_timeout = Some(
-                humantime::parse_duration(walreceiver_connect_timeout).with_context(
-                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
-                )?,
-            );
-        }
-        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
-            tenant_conf.lagging_wal_timeout = Some(
-                humantime::parse_duration(lagging_wal_timeout)
-                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
-            );
-        }
-        if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
-            tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
-        }
-        if let Some(trace_read_requests) = request_data.trace_read_requests {
-            tenant_conf.trace_read_requests = Some(trace_read_requests);
-        }
-
-        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
-            tenant_conf.checkpoint_timeout = Some(
-                humantime::parse_duration(checkpoint_timeout)
-                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
-            );
-        }
-
-        tenant_conf.compaction_target_size = request_data.compaction_target_size;
-        tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-        if let Some(compaction_period) = &request_data.compaction_period {
-            tenant_conf.compaction_period = Some(
-                humantime::parse_duration(compaction_period)
-                    .with_context(bad_duration("compaction_period", compaction_period))?,
-            );
-        }
-
-        if let Some(eviction_policy) = &request_data.eviction_policy {
-            tenant_conf.eviction_policy = Some(
-                serde::Deserialize::deserialize(eviction_policy)
-                    .context("parse field `eviction_policy`")?,
-            );
-        }
-
-        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
-
-        if let Some(evictions_low_residence_duration_metric_threshold) =
-            &request_data.evictions_low_residence_duration_metric_threshold
-        {
-            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
-                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
-                    .with_context(bad_duration(
-                        "evictions_low_residence_duration_metric_threshold",
-                        evictions_low_residence_duration_metric_threshold,
-                    ))?,
-            );
-        }
-        tenant_conf.gc_feedback = request_data.gc_feedback;
+        // Convert the request_data to a JSON Value
+        let json_value: Value = serde_json::to_value(request_data)?;
+
+        // Create a Deserializer from the JSON Value
+        let deserializer = json_value.into_deserializer();
+
+        // Use serde_path_to_error to deserialize the JSON Value into TenantConfOpt
+        let tenant_conf: TenantConfOpt = serde_path_to_error::deserialize(deserializer)?;

         Ok(tenant_conf)
     }
 }
+
+impl TryFrom<toml_edit::Item> for TenantConfOpt {
+    type Error = anyhow::Error;
+
+    fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
+        match item {
+            toml_edit::Item::Value(value) => {
+                let d = value.into_deserializer();
+                return serde_path_to_error::deserialize(d)
+                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
+            }
+            toml_edit::Item::Table(table) => {
+                let deserializer = toml_edit::de::Deserializer::new(table.into());
+                return serde_path_to_error::deserialize(deserializer)
+                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
+            }
+            _ => {
+                bail!("expected non-inline table but found {item}")
+            }
+        }
+    }
+}
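
Note: the value of routing deserialization through serde_path_to_error, as the two TryFrom impls above do, is that the error carries the path of the offending field. A minimal self-contained sketch of the same pattern; ExampleConf and parse are invented for illustration and are not part of this diff:

use serde::de::IntoDeserializer;
use serde::Deserialize;

#[derive(Debug, Default, Deserialize)]
#[serde(default)]
struct ExampleConf {
    gc_horizon: Option<u64>,
}

fn parse(json: serde_json::Value) -> anyhow::Result<ExampleConf> {
    let deserializer = json.into_deserializer();
    // On failure the error is prefixed with the path to the bad field,
    // e.g. `gc_horizon: invalid type: string "x", expected u64`.
    serde_path_to_error::deserialize(deserializer)
        .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner()))
}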

 #[cfg(test)]
 mod tests {
     use super::*;
+    use models::TenantConfig;

     #[test]
     fn de_serializing_pageserver_config_omits_empty_values() {
@@ -609,4 +582,38 @@ mod tests {
         assert_eq!(json_form, "{\"gc_horizon\":42}");
         assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap());
     }
+
+    #[test]
+    fn test_try_from_models_tenant_config_err() {
+        let tenant_config = models::TenantConfig {
+            lagging_wal_timeout: Some("5a".to_string()),
+            ..TenantConfig::default()
+        };
+
+        let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config);
+
+        assert!(
+            tenant_conf_opt.is_err(),
+            "Suceeded to convert TenantConfig to TenantConfOpt"
+        );
+
+        let expected_error_str =
+            "lagging_wal_timeout: invalid value: string \"5a\", expected a duration";
+        assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str);
+    }
+
+    #[test]
+    fn test_try_from_models_tenant_config_success() {
+        let tenant_config = models::TenantConfig {
+            lagging_wal_timeout: Some("5s".to_string()),
+            ..TenantConfig::default()
+        };
+
+        let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config).unwrap();
+
+        assert_eq!(
+            tenant_conf_opt.lagging_wal_timeout,
+            Some(Duration::from_secs(5))
+        );
+    }
 }

@@ -2,22 +2,19 @@ use std::sync::Arc;

 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::models::TenantState;
+use pageserver_api::{models::TenantState, shard::TenantShardId};
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, instrument, warn, Instrument, Span};
+use tracing::{error, instrument, Instrument, Span};

-use utils::{
-    backoff, completion, crashsafe, fs_ext,
-    id::{TenantId, TimelineId},
-};
+use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};

 use crate::{
     config::PageServerConf,
     context::RequestContext,
     task_mgr::{self, TaskKind},
-    InitializationOrder,
+    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
 };

 use super::{
@@ -59,10 +56,10 @@ type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;

 fn remote_tenant_delete_mark_path(
     conf: &PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> anyhow::Result<RemotePath> {
     let tenant_remote_path = conf
-        .tenant_path(tenant_id)
+        .tenant_path(tenant_shard_id)
         .strip_prefix(&conf.workdir)
         .context("Failed to strip workdir prefix")
         .and_then(RemotePath::new)
@@ -73,15 +70,17 @@ fn remote_tenant_delete_mark_path(
 async fn create_remote_delete_mark(
     conf: &PageServerConf,
     remote_storage: &GenericRemoteStorage,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
-    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;

     let data: &[u8] = &[];
     backoff::retry(
         || async {
+            let data = bytes::Bytes::from_static(data);
+            let stream = futures::stream::once(futures::future::ready(Ok(data)));
             remote_storage
-                .upload(data, 0, &remote_mark_path, None)
+                .upload(stream, 0, &remote_mark_path, None)
                 .await
         },
         |_e| false,
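
Note: the upload call above now takes a stream body rather than a byte slice. As a rough standalone sketch of that adapter (the function name is illustrative and not part of the remote_storage API):

use bytes::Bytes;
use futures::Stream;

// Wrap a static byte slice into a one-item stream of Ok(Bytes), the shape a
// streaming upload body expects.
fn one_shot_body(data: &'static [u8]) -> impl Stream<Item = std::io::Result<Bytes>> {
    futures::stream::once(futures::future::ready(Ok(Bytes::from_static(data))))
}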
@@ -99,9 +98,9 @@ async fn create_remote_delete_mark(

 async fn create_local_delete_mark(
     conf: &PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
-    let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
+    let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);

     // Note: we're ok to replace existing file.
     let _ = std::fs::OpenOptions::new()
@@ -170,10 +169,10 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
 async fn remove_tenant_remote_delete_mark(
     conf: &PageServerConf,
     remote_storage: Option<&GenericRemoteStorage>,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
     if let Some(remote_storage) = remote_storage {
-        let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
         backoff::retry(
             || async { remote_storage.delete(&path).await },
             |_e| false,
@@ -192,7 +191,7 @@ async fn remove_tenant_remote_delete_mark(
 // Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
 async fn cleanup_remaining_fs_traces(
     conf: &PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
     let rm = |p: Utf8PathBuf, is_dir: bool| async move {
         if is_dir {
@@ -204,8 +203,8 @@ async fn cleanup_remaining_fs_traces(
             .with_context(|| format!("failed to delete {p}"))
     };

-    rm(conf.tenant_config_path(tenant_id), false).await?;
-    rm(conf.tenant_location_config_path(tenant_id), false).await?;
+    rm(conf.tenant_config_path(tenant_shard_id), false).await?;
+    rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;

     fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
         Err(anyhow::anyhow!(
@@ -213,7 +212,7 @@ async fn cleanup_remaining_fs_traces(
         ))?
     });

-    rm(conf.timelines_path(tenant_id), true).await?;
+    rm(conf.timelines_path(tenant_shard_id), true).await?;

     fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
         Err(anyhow::anyhow!(
@@ -227,14 +226,14 @@ async fn cleanup_remaining_fs_traces(
     // to be reordered later and thus missed if a crash occurs.
     // Note that we dont need to sync after mark file is removed
    // because we can tolerate the case when mark file reappears on startup.
-    let tenant_path = &conf.tenant_path(tenant_id);
+    let tenant_path = &conf.tenant_path(tenant_shard_id);
     if tenant_path.exists() {
-        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
+        crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
             .await
             .context("fsync_pre_mark_remove")?;
     }

-    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
+    rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;

     fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
         Err(anyhow::anyhow!(
@@ -242,7 +241,7 @@ async fn cleanup_remaining_fs_traces(
         ))?
     });

-    rm(conf.tenant_path(tenant_id), true).await?;
+    rm(conf.tenant_path(tenant_shard_id), true).await?;

     Ok(())
 }
@@ -287,6 +286,8 @@ impl DeleteTenantFlow {
     ) -> Result<(), DeleteTenantError> {
         span::debug_assert_current_span_has_tenant_id();

+        pausable_failpoint!("tenant-delete-before-run");
+
         let mut guard = Self::prepare(&tenant).await?;

         if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
@@ -321,7 +322,7 @@ impl DeleteTenantFlow {
         // Though sounds scary, different mark name?
         // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
         if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
                 .await
                 .context("remote_mark")?
         }
@@ -332,7 +333,7 @@ impl DeleteTenantFlow {
             ))?
         });

-        create_local_delete_mark(conf, &tenant.tenant_id)
+        create_local_delete_mark(conf, &tenant.tenant_shard_id)
             .await
             .context("local delete mark")?;

@@ -374,9 +375,11 @@ impl DeleteTenantFlow {
             return Ok(acquire(tenant));
         }

-        let tenant_id = tenant.tenant_id;
         // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+        if conf
+            .tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
+            .exists()
+        {
             Ok(acquire(tenant))
         } else {
             Ok(None)
@@ -388,7 +391,6 @@ impl DeleteTenantFlow {
         tenant: &Arc<Tenant>,
         preload: Option<TenantPreload>,
         tenants: &'static std::sync::RwLock<TenantsMap>,
-        init_order: Option<InitializationOrder>,
         ctx: &RequestContext,
     ) -> Result<(), DeleteTenantError> {
         let (_, progress) = completion::channel();
@@ -398,10 +400,7 @@ impl DeleteTenantFlow {
             .await
             .expect("cant be stopping or broken");

-        tenant
-            .attach(init_order, preload, ctx)
-            .await
-            .context("attach")?;
+        tenant.attach(preload, ctx).await.context("attach")?;

         Self::background(
             guard,
@@ -459,15 +458,22 @@ impl DeleteTenantFlow {
         tenants: &'static std::sync::RwLock<TenantsMap>,
         tenant: Arc<Tenant>,
     ) {
-        let tenant_id = tenant.tenant_id;
+        let tenant_shard_id = tenant.tenant_shard_id;
+
+        let cancel = crate::PAGESERVER_SHUTDOWN_TOKEN
+            .get()
+            .cloned()
+            .unwrap_or_default()
+            .child_token();

         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
+            Some(tenant_shard_id),
             None,
             "tenant_delete",
             false,
+            cancel,
             async move {
                 if let Err(err) =
                     Self::background(guard, conf, remote_storage, tenants, &tenant).await
@@ -478,7 +484,7 @@ impl DeleteTenantFlow {
                 Ok(())
             }
             .instrument({
-                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
+                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
                 span.follows_from(Span::current());
                 span
             }),
@@ -516,7 +522,7 @@ impl DeleteTenantFlow {
             }
         }

-        let timelines_path = conf.timelines_path(&tenant.tenant_id);
+        let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
         // May not exist if we fail in cleanup_remaining_fs_traces after removing it
         if timelines_path.exists() {
             // sanity check to guard against layout changes
@@ -525,7 +531,8 @@ impl DeleteTenantFlow {
             .context("timelines dir not empty")?;
         }

-        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
+        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
+            .await?;

         fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
             Err(anyhow::anyhow!(
@@ -533,21 +540,73 @@ impl DeleteTenantFlow {
             ))?
         });

-        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
+        cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
             .await
             .context("cleanup_remaining_fs_traces")?;

         {
-            let mut locked = tenants.write().unwrap();
-            if locked.remove(&tenant.tenant_id).is_none() {
-                warn!("Tenant got removed from tenants map during deletion");
-            };
+            pausable_failpoint!("tenant-delete-before-map-remove");

-            // FIXME: we should not be modifying this from outside of mgr.rs.
-            // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
-            crate::metrics::TENANT_MANAGER
-                .tenant_slots
-                .set(locked.len() as u64);
+            // This block is simply removing the TenantSlot for this tenant. It requires a loop because
+            // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
+            //
+            // This complexity will go away when we simplify how deletion works:
+            // https://github.com/neondatabase/neon/issues/5080
+            loop {
+                // Under the TenantMap lock, try to remove the tenant. We usually succeed, but if
+                // we encounter an InProgress marker, yield the barrier it contains and wait on it.
+                let barrier = {
+                    let mut locked = tenants.write().unwrap();
+                    let removed = locked.remove(tenant.tenant_shard_id);
+
+                    // FIXME: we should not be modifying this from outside of mgr.rs.
+                    // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
+                    crate::metrics::TENANT_MANAGER
+                        .tenant_slots
+                        .set(locked.len() as u64);
+
+                    match removed {
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
+                            match tenant.current_state() {
+                                TenantState::Stopping { .. } | TenantState::Broken { .. } => {
+                                    // Expected: we put the tenant into stopping state before we start deleting it
+                                }
+                                state => {
+                                    // Unexpected state
+                                    tracing::warn!(
+                                        "Tenant in unexpected state {state} after deletion"
+                                    );
+                                }
+                            }
+                            break;
+                        }
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
+                            // This is unexpected: this secondary tenants should not have been created, and we
+                            // are not in a position to shut it down from here.
+                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
+                            break;
+                        }
+                        TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
+                            unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
+                        }
+                        TenantsMapRemoveResult::Vacant => {
+                            tracing::warn!(
+                                "Tenant removed from TenantsMap before deletion completed"
+                            );
+                            break;
+                        }
+                        TenantsMapRemoveResult::InProgress(barrier) => {
+                            // An InProgress entry was found, we must wait on its barrier
+                            barrier
+                        }
+                    }
+                };
+
+                tracing::info!(
+                    "Waiting for competing operation to complete before deleting state for tenant"
+                );
+                barrier.wait().await;
+            }
         }

         *guard = Self::Finished;
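
Note: the remove-or-wait loop introduced above follows a common pattern: decide under the lock, and if a competing operation owns the slot, wait outside the lock and retry. A schematic version with toy types; Notify stands in for the completion barrier and none of these names come from the codebase:

use std::collections::BTreeMap;
use std::sync::Arc;
use tokio::sync::{Mutex, Notify};

enum Slot {
    Ready(String),
    InProgress(Arc<Notify>),
}

async fn remove_when_free(map: &Mutex<BTreeMap<u32, Slot>>, key: u32) -> Option<String> {
    loop {
        let waiter = {
            let mut locked = map.lock().await;
            match locked.remove(&key) {
                None => return None,
                Some(Slot::Ready(v)) => return Some(v),
                Some(Slot::InProgress(n)) => {
                    // Competing operation in progress: put the marker back and wait.
                    let waiter = n.clone();
                    locked.insert(key, Slot::InProgress(n));
                    waiter
                }
            }
        };
        // Wait outside the lock, then retry from the top.
        waiter.notified().await;
    }
}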

@@ -7,18 +7,19 @@ use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
 use camino::Utf8PathBuf;
+use pageserver_api::shard::TenantShardId;
 use std::cmp::min;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
 use std::sync::atomic::AtomicU64;
 use tracing::*;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;

 pub struct EphemeralFile {
     page_cache_file_id: page_cache::FileId,

-    _tenant_id: TenantId,
+    _tenant_shard_id: TenantShardId,
     _timeline_id: TimelineId,
     file: VirtualFile,
     len: u64,
@@ -31,7 +32,7 @@ pub struct EphemeralFile {
 impl EphemeralFile {
     pub async fn create(
         conf: &PageServerConf,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
     ) -> Result<EphemeralFile, io::Error> {
         static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -39,7 +40,7 @@ impl EphemeralFile {
             NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

         let filename = conf
-            .timeline_path(&tenant_id, &timeline_id)
+            .timeline_path(&tenant_shard_id, &timeline_id)
             .join(Utf8PathBuf::from(format!(
                 "ephemeral-{filename_disambiguator}"
             )));
@@ -52,7 +53,7 @@ impl EphemeralFile {

         Ok(EphemeralFile {
             page_cache_file_id: page_cache::next_file_id(),
-            _tenant_id: tenant_id,
+            _tenant_shard_id: tenant_shard_id,
             _timeline_id: timeline_id,
             file,
             len: 0,
@@ -282,7 +283,7 @@ mod tests {
     ) -> Result<
         (
             &'static PageServerConf,
-            TenantId,
+            TenantShardId,
             TimelineId,
             RequestContext,
         ),
@@ -295,13 +296,13 @@ mod tests {
         // OK in a test.
         let conf: &'static PageServerConf = Box::leak(Box::new(conf));

-        let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
+        let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap();
         let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
-        fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
+        fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;

         let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

-        Ok((conf, tenant_id, timeline_id, ctx))
+        Ok((conf, tenant_shard_id, timeline_id, ctx))
     }

     #[tokio::test]
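
Aside: the filename disambiguator used by EphemeralFile::create above is just a process-wide atomic counter. A minimal standalone sketch of the same idea (the function name is invented for illustration):

use std::sync::atomic::{AtomicU64, Ordering};

fn next_ephemeral_name(prefix: &str) -> String {
    static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
    // Relaxed ordering is enough: we only need uniqueness, not synchronization.
    let n = NEXT_FILENAME.fetch_add(1, Ordering::Relaxed);
    format!("{prefix}-{n}")
}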

@@ -11,15 +11,12 @@
 use std::io::{self};

 use anyhow::{ensure, Context};
+use pageserver_api::shard::TenantShardId;
 use serde::{de::Error, Deserialize, Serialize, Serializer};
 use thiserror::Error;
 use utils::bin_ser::SerializeError;
 use utils::crashsafe::path_with_suffix_extension;
-use utils::{
-    bin_ser::BeSer,
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};

 use crate::config::PageServerConf;
 use crate::virtual_file::VirtualFile;
@@ -272,14 +269,14 @@ impl Serialize for TimelineMetadata {
 }

 /// Save timeline metadata to file
-#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
+#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
 pub async fn save_metadata(
     conf: &'static PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
     data: &TimelineMetadata,
 ) -> anyhow::Result<()> {
-    let path = conf.metadata_path(tenant_id, timeline_id);
+    let path = conf.metadata_path(tenant_shard_id, timeline_id);
     let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
     let metadata_bytes = data.to_bytes().context("serialize metadata")?;
     VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
@@ -299,10 +296,10 @@ pub enum LoadMetadataError {

 pub fn load_metadata(
     conf: &'static PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
     timeline_id: &TimelineId,
 ) -> Result<TimelineMetadata, LoadMetadataError> {
-    let metadata_path = conf.metadata_path(tenant_id, timeline_id);
+    let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
     let metadata_bytes = std::fs::read(metadata_path)?;

     Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
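
Note: the instrument attribute above records the tenant and shard as span fields. For reference, the general shape of recording a Display value in a tracing span field looks like this; the values are made up for illustration:

use tracing::{info, info_span};

fn example() {
    let tenant = "1f359dd625e519a1a4e8d7509690f6fc";
    let shard = "0001";
    // `%value` records the field via its Display impl; `?value` would use Debug.
    let span = info_span!("save_metadata", tenant_id = %tenant, shard_id = %shard);
    let _guard = span.enter();
    info!("metadata written");
}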

@@ -2,7 +2,8 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::key::Key;
+use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
@@ -29,7 +30,9 @@ use crate::control_plane_client::{
 use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::TENANT_MANAGER as METRICS;
 use crate::task_mgr::{self, TaskKind};
-use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
+use crate::tenant::config::{
+    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
+};
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
@@ -95,57 +98,100 @@ pub(crate) enum TenantsMap {
     ShuttingDown(BTreeMap<TenantShardId, TenantSlot>),
 }

-/// Helper for mapping shard-unaware functions to a sharding-aware map
-/// TODO(sharding): all users of this must be made shard-aware.
-fn exactly_one_or_none<'a>(
-    map: &'a BTreeMap<TenantShardId, TenantSlot>,
-    tenant_id: &TenantId,
-) -> Option<(&'a TenantShardId, &'a TenantSlot)> {
-    let mut slots = map.range(TenantShardId::tenant_range(*tenant_id));
-
-    // Retrieve the first two slots in the range: if both are populated, we must panic because the caller
-    // needs a shard-naive view of the world in which only one slot can exist for a TenantId at a time.
-    let slot_a = slots.next();
-    let slot_b = slots.next();
-    match (slot_a, slot_b) {
-        (None, None) => None,
-        (Some(slot), None) => {
-            // Exactly one matching slot
-            Some(slot)
-        }
-        (Some(_slot_a), Some(_slot_b)) => {
-            // Multiple shards for this tenant: cannot handle this yet.
-            // TODO(sharding): callers of get() should be shard-aware.
-            todo!("Attaching multiple shards in teh same tenant to the same pageserver")
-        }
-        (None, Some(_)) => unreachable!(),
-    }
+pub(crate) enum TenantsMapRemoveResult {
+    Occupied(TenantSlot),
+    Vacant,
+    InProgress(utils::completion::Barrier),
+}
+
+/// When resolving a TenantId to a shard, we may be looking for the 0th
+/// shard, or we might be looking for whichever shard holds a particular page.
+pub(crate) enum ShardSelector {
+    /// Only return the 0th shard, if it is present. If a non-0th shard is present,
+    /// ignore it.
+    Zero,
+    /// Pick the first shard we find for the TenantId
+    First,
+    /// Pick the shard that holds this key
+    Page(Key),
 }

 impl TenantsMap {
     /// Convenience function for typical usage, where we want to get a `Tenant` object, for
     /// working with attached tenants. If the TenantId is in the map but in Secondary state,
     /// None is returned.
-    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
+    pub(crate) fn get(&self, tenant_shard_id: &TenantShardId) -> Option<&Arc<Tenant>> {
         match self {
             TenantsMap::Initializing => None,
             TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                // TODO(sharding): callers of get() should be shard-aware.
-                exactly_one_or_none(m, tenant_id).and_then(|(_, slot)| slot.get_attached())
+                m.get(tenant_shard_id).and_then(|slot| slot.get_attached())
             }
         }
     }

-    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<TenantSlot> {
+    /// A page service client sends a TenantId, and to look up the correct Tenant we must
+    /// resolve this to a fully qualified TenantShardId.
+    fn resolve_shard(
+        &self,
+        tenant_id: &TenantId,
+        selector: ShardSelector,
+    ) -> Option<TenantShardId> {
+        let mut want_shard = None;
         match self {
             TenantsMap::Initializing => None,
             TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
-                key.and_then(|key| m.remove(&key))
+                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    match selector {
+                        ShardSelector::First => return Some(*slot.0),
+                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
+                            return Some(*slot.0)
+                        }
+                        ShardSelector::Page(key) => {
+                            if let Some(tenant) = slot.1.get_attached() {
+                                // First slot we see for this tenant, calculate the expected shard number
+                                // for the key: we will use this for checking if this and subsequent
+                                // slots contain the key, rather than recalculating the hash each time.
+                                if want_shard.is_none() {
+                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                                }
+
+                                if Some(tenant.shard_identity.number) == want_shard {
+                                    return Some(*slot.0);
+                                }
+                            } else {
+                                continue;
+                            }
+                        }
+                        _ => continue,
+                    }
+                }
+
+                // Fall through: we didn't find an acceptable shard
+                None
             }
         }
     }

+    /// Only for use from DeleteTenantFlow. This method directly removes a TenantSlot from the map.
+    ///
+    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
+    /// slot if the enclosed tenant is shutdown.
+    pub(crate) fn remove(&mut self, tenant_shard_id: TenantShardId) -> TenantsMapRemoveResult {
+        use std::collections::btree_map::Entry;
+        match self {
+            TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => match m.entry(tenant_shard_id) {
+                Entry::Occupied(entry) => match entry.get() {
+                    TenantSlot::InProgress(barrier) => {
+                        TenantsMapRemoveResult::InProgress(barrier.clone())
+                    }
+                    _ => TenantsMapRemoveResult::Occupied(entry.remove()),
+                },
+                Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
+            },
+        }
+    }
+
     pub(crate) fn len(&self) -> usize {
         match self {
             TenantsMap::Initializing => 0,
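
To make the ShardSelector::Page case concrete: hash the key once against the first attached shard's layout, then return the shard whose number matches. A toy version of that lookup follows; all types and the hashing rule are simplified stand-ins, not the pageserver_api ones:

#[derive(Clone, Copy, PartialEq, Eq, Debug)]
struct ToyShardNumber(u8);

struct ToyShard {
    number: ToyShardNumber,
    shard_count: u8,
}

fn toy_key_to_shard(key_hash: u32, shard_count: u8) -> ToyShardNumber {
    // Stand-in for the real key->shard mapping.
    ToyShardNumber((key_hash % shard_count.max(1) as u32) as u8)
}

fn resolve_page_shard(shards: &[ToyShard], key_hash: u32) -> Option<ToyShardNumber> {
    let mut want_shard = None;
    for shard in shards {
        // Compute the expected shard once, then compare each candidate against it.
        let want = *want_shard.get_or_insert_with(|| toy_key_to_shard(key_hash, shard.shard_count));
        if shard.number == want {
            return Some(shard.number);
        }
    }
    None
}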
@@ -190,49 +236,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
|
|||||||
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
|
static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
|
||||||
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
|
Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));
|
||||||
|
|
||||||
/// Create a directory, including parents. This does no fsyncs and makes
|
|
||||||
/// no guarantees about the persistence of the resulting metadata: for
|
|
||||||
/// use when creating dirs for use as cache.
|
|
||||||
async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
|
|
||||||
let mut dirs_to_create = Vec::new();
|
|
||||||
let mut path: &Utf8Path = path.as_ref();
|
|
||||||
|
|
||||||
// Figure out which directories we need to create.
|
|
||||||
loop {
|
|
||||||
let meta = tokio::fs::metadata(path).await;
|
|
||||||
match meta {
|
|
||||||
Ok(metadata) if metadata.is_dir() => break,
|
|
||||||
Ok(_) => {
|
|
||||||
return Err(std::io::Error::new(
|
|
||||||
std::io::ErrorKind::AlreadyExists,
|
|
||||||
format!("non-directory found in path: {path}"),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
|
|
||||||
Err(e) => return Err(e),
|
|
||||||
}
|
|
||||||
|
|
||||||
dirs_to_create.push(path);
|
|
||||||
|
|
||||||
match path.parent() {
|
|
||||||
Some(parent) => path = parent,
|
|
||||||
None => {
|
|
||||||
return Err(std::io::Error::new(
|
|
||||||
std::io::ErrorKind::InvalidInput,
|
|
||||||
format!("can't find parent of path '{path}'"),
|
|
||||||
));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Create directories from parent to child.
|
|
||||||
for &path in dirs_to_create.iter().rev() {
|
|
||||||
tokio::fs::create_dir(path).await?;
|
|
||||||
}
|
|
||||||
|
|
||||||
Ok(())
|
|
||||||
}
|
|
||||||
|
|
||||||
/// The TenantManager is responsible for storing and mutating the collection of all tenants
|
/// The TenantManager is responsible for storing and mutating the collection of all tenants
|
||||||
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
|
/// that this pageserver process has state for. Every Tenant and SecondaryTenant instance
|
||||||
/// lives inside the TenantManager.
|
/// lives inside the TenantManager.
|
||||||
@@ -250,8 +253,8 @@ pub struct TenantManager {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn emergency_generations(
|
fn emergency_generations(
|
||||||
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
|
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||||
) -> HashMap<TenantId, Generation> {
|
) -> HashMap<TenantShardId, Generation> {
|
||||||
tenant_confs
|
tenant_confs
|
||||||
.iter()
|
.iter()
|
||||||
.filter_map(|(tid, lc)| {
|
.filter_map(|(tid, lc)| {
|
||||||
@@ -271,16 +274,16 @@ fn emergency_generations(
|
|||||||
|
|
||||||
async fn init_load_generations(
|
async fn init_load_generations(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
|
tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
|
||||||
resources: &TenantSharedResources,
|
resources: &TenantSharedResources,
|
||||||
cancel: &CancellationToken,
|
cancel: &CancellationToken,
|
||||||
) -> anyhow::Result<Option<HashMap<TenantId, Generation>>> {
|
) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
|
||||||
let generations = if conf.control_plane_emergency_mode {
|
let generations = if conf.control_plane_emergency_mode {
|
||||||
error!(
|
error!(
|
||||||
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
"Emergency mode! Tenants will be attached unsafely using their last known generation"
|
||||||
);
|
);
|
||||||
emergency_generations(tenant_confs)
|
emergency_generations(tenant_confs)
|
||||||
} else if let Some(client) = ControlPlaneClient::new(conf, cancel) {
|
} else if let Some(client) = ControlPlaneClient::new(conf, cancel.child_token()) {
|
||||||
info!("Calling control plane API to re-attach tenants");
|
info!("Calling control plane API to re-attach tenants");
|
||||||
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
// If we are configured to use the control plane API, then it is the source of truth for what tenants to load.
|
||||||
match client.re_attach().await {
|
match client.re_attach().await {
|
||||||
@@ -317,7 +320,7 @@ async fn init_load_generations(
|
|||||||
fn load_tenant_config(
|
fn load_tenant_config(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
dentry: Utf8DirEntry,
|
dentry: Utf8DirEntry,
|
||||||
) -> anyhow::Result<Option<(TenantId, anyhow::Result<LocationConf>)>> {
|
) -> anyhow::Result<Option<(TenantShardId, anyhow::Result<LocationConf>)>> {
|
||||||
let tenant_dir_path = dentry.path().to_path_buf();
|
let tenant_dir_path = dentry.path().to_path_buf();
|
||||||
if crate::is_temporary(&tenant_dir_path) {
|
if crate::is_temporary(&tenant_dir_path) {
|
||||||
info!("Found temporary tenant directory, removing: {tenant_dir_path}");
|
info!("Found temporary tenant directory, removing: {tenant_dir_path}");
|
||||||
@@ -353,10 +356,10 @@ fn load_tenant_config(
|
|||||||
return Ok(None);
|
return Ok(None);
|
||||||
}
|
}
|
||||||
|
|
||||||
let tenant_id = match tenant_dir_path
|
let tenant_shard_id = match tenant_dir_path
|
||||||
.file_name()
|
.file_name()
|
||||||
.unwrap_or_default()
|
.unwrap_or_default()
|
||||||
.parse::<TenantId>()
|
.parse::<TenantShardId>()
|
||||||
{
|
{
|
||||||
Ok(id) => id,
|
Ok(id) => id,
|
||||||
Err(_) => {
|
Err(_) => {
|
||||||
@@ -366,8 +369,8 @@ fn load_tenant_config(
|
|||||||
};
|
};
|
||||||
|
|
||||||
Ok(Some((
|
Ok(Some((
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
Tenant::load_tenant_config(conf, &tenant_id),
|
Tenant::load_tenant_config(conf, &tenant_shard_id),
|
||||||
)))
|
)))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -378,7 +381,7 @@ fn load_tenant_config(
|
|||||||
/// seconds even on reasonably fast drives.
|
/// seconds even on reasonably fast drives.
|
||||||
async fn init_load_tenant_configs(
|
async fn init_load_tenant_configs(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
|
) -> anyhow::Result<HashMap<TenantShardId, anyhow::Result<LocationConf>>> {
|
||||||
let tenants_dir = conf.tenants_path();
|
let tenants_dir = conf.tenants_path();
|
||||||
|
|
||||||
let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
|
let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
|
||||||
@@ -428,19 +431,19 @@ pub async fn init_tenant_mgr(
|
|||||||
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;
|
||||||
|
|
||||||
// Construct `Tenant` objects and start them running
|
// Construct `Tenant` objects and start them running
|
||||||
for (tenant_id, location_conf) in tenant_configs {
|
for (tenant_shard_id, location_conf) in tenant_configs {
|
||||||
let tenant_dir_path = conf.tenant_path(&tenant_id);
|
let tenant_dir_path = conf.tenant_path(&tenant_shard_id);
|
||||||
|
|
||||||
let mut location_conf = match location_conf {
|
let mut location_conf = match location_conf {
|
||||||
Ok(l) => l,
|
Ok(l) => l,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");
|
warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}");
|
||||||
|
|
||||||
tenants.insert(
|
tenants.insert(
|
||||||
TenantShardId::unsharded(tenant_id),
|
tenant_shard_id,
|
||||||
TenantSlot::Attached(Tenant::create_broken_tenant(
|
TenantSlot::Attached(Tenant::create_broken_tenant(
|
||||||
conf,
|
conf,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
format!("{}", e),
|
format!("{}", e),
|
||||||
)),
|
)),
|
||||||
);
|
);
|
||||||
@@ -451,7 +454,7 @@ pub async fn init_tenant_mgr(
|
|||||||
let generation = if let Some(generations) = &tenant_generations {
|
let generation = if let Some(generations) = &tenant_generations {
|
||||||
// We have a generation map: treat it as the authority for whether
|
// We have a generation map: treat it as the authority for whether
|
||||||
// this tenant is really attached.
|
// this tenant is really attached.
|
||||||
if let Some(gen) = generations.get(&tenant_id) {
|
if let Some(gen) = generations.get(&tenant_shard_id) {
|
||||||
*gen
|
*gen
|
||||||
} else {
|
} else {
|
||||||
match &location_conf.mode {
|
match &location_conf.mode {
|
||||||
@@ -459,8 +462,8 @@ pub async fn init_tenant_mgr(
|
|||||||
// We do not require the control plane's permission for secondary mode
|
// We do not require the control plane's permission for secondary mode
|
||||||
// tenants, because they do no remote writes and hence require no
|
// tenants, because they do no remote writes and hence require no
|
||||||
// generation number
|
// generation number
|
||||||
info!(%tenant_id, "Loaded tenant in secondary mode");
|
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
|
||||||
tenants.insert(TenantShardId::unsharded(tenant_id), TenantSlot::Secondary);
|
tenants.insert(tenant_shard_id, TenantSlot::Secondary);
|
||||||
}
|
}
|
||||||
LocationMode::Attached(_) => {
|
LocationMode::Attached(_) => {
|
||||||
// TODO: augment re-attach API to enable the control plane to
|
// TODO: augment re-attach API to enable the control plane to
|
||||||
@@ -468,9 +471,9 @@ pub async fn init_tenant_mgr(
|
|||||||
// away local state, we can gracefully fall back to secondary here, if the control
|
// away local state, we can gracefully fall back to secondary here, if the control
|
||||||
// plane tells us so.
|
// plane tells us so.
|
||||||
// (https://github.com/neondatabase/neon/issues/5377)
|
// (https://github.com/neondatabase/neon/issues/5377)
|
||||||
info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response");
|
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
|
||||||
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
|
||||||
error!(%tenant_id,
|
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
|
||||||
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
"Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
@@ -482,21 +485,23 @@ pub async fn init_tenant_mgr(
|
|||||||
} else {
|
} else {
|
||||||
// Legacy mode: no generation information, any tenant present
|
// Legacy mode: no generation information, any tenant present
|
||||||
// on local disk may activate
|
// on local disk may activate
|
||||||
info!(%tenant_id, "Starting tenant in legacy mode, no generation",);
|
info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
|
||||||
Generation::none()
|
Generation::none()
|
||||||
};
|
};
|
||||||
|
|
||||||
// Presence of a generation number implies attachment: attach the tenant
|
// Presence of a generation number implies attachment: attach the tenant
|
||||||
// if it wasn't already, and apply the generation number.
|
// if it wasn't already, and apply the generation number.
|
||||||
location_conf.attach_in_generation(generation);
|
location_conf.attach_in_generation(generation);
|
||||||
Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
|
Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;
|
||||||
|
|
||||||
|
let shard_identity = location_conf.shard;
|
||||||
match tenant_spawn(
|
match tenant_spawn(
|
||||||
conf,
|
conf,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
&tenant_dir_path,
|
&tenant_dir_path,
|
||||||
resources.clone(),
|
resources.clone(),
|
||||||
AttachedTenantConf::try_from(location_conf)?,
|
AttachedTenantConf::try_from(location_conf)?,
|
||||||
|
shard_identity,
|
||||||
Some(init_order.clone()),
|
Some(init_order.clone()),
|
||||||
&TENANTS,
|
&TENANTS,
|
||||||
SpawnMode::Normal,
|
SpawnMode::Normal,
|
||||||
@@ -509,7 +514,7 @@ pub async fn init_tenant_mgr(
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!(%tenant_id, "Failed to start tenant: {e:#}");
|
error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -533,10 +538,11 @@ pub async fn init_tenant_mgr(
|
|||||||
#[allow(clippy::too_many_arguments)]
|
#[allow(clippy::too_many_arguments)]
|
||||||
pub(crate) fn tenant_spawn(
|
pub(crate) fn tenant_spawn(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
tenant_path: &Utf8Path,
|
tenant_path: &Utf8Path,
|
||||||
resources: TenantSharedResources,
|
resources: TenantSharedResources,
|
||||||
location_conf: AttachedTenantConf,
|
location_conf: AttachedTenantConf,
|
||||||
|
shard_identity: ShardIdentity,
|
||||||
init_order: Option<InitializationOrder>,
|
init_order: Option<InitializationOrder>,
|
||||||
tenants: &'static std::sync::RwLock<TenantsMap>,
|
tenants: &'static std::sync::RwLock<TenantsMap>,
|
||||||
mode: SpawnMode,
|
mode: SpawnMode,
|
||||||
@@ -557,18 +563,25 @@ pub(crate) fn tenant_spawn(
|
|||||||
"Cannot load tenant from empty directory {tenant_path:?}"
|
"Cannot load tenant from empty directory {tenant_path:?}"
|
||||||
);
|
);
|
||||||
|
|
||||||
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
|
let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
|
||||||
anyhow::ensure!(
|
anyhow::ensure!(
|
||||||
!conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
|
!conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(),
|
||||||
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
"Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
|
||||||
);
|
);
|
||||||
|
|
||||||
info!("Attaching tenant {tenant_id}");
|
info!(
|
||||||
|
tenant_id = %tenant_shard_id.tenant_id,
|
||||||
|
shard_id = %tenant_shard_id.shard_slug(),
|
||||||
|
generation = ?location_conf.location.generation,
|
||||||
|
attach_mode = ?location_conf.location.attach_mode,
|
||||||
|
"Attaching tenant"
|
||||||
|
);
|
||||||
let tenant = match Tenant::spawn(
|
let tenant = match Tenant::spawn(
|
||||||
conf,
|
conf,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
resources,
|
resources,
|
||||||
location_conf,
|
location_conf,
|
||||||
|
shard_identity,
|
||||||
init_order,
|
init_order,
|
||||||
tenants,
|
tenants,
|
||||||
mode,
|
mode,
|
||||||
@@ -576,8 +589,8 @@ pub(crate) fn tenant_spawn(
|
|||||||
) {
|
) {
|
||||||
Ok(tenant) => tenant,
|
Ok(tenant) => tenant,
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
|
error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
|
||||||
Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
|
Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -732,19 +745,20 @@ pub(crate) async fn create_tenant(
|
|||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
) -> Result<Arc<Tenant>, TenantMapInsertError> {
|
||||||
let location_conf = LocationConf::attached_single(tenant_conf, generation);
|
let location_conf = LocationConf::attached_single(tenant_conf, generation);
|
||||||
|
info!("Creating tenant at location {location_conf:?}");
|
||||||
|
|
||||||
let slot_guard =
|
let slot_guard =
|
||||||
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
|
tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
|
||||||
// TODO(sharding): make local paths shard-aware
|
let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
|
||||||
let tenant_path =
|
|
||||||
super::create_tenant_files(conf, &location_conf, &tenant_shard_id.tenant_id).await?;
|
|
||||||
|
|
||||||
|
let shard_identity = location_conf.shard;
|
||||||
 let created_tenant = tenant_spawn(
 conf,
-tenant_shard_id.tenant_id,
+tenant_shard_id,
 &tenant_path,
 resources,
 AttachedTenantConf::try_from(location_conf)?,
+shard_identity,
 None,
 &TENANTS,
 SpawnMode::Create,
@@ -774,15 +788,18 @@ pub(crate) async fn set_new_tenant_config(
 new_tenant_conf: TenantConfOpt,
 tenant_id: TenantId,
 ) -> Result<(), SetNewTenantConfigError> {
+// Legacy API: does not support sharding
+let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
 info!("configuring tenant {tenant_id}");
-let tenant = get_tenant(tenant_id, true)?;
+let tenant = get_tenant(tenant_shard_id, true)?;

 // This is a legacy API that only operates on attached tenants: the preferred
 // API to use is the location_config/ endpoint, which lets the caller provide
 // the full LocationConf.
 let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);

-Tenant::persist_tenant_config(conf, &tenant_id, &location_conf)
+Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
 .await
 .map_err(SetNewTenantConfigError::Persist)?;
 tenant.set_new_tenant_config(new_tenant_conf);
@@ -792,8 +809,6 @@ pub(crate) async fn set_new_tenant_config(
 impl TenantManager {
 /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
 /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
-///
-/// This method is cancel-safe.
 pub(crate) fn get_attached_tenant_shard(
 &self,
 tenant_shard_id: TenantShardId,
@@ -838,10 +853,12 @@ impl TenantManager {
 Ok(())
 }

+#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
 pub(crate) async fn upsert_location(
 &self,
 tenant_shard_id: TenantShardId,
 new_location_config: LocationConf,
+flush: Option<Duration>,
 ctx: &RequestContext,
 ) -> Result<(), anyhow::Error> {
 debug_assert_current_span_has_tenant_id();
@@ -850,7 +867,7 @@ impl TenantManager {
 // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
 // then we do not need to set the slot to InProgress, we can just call into the
 // existng tenant.
-{
+let modify_tenant = {
 let locked = self.tenants.read().unwrap();
 let peek_slot =
 tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -861,22 +878,50 @@ impl TenantManager {
 // take our fast path and just provide the updated configuration
 // to the tenant.
 tenant.set_new_location_config(AttachedTenantConf::try_from(
-new_location_config,
+new_location_config.clone(),
 )?);

-// Persist the new config in the background, to avoid holding up any
-// locks while we do so.
-// TODO
-
-return Ok(());
+Some(tenant.clone())
 } else {
 // Different generations, fall through to general case
+None
 }
 }
 _ => {
 // Not an Attached->Attached transition, fall through to general case
+None
 }
 }
+};

+// Fast-path continued: having dropped out of the self.tenants lock, do the async
+// phase of waiting for flush, before returning.
+if let Some(tenant) = modify_tenant {
+// Transition to AttachedStale means we may well hold a valid generation
+// still, and have been requested to go stale as part of a migration. If
+// the caller set `flush`, then flush to remote storage.
+if let LocationMode::Attached(AttachedLocationConfig {
+generation: _,
+attach_mode: AttachmentMode::Stale,
+}) = &new_location_config.mode
+{
+if let Some(flush_timeout) = flush {
+match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
+Ok(Err(e)) => {
+return Err(e);
+}
+Ok(Ok(_)) => return Ok(()),
+Err(_) => {
+tracing::warn!(
+timeout_ms = flush_timeout.as_millis(),
+"Timed out waiting for flush to remote storage, proceeding anyway."
+)
+}
+}
+}
+}

+return Ok(());
 }

 // General case for upserts to TenantsMap, excluding the case above: we will substitute an
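The fast path above bounds the optional flush with `tokio::time::timeout` and treats the elapsed case as a warning rather than a failure. A minimal standalone sketch of that three-way outcome handling, under the assumption that the flushed operation returns `anyhow::Result<()>`; `flush_remote` here is a hypothetical stand-in, not the pageserver's API:

```rust
use std::time::Duration;

// Hypothetical stand-in for the remote flush; the real call lives on Tenant.
async fn flush_remote() -> anyhow::Result<()> {
    tokio::time::sleep(Duration::from_millis(50)).await;
    Ok(())
}

async fn flush_with_deadline(flush: Option<Duration>) -> anyhow::Result<()> {
    if let Some(flush_timeout) = flush {
        match tokio::time::timeout(flush_timeout, flush_remote()).await {
            // Flush finished but reported an error: propagate it.
            Ok(Err(e)) => return Err(e),
            // Flush finished in time: nothing more to do.
            Ok(Ok(())) => return Ok(()),
            // Deadline elapsed: log and continue rather than failing the request.
            Err(_elapsed) => {
                eprintln!("timed out after {}ms, proceeding anyway", flush_timeout.as_millis());
            }
        }
    }
    Ok(())
}

#[tokio::main]
async fn main() -> anyhow::Result<()> {
    flush_with_deadline(Some(Duration::from_millis(10))).await
}
```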
@@ -915,55 +960,44 @@ impl TenantManager {
 slot_guard.drop_old_value().expect("We just shut it down");
 }

-// TODO(sharding): make local paths sharding-aware
-let tenant_path = self.conf.tenant_path(&tenant_shard_id.tenant_id);
+let tenant_path = self.conf.tenant_path(&tenant_shard_id);

 let new_slot = match &new_location_config.mode {
 LocationMode::Secondary(_) => {
 // Directory doesn't need to be fsync'd because if we crash it can
 // safely be recreated next time this tenant location is configured.
-unsafe_create_dir_all(&tenant_path)
+tokio::fs::create_dir_all(&tenant_path)
 .await
 .with_context(|| format!("Creating {tenant_path}"))?;

-// TODO(sharding): make local paths sharding-aware
-Tenant::persist_tenant_config(
-self.conf,
-&tenant_shard_id.tenant_id,
-&new_location_config,
-)
-.await
-.map_err(SetNewTenantConfigError::Persist)?;
+Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+.await
+.map_err(SetNewTenantConfigError::Persist)?;

 TenantSlot::Secondary
 }
 LocationMode::Attached(_attach_config) => {
-// TODO(sharding): make local paths sharding-aware
-let timelines_path = self.conf.timelines_path(&tenant_shard_id.tenant_id);
+let timelines_path = self.conf.timelines_path(&tenant_shard_id);

 // Directory doesn't need to be fsync'd because we do not depend on
 // it to exist after crashes: it may be recreated when tenant is
 // re-attached, see https://github.com/neondatabase/neon/issues/5550
-unsafe_create_dir_all(&timelines_path)
+tokio::fs::create_dir_all(&tenant_path)
 .await
 .with_context(|| format!("Creating {timelines_path}"))?;

-// TODO(sharding): make local paths sharding-aware
-Tenant::persist_tenant_config(
-self.conf,
-&tenant_shard_id.tenant_id,
-&new_location_config,
-)
-.await
-.map_err(SetNewTenantConfigError::Persist)?;
+Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+.await
+.map_err(SetNewTenantConfigError::Persist)?;

-// TODO(sharding): make spawn sharding-aware
+let shard_identity = new_location_config.shard;
 let tenant = tenant_spawn(
 self.conf,
-tenant_shard_id.tenant_id,
+tenant_shard_id,
 &tenant_path,
 self.resources.clone(),
 AttachedTenantConf::try_from(new_location_config)?,
+shard_identity,
 None,
 self.tenants,
 SpawnMode::Normal,
@@ -978,6 +1012,81 @@ impl TenantManager {

 Ok(())
 }

+/// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
+/// LocationConf that was last used to attach it. Optionally, the local file cache may be
+/// dropped before re-attaching.
+///
+/// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
+/// where an issue is identified that would go away with a restart of the tenant.
+///
+/// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
+/// to respect the cancellation tokens used in normal shutdown().
+#[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
+pub(crate) async fn reset_tenant(
+&self,
+tenant_shard_id: TenantShardId,
+drop_cache: bool,
+ctx: RequestContext,
+) -> anyhow::Result<()> {
+let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+let Some(old_slot) = slot_guard.get_old_value() else {
+anyhow::bail!("Tenant not found when trying to reset");
+};
+
+let Some(tenant) = old_slot.get_attached() else {
+slot_guard.revert();
+anyhow::bail!("Tenant is not in attached state");
+};
+
+let (_guard, progress) = utils::completion::channel();
+match tenant.shutdown(progress, false).await {
+Ok(()) => {
+slot_guard.drop_old_value()?;
+}
+Err(_barrier) => {
+slot_guard.revert();
+anyhow::bail!("Cannot reset Tenant, already shutting down");
+}
+}
+
+let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+let timelines_path = self.conf.timelines_path(&tenant_shard_id);
+let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+
+if drop_cache {
+tracing::info!("Dropping local file cache");
+
+match tokio::fs::read_dir(&timelines_path).await {
+Err(e) => {
+tracing::warn!("Failed to list timelines while dropping cache: {}", e);
+}
+Ok(mut entries) => {
+while let Some(entry) = entries.next_entry().await? {
+tokio::fs::remove_dir_all(entry.path()).await?;
+}
+}
+}
+}
+
+let shard_identity = config.shard;
+let tenant = tenant_spawn(
+self.conf,
+tenant_shard_id,
+&tenant_path,
+self.resources.clone(),
+AttachedTenantConf::try_from(config)?,
+shard_identity,
+None,
+self.tenants,
+SpawnMode::Normal,
+&ctx,
+)?;
+
+slot_guard.upsert(TenantSlot::Attached(tenant))?;
+
+Ok(())
+}
 }

 #[derive(Debug, thiserror::Error)]
@@ -1002,14 +1111,11 @@ pub(crate) enum GetTenantError {
 ///
 /// This method is cancel-safe.
 pub(crate) fn get_tenant(
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 active_only: bool,
 ) -> Result<Arc<Tenant>, GetTenantError> {
 let locked = TENANTS.read().unwrap();

-// TODO(sharding): make all callers of get_tenant shard-aware
-let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
 let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)?;

 match peek_slot {
@@ -1021,14 +1127,18 @@ pub(crate) fn get_tenant(
 TenantState::Active => Ok(Arc::clone(tenant)),
 _ => {
 if active_only {
-Err(GetTenantError::NotActive(tenant_id))
+Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
 } else {
 Ok(Arc::clone(tenant))
 }
 }
 },
-Some(TenantSlot::InProgress(_)) => Err(GetTenantError::NotActive(tenant_id)),
-None | Some(TenantSlot::Secondary) => Err(GetTenantError::NotFound(tenant_id)),
+Some(TenantSlot::InProgress(_)) => {
+Err(GetTenantError::NotActive(tenant_shard_id.tenant_id))
+}
+None | Some(TenantSlot::Secondary) => {
+Err(GetTenantError::NotFound(tenant_shard_id.tenant_id))
+}
 }
 }

@@ -1062,6 +1172,7 @@ pub(crate) enum GetActiveTenantError {
 /// then wait for up to `timeout` (minus however long we waited for the slot).
 pub(crate) async fn get_active_tenant_with_timeout(
 tenant_id: TenantId,
+shard_selector: ShardSelector,
 timeout: Duration,
 cancel: &CancellationToken,
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
@@ -1070,15 +1181,17 @@ pub(crate) async fn get_active_tenant_with_timeout(
 Tenant(Arc<Tenant>),
 }

-// TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
-// to decide which shard services the request)
-let tenant_shard_id = TenantShardId::unsharded(tenant_id);

 let wait_start = Instant::now();
 let deadline = wait_start + timeout;

-let wait_for = {
+let (wait_for, tenant_shard_id) = {
 let locked = TENANTS.read().unwrap();

+// Resolve TenantId to TenantShardId
+let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
+GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
+)?;
+
 let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
 .map_err(GetTenantError::MapState)?;
 match peek_slot {
@@ -1088,7 +1201,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
 // Fast path: we don't need to do any async waiting.
 return Ok(tenant.clone());
 }
-_ => WaitFor::Tenant(tenant.clone()),
+_ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
 }
 }
 Some(TenantSlot::Secondary) => {
@@ -1096,7 +1209,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
 tenant_id,
 )))
 }
-Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
+Some(TenantSlot::InProgress(barrier)) => {
+(WaitFor::Barrier(barrier.clone()), tenant_shard_id)
+}
 None => {
 return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
 tenant_id,
@@ -1181,8 +1296,7 @@ pub(crate) async fn delete_tenant(
 // See https://github.com/neondatabase/neon/issues/5080

 // TODO(sharding): make delete API sharding-aware
-let mut slot_guard =
-tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

 // unwrap is safe because we used MustExist mode when acquiring
 let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1238,6 +1352,11 @@ pub(crate) async fn detach_tenant(
 // Although we are cleaning up the tenant, this task is not meant to be bound by the lifetime of the tenant in memory.
 // After a tenant is detached, there are no more task_mgr tasks for that tenant_id.
 let task_tenant_id = None;
+let cancel = crate::PAGESERVER_SHUTDOWN_TOKEN
+.get()
+.cloned()
+.unwrap_or_default()
+.child_token();
 task_mgr::spawn(
 task_mgr::BACKGROUND_RUNTIME.handle(),
 TaskKind::MgmtRequest,
@@ -1245,6 +1364,7 @@ pub(crate) async fn detach_tenant(
 None,
 "tenant_files_delete",
 false,
+cancel,
 async move {
 fs::remove_dir_all(tmp_path.as_path())
 .await
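The detach path above derives a child token from a process-wide shutdown token and hands it to the spawned cleanup task. A minimal standalone sketch of that parent/child relationship with `tokio_util::sync::CancellationToken`; the task body is purely illustrative, not the pageserver's cleanup logic:

```rust
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    // Process-wide shutdown token; cancelling it cancels every child token.
    let shutdown = CancellationToken::new();
    let cancel = shutdown.child_token();

    let task = tokio::spawn(async move {
        tokio::select! {
            // Stop early if shutdown was requested.
            _ = cancel.cancelled() => println!("cancelled before finishing"),
            // Otherwise do the (illustrative) cleanup work.
            _ = tokio::time::sleep(std::time::Duration::from_secs(1)) => println!("cleanup done"),
        }
    });

    // Simulate process shutdown: the child token observes it immediately.
    shutdown.cancel();
    task.await.unwrap();
}
```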
@@ -1262,8 +1382,7 @@ async fn detach_tenant0(
 deletion_queue_client: &DeletionQueueClient,
 ) -> Result<Utf8PathBuf, TenantStateError> {
 let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
-// TODO(sharding): make local path helpers shard-aware
-let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean.tenant_id);
+let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
 safe_rename_tenant_dir(&local_tenant_directory)
 .await
 .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
@@ -1288,8 +1407,7 @@ async fn detach_tenant0(
 Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
 )
 {
-// TODO(sharding): make local paths sharding-aware
-let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id.tenant_id);
+let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
 if tenant_ignore_mark.exists() {
 info!("Detaching an ignored tenant");
 let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
@@ -1318,9 +1436,9 @@ pub(crate) async fn load_tenant(

 let slot_guard =
 tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
-let tenant_path = conf.tenant_path(&tenant_id);
+let tenant_path = conf.tenant_path(&tenant_shard_id);

-let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
+let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
 if tenant_ignore_mark.exists() {
 std::fs::remove_file(&tenant_ignore_mark).with_context(|| {
 format!(
@@ -1336,17 +1454,19 @@ pub(crate) async fn load_tenant(
 };

 let mut location_conf =
-Tenant::load_tenant_config(conf, &tenant_id).map_err(TenantMapInsertError::Other)?;
+Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
 location_conf.attach_in_generation(generation);

-Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
+Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

+let shard_identity = location_conf.shard;
 let new_tenant = tenant_spawn(
 conf,
-tenant_id,
+tenant_shard_id,
 &tenant_path,
 resources,
 AttachedTenantConf::try_from(location_conf)?,
+shard_identity,
 None,
 &TENANTS,
 SpawnMode::Normal,
@@ -1374,7 +1494,7 @@ async fn ignore_tenant0(
 let tenant_shard_id = TenantShardId::unsharded(tenant_id);

 remove_tenant_from_memory(tenants, tenant_shard_id, async {
-let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
+let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
 fs::File::create(&ignore_mark_file)
 .await
 .context("Failed to create ignore mark file")
@@ -1397,7 +1517,8 @@ pub(crate) enum TenantMapListError {
 ///
 /// Get list of tenants, for the mgmt API
 ///
-pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError> {
+pub(crate) async fn list_tenants() -> Result<Vec<(TenantShardId, TenantState)>, TenantMapListError>
+{
 let tenants = TENANTS.read().unwrap();
 let m = match &*tenants {
 TenantsMap::Initializing => return Err(TenantMapListError::Initializing),
@@ -1405,12 +1526,10 @@ pub(crate) async fn list_tenants() -> Result<Vec<(TenantId, TenantState)>, TenantMapListError>
 };
 Ok(m.iter()
 .filter_map(|(id, tenant)| match tenant {
-TenantSlot::Attached(tenant) => Some((id, tenant.current_state())),
+TenantSlot::Attached(tenant) => Some((*id, tenant.current_state())),
 TenantSlot::Secondary => None,
 TenantSlot::InProgress(_) => None,
 })
-// TODO(sharding): make callers of this function shard-aware
-.map(|(k, v)| (k.tenant_id, v))
 .collect())
 }

@@ -1432,16 +1551,18 @@ pub(crate) async fn attach_tenant(
 let slot_guard =
 tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
 let location_conf = LocationConf::attached_single(tenant_conf, generation);
-let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
+let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
 // TODO: tenant directory remains on disk if we bail out from here on.
 // See https://github.com/neondatabase/neon/issues/4233

+let shard_identity = location_conf.shard;
 let attached_tenant = tenant_spawn(
 conf,
-tenant_id,
+tenant_shard_id,
 &tenant_dir,
 resources,
 AttachedTenantConf::try_from(location_conf)?,
+shard_identity,
 None,
 &TENANTS,
 SpawnMode::Normal,
@@ -1507,9 +1628,10 @@ pub enum TenantSlotUpsertError {
 MapState(#[from] TenantMapError),
 }

-#[derive(Debug)]
+#[derive(Debug, thiserror::Error)]
 enum TenantSlotDropError {
 /// It is only legal to drop a TenantSlot if its contents are fully shut down
+#[error("Tenant was not shut down")]
 NotShutdown,
 }

@@ -1569,9 +1691,9 @@ impl SlotGuard {
 }
 }

-/// Take any value that was present in the slot before we acquired ownership
+/// Get any value that was present in the slot before we acquired ownership
 /// of it: in state transitions, this will be the old state.
-fn get_old_value(&mut self) -> &Option<TenantSlot> {
+fn get_old_value(&self) -> &Option<TenantSlot> {
 &self.old_value
 }

@@ -1789,7 +1911,7 @@ fn tenant_map_acquire_slot_impl(
 METRICS.tenant_slot_writes.inc();

 let mut locked = tenants.write().unwrap();
-let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
+let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
 let _guard = span.enter();

 let m = match &mut *locked {
@@ -1941,16 +2063,18 @@ use {
 };

 pub(crate) async fn immediate_gc(
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 timeline_id: TimelineId,
 gc_req: TimelineGcRequest,
+cancel: CancellationToken,
 ctx: &RequestContext,
 ) -> Result<tokio::sync::oneshot::Receiver<Result<GcResult, anyhow::Error>>, ApiError> {
 let guard = TENANTS.read().unwrap();

 let tenant = guard
-.get(&tenant_id)
+.get(&tenant_shard_id)
 .map(Arc::clone)
-.with_context(|| format!("tenant {tenant_id}"))
+.with_context(|| format!("tenant {tenant_shard_id}"))
 .map_err(|e| ApiError::NotFound(e.into()))?;

 let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
@@ -1960,21 +2084,51 @@ pub(crate) async fn immediate_gc(
 // Run in task_mgr to avoid race with tenant_detach operation
 let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
 let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+// TODO: spawning is redundant now, need to hold the gate
 task_mgr::spawn(
 &tokio::runtime::Handle::current(),
 TaskKind::GarbageCollector,
-Some(tenant_id),
+Some(tenant_shard_id),
 Some(timeline_id),
-&format!("timeline_gc_handler garbage collection run for tenant {tenant_id} timeline {timeline_id}"),
+&format!("timeline_gc_handler garbage collection run for tenant {tenant_shard_id} timeline {timeline_id}"),
 false,
+tenant.cancel.child_token(),
 async move {
 fail::fail_point!("immediate_gc_task_pre");
-let result = tenant
-.gc_iteration(Some(timeline_id), gc_horizon, pitr, &ctx)
-.instrument(info_span!("manual_gc", %tenant_id, %timeline_id))
+#[allow(unused_mut)]
+let mut result = tenant
+.gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
+.instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
 .await;
 // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
 // better once the types support it.

+#[cfg(feature = "testing")]
+{
+if let Ok(result) = result.as_mut() {
+// why not futures unordered? it seems it needs very much the same task structure
+// but would only run on single task.
+let mut js = tokio::task::JoinSet::new();
+for layer in std::mem::take(&mut result.doomed_layers) {
+js.spawn(layer.wait_drop());
+}
+tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped");
+while let Some(res) = js.join_next().await {
+res.expect("wait_drop should not panic");
+}
+}
+
+let timeline = tenant.get_timeline(timeline_id, false).ok();
+let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
+
+if let Some(rtc) = rtc {
+// layer drops schedule actions on remote timeline client to actually do the
+// deletions; don't care just exit fast about the shutdown error
+drop(rtc.wait_completion().await);
+}
+}
+
 match task_done.send(result) {
 Ok(_) => (),
 Err(result) => error!("failed to send gc result: {result:?}"),
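The testing-only block above fans the doomed layers out into a `tokio::task::JoinSet` and drains it with `join_next`. A minimal standalone sketch of that fan-out/drain pattern; the per-item future here is a placeholder sleep rather than the real `layer.wait_drop()`:

```rust
use std::time::Duration;
use tokio::task::JoinSet;

#[tokio::main]
async fn main() {
    let mut js = JoinSet::new();

    // Spawn one future per item we need to wait on (stand-in for layer.wait_drop()).
    for i in 0..4u64 {
        js.spawn(async move {
            tokio::time::sleep(Duration::from_millis(10 * i)).await;
            i
        });
    }

    println!("waiting for {} tasks", js.len());

    // Drain the set: join_next returns None once every task has completed.
    while let Some(res) = js.join_next().await {
        let i = res.expect("task should not panic");
        println!("task {i} finished");
    }
}
```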
@@ -188,8 +188,11 @@ use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};

+pub(crate) use download::download_initdb_tar_zst;
+use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
+pub(crate) use upload::upload_initdb_dir;
 use utils::backoff::{
 self, exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS,
 };
@@ -249,6 +252,11 @@ pub(crate) const FAILED_REMOTE_OP_RETRIES: u32 = 10;
 // retries. Uploads and deletions are retried forever, though.
 pub(crate) const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;

+pub(crate) const INITDB_PATH: &str = "initdb.tar.zst";
+
+/// Default buffer size when interfacing with [`tokio::fs::File`].
+pub(crate) const BUFFER_SIZE: usize = 32 * 1024;
+
 pub enum MaybeDeletedIndexPart {
 IndexPart(IndexPart),
 Deleted(IndexPart),
@@ -297,7 +305,7 @@ pub struct RemoteTimelineClient {

 runtime: tokio::runtime::Handle,

-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 timeline_id: TimelineId,
 generation: Generation,

@@ -321,7 +329,7 @@ impl RemoteTimelineClient {
 remote_storage: GenericRemoteStorage,
 deletion_queue_client: DeletionQueueClient,
 conf: &'static PageServerConf,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 timeline_id: TimelineId,
 generation: Generation,
 ) -> RemoteTimelineClient {
@@ -333,22 +341,29 @@ impl RemoteTimelineClient {
 } else {
 BACKGROUND_RUNTIME.handle().clone()
 },
-tenant_id,
+tenant_shard_id,
 timeline_id,
 generation,
 storage_impl: remote_storage,
 deletion_queue_client,
 upload_queue: Mutex::new(UploadQueue::Uninitialized),
-metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
+metrics: Arc::new(RemoteTimelineClientMetrics::new(
+&tenant_shard_id,
+&timeline_id,
+)),
 }
 }

 /// Initialize the upload queue for a remote storage that already received
 /// an index file upload, i.e., it's not empty.
 /// The given `index_part` must be the one on the remote.
-pub fn init_upload_queue(&self, index_part: &IndexPart) -> anyhow::Result<()> {
+pub fn init_upload_queue(
+&self,
+index_part: &IndexPart,
+cancel: CancellationToken,
+) -> anyhow::Result<()> {
 let mut upload_queue = self.upload_queue.lock().unwrap();
-upload_queue.initialize_with_current_remote_index_part(index_part)?;
+upload_queue.initialize_with_current_remote_index_part(index_part, cancel)?;
 self.update_remote_physical_size_gauge(Some(index_part));
 info!(
 "initialized upload queue from remote index with {} layer files",
@@ -362,9 +377,10 @@ impl RemoteTimelineClient {
 pub fn init_upload_queue_for_empty_remote(
 &self,
 local_metadata: &TimelineMetadata,
+cancel: CancellationToken,
 ) -> anyhow::Result<()> {
 let mut upload_queue = self.upload_queue.lock().unwrap();
-upload_queue.initialize_empty_remote(local_metadata)?;
+upload_queue.initialize_empty_remote(local_metadata, cancel)?;
 self.update_remote_physical_size_gauge(None);
 info!("initialized upload queue as empty");
 Ok(())
@@ -375,6 +391,7 @@ impl RemoteTimelineClient {
 pub fn init_upload_queue_stopped_to_continue_deletion(
 &self,
 index_part: &IndexPart,
+cancel: CancellationToken,
 ) -> anyhow::Result<()> {
 // FIXME: consider newtype for DeletedIndexPart.
 let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
@@ -383,7 +400,7 @@ impl RemoteTimelineClient {

 {
 let mut upload_queue = self.upload_queue.lock().unwrap();
-upload_queue.initialize_with_current_remote_index_part(index_part)?;
+upload_queue.initialize_with_current_remote_index_part(index_part, cancel)?;
 self.update_remote_physical_size_gauge(Some(index_part));
 }
 // also locks upload queue, without dropping the guard above it will be a deadlock
@@ -460,13 +477,13 @@ impl RemoteTimelineClient {

 let index_part = download::download_index_part(
 &self.storage_impl,
-&self.tenant_id,
+&self.tenant_shard_id,
 &self.timeline_id,
 self.generation,
 cancel,
 )
 .measure_remote_op(
-self.tenant_id,
+self.tenant_shard_id.tenant_id,
 self.timeline_id,
 RemoteOpFileKind::Index,
 RemoteOpKind::Download,
@@ -502,13 +519,13 @@ impl RemoteTimelineClient {
 download::download_layer_file(
 self.conf,
 &self.storage_impl,
-self.tenant_id,
+self.tenant_shard_id,
 self.timeline_id,
 layer_file_name,
 layer_metadata,
 )
 .measure_remote_op(
-self.tenant_id,
+self.tenant_shard_id.tenant_id,
 self.timeline_id,
 RemoteOpFileKind::Layer,
 RemoteOpKind::Download,
@@ -654,10 +671,10 @@ impl RemoteTimelineClient {
 let mut guard = self.upload_queue.lock().unwrap();
 let upload_queue = guard.initialized_mut()?;

-let with_generations =
+let with_metadata =
 self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());

-self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
+self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);

 // Launch the tasks immediately, if possible
 self.launch_queued_tasks(upload_queue);
@@ -692,7 +709,7 @@ impl RemoteTimelineClient {
 self: &Arc<Self>,
 upload_queue: &mut UploadQueueInitialized,
 names: I,
-) -> Vec<(LayerFileName, Generation)>
+) -> Vec<(LayerFileName, LayerFileMetadata)>
 where
 I: IntoIterator<Item = LayerFileName>,
 {
@@ -700,16 +717,17 @@ impl RemoteTimelineClient {
 // so we don't need update it. Just serialize it.
 let metadata = upload_queue.latest_metadata.clone();

-// Decorate our list of names with each name's generation, dropping
-// names that are unexpectedly missing from our metadata.
-let with_generations: Vec<_> = names
+// Decorate our list of names with each name's metadata, dropping
+// names that are unexpectedly missing from our metadata. This metadata
+// is later used when physically deleting layers, to construct key paths.
+let with_metadata: Vec<_> = names
 .into_iter()
 .filter_map(|name| {
 let meta = upload_queue.latest_files.remove(&name);

 if let Some(meta) = meta {
 upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-Some((name, meta.generation))
+Some((name, meta))
 } else {
 // This can only happen if we forgot to to schedule the file upload
 // before scheduling the delete. Log it because it is a rare/strange
@@ -722,9 +740,10 @@ impl RemoteTimelineClient {
 .collect();

 #[cfg(feature = "testing")]
-for (name, gen) in &with_generations {
-if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
-if &unexpected == gen {
+for (name, metadata) in &with_metadata {
+let gen = metadata.generation;
+if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) {
+if unexpected == gen {
 tracing::error!("{name} was unlinked twice with same generation");
 } else {
 tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
@@ -739,14 +758,14 @@ impl RemoteTimelineClient {
 self.schedule_index_upload(upload_queue, metadata);
 }

-with_generations
+with_metadata
 }

 /// Schedules deletion for layer files which have previously been unlinked from the
 /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
 pub(crate) fn schedule_deletion_of_unlinked(
 self: &Arc<Self>,
-layers: Vec<(LayerFileName, Generation)>,
+layers: Vec<(LayerFileName, LayerFileMetadata)>,
 ) -> anyhow::Result<()> {
 let mut guard = self.upload_queue.lock().unwrap();
 let upload_queue = guard.initialized_mut()?;
@@ -759,16 +778,22 @@ impl RemoteTimelineClient {
 fn schedule_deletion_of_unlinked0(
 self: &Arc<Self>,
 upload_queue: &mut UploadQueueInitialized,
-with_generations: Vec<(LayerFileName, Generation)>,
+with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
 ) {
-for (name, gen) in &with_generations {
-info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
+for (name, meta) in &with_metadata {
+info!(
+"scheduling deletion of layer {}{} (shard {})",
+name,
+meta.generation.get_suffix(),
+meta.shard
+);
 }

 #[cfg(feature = "testing")]
-for (name, gen) in &with_generations {
+for (name, meta) in &with_metadata {
+let gen = meta.generation;
 match upload_queue.dangling_files.remove(name) {
-Some(same) if &same == gen => { /* expected */ }
+Some(same) if same == gen => { /* expected */ }
 Some(other) => {
 tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
 }
@@ -780,7 +805,7 @@ impl RemoteTimelineClient {

 // schedule the actual deletions
 let op = UploadOp::Delete(Delete {
-layers: with_generations,
+layers: with_metadata,
 });
 self.calls_unfinished_metric_begin(&op);
 upload_queue.queued_operations.push_back(op);
@@ -809,23 +834,29 @@ impl RemoteTimelineClient {
 Ok(())
 }

-///
 /// Wait for all previously scheduled uploads/deletions to complete
-///
-pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
+pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
 let mut receiver = {
 let mut guard = self.upload_queue.lock().unwrap();
 let upload_queue = guard.initialized_mut()?;
-self.schedule_barrier(upload_queue)
+self.schedule_barrier0(upload_queue)
 };

 if receiver.changed().await.is_err() {
 anyhow::bail!("wait_completion aborted because upload queue was stopped");
 }

 Ok(())
 }

-fn schedule_barrier(
+pub(crate) fn schedule_barrier(self: &Arc<Self>) -> anyhow::Result<()> {
+let mut guard = self.upload_queue.lock().unwrap();
+let upload_queue = guard.initialized_mut()?;
+self.schedule_barrier0(upload_queue);
+Ok(())
+}
+
+fn schedule_barrier0(
 self: &Arc<Self>,
 upload_queue: &mut UploadQueueInitialized,
 ) -> tokio::sync::watch::Receiver<()> {
@@ -841,6 +872,56 @@ impl RemoteTimelineClient {
 receiver
 }

+/// Wait for all previously scheduled operations to complete, and then stop.
+///
+/// Not cancellation safe
+pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
+// On cancellation the queue is left in ackward state of refusing new operations but
+// proper stop is yet to be called. On cancel the original or some later task must call
+// `stop` or `shutdown`.
+let sg = scopeguard::guard((), |_| {
+tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error")
+});
+
+let fut = {
+let mut guard = self.upload_queue.lock().unwrap();
+let upload_queue = match &mut *guard {
+UploadQueue::Stopped(_) => return Ok(()),
+UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
+UploadQueue::Initialized(ref mut init) => init,
+};
+
+// if the queue is already stuck due to a shutdown operation which was cancelled, then
+// just don't add more of these as they would never complete.
+//
+// TODO: if launch_queued_tasks were to be refactored to accept a &mut UploadQueue
+// in every place we would not have to jump through this hoop, and this method could be
+// made cancellable.
+if !upload_queue.shutting_down {
+upload_queue.shutting_down = true;
+upload_queue.queued_operations.push_back(UploadOp::Shutdown);
+// this operation is not counted similar to Barrier
+
+self.launch_queued_tasks(upload_queue);
+}
+
+upload_queue.shutdown_ready.clone().acquire_owned()
+};
+
+let res = fut.await;
+
+scopeguard::ScopeGuard::into_inner(sg);
+
+match res {
+Ok(_permit) => unreachable!("shutdown_ready should not have been added permits"),
+Err(_closed) => {
+// expected
+}
+}
+
+self.stop()
+}
+
 /// Set the deleted_at field in the remote index file.
 ///
 /// This fails if the upload queue has not been `stop()`ed.
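The new `shutdown` above is documented as not cancellation safe and uses a `scopeguard` to make accidental cancellation loud: the guard's closure fires only if the future is dropped before reaching `ScopeGuard::into_inner`. A minimal standalone sketch of that pattern; the guarded work here is a placeholder sleep, not the upload queue:

```rust
use std::time::Duration;

// Not cancellation safe: if the returned future is dropped mid-way, the guard runs.
async fn not_cancel_safe_work() {
    let sg = scopeguard::guard((), |_| {
        eprintln!("work was cancelled; this should not happen");
    });

    // Placeholder for the real work (e.g. waiting on the upload queue to drain).
    tokio::time::sleep(Duration::from_millis(50)).await;

    // Reached the end normally: defuse the guard so it never fires.
    scopeguard::ScopeGuard::into_inner(sg);
}

#[tokio::main]
async fn main() {
    // Completes normally: no message printed.
    not_cancel_safe_work().await;

    // Cancelled by a short timeout: dropping the future fires the guard.
    let _ = tokio::time::timeout(Duration::from_millis(1), not_cancel_safe_work()).await;
}
```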
@@ -892,7 +973,7 @@ impl RemoteTimelineClient {
 || {
 upload::upload_index_part(
 &self.storage_impl,
-&self.tenant_id,
+&self.tenant_shard_id,
 &self.timeline_id,
 self.generation,
 &index_part_with_deleted_at,
@@ -950,8 +1031,9 @@ impl RemoteTimelineClient {
 .drain()
 .map(|(file_name, meta)| {
 remote_layer_path(
-&self.tenant_id,
+&self.tenant_shard_id.tenant_id,
 &self.timeline_id,
+meta.shard,
 &file_name,
 meta.generation,
 )
@@ -964,7 +1046,7 @@ impl RemoteTimelineClient {

 // Do not delete index part yet, it is needed for possible retry. If we remove it first
 // and retry will arrive to different pageserver there wont be any traces of it on remote storage
-let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
+let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);

 // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
 // taking the burden of listing all the layers that we already know we should delete.
@@ -1000,12 +1082,22 @@ impl RemoteTimelineClient {
 .unwrap_or(
 // No generation-suffixed indices, assume we are dealing with
 // a legacy index.
-remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
+remote_index_path(&self.tenant_shard_id, &self.timeline_id, Generation::none()),
 );

 let remaining_layers: Vec<RemotePath> = remaining
 .into_iter()
-.filter(|p| p != &latest_index)
+.filter(|p| {
+if p == &latest_index {
+return false;
+}
+if let Some(name) = p.object_name() {
+if name == INITDB_PATH {
+return false;
+}
+}
+true
+})
 .inspect(|path| {
 if let Some(name) = path.object_name() {
 info!(%name, "deleting a file not referenced from index_part.json");
@@ -1071,7 +1163,9 @@ impl RemoteTimelineClient {
 upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
 }

-UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(),
+UploadOp::Barrier(_) | UploadOp::Shutdown => {
+upload_queue.inprogress_tasks.is_empty()
+}
 };

 // If we cannot launch this task, don't look any further.
@@ -1084,6 +1178,13 @@ impl RemoteTimelineClient {
 break;
 }

+if let UploadOp::Shutdown = next_op {
+// leave the op in the queue but do not start more tasks; it will be dropped when
+// the stop is called.
+upload_queue.shutdown_ready.close();
+break;
+}
+
 // We can launch this task. Remove it from the queue first.
 let next_op = upload_queue.queued_operations.pop_front().unwrap();

@@ -1104,6 +1205,7 @@ impl RemoteTimelineClient {
 sender.send_replace(());
 continue;
 }
+UploadOp::Shutdown => unreachable!("shutdown is intentionally never popped off"),
 };

 // Assign unique ID to this task
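Several of these hunks teach the upload queue a `Shutdown` sentinel: it is pushed once, blocks further launches when it reaches the front of the queue, and is intentionally never popped. A minimal standalone sketch of that sentinel-in-a-queue idea using a plain `VecDeque`; the op names mirror the diff, but the types here are illustrative rather than the pageserver's:

```rust
use std::collections::VecDeque;

#[derive(Debug)]
enum UploadOp {
    UploadLayer(&'static str),
    Shutdown,
}

fn launch_queued_tasks(queue: &mut VecDeque<UploadOp>) {
    while let Some(next_op) = queue.front() {
        // The sentinel stays in the queue: stop launching and let shutdown proceed elsewhere.
        if let UploadOp::Shutdown = next_op {
            println!("shutdown reached the front; no more tasks will start");
            break;
        }
        // A real implementation would spawn a task here; we just log the op.
        let op = queue.pop_front().unwrap();
        println!("launching {op:?}");
    }
}

fn main() {
    let mut queue = VecDeque::new();
    queue.push_back(UploadOp::UploadLayer("layer-a"));
    queue.push_back(UploadOp::UploadLayer("layer-b"));
    queue.push_back(UploadOp::Shutdown);

    launch_queued_tasks(&mut queue);

    // The sentinel is still at the front afterwards, exactly as in the diff.
    assert!(matches!(queue.front(), Some(UploadOp::Shutdown)));
}
```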
@@ -1122,20 +1224,21 @@ impl RemoteTimelineClient {

 // Spawn task to perform the task
 let self_rc = Arc::clone(self);
-let tenant_id = self.tenant_id;
+let tenant_shard_id = self.tenant_shard_id;
 let timeline_id = self.timeline_id;
 task_mgr::spawn(
 &self.runtime,
 TaskKind::RemoteUploadTask,
-Some(self.tenant_id),
+Some(self.tenant_shard_id),
 Some(self.timeline_id),
 "remote upload",
 false,
+upload_queue.cancel.child_token(),
 async move {
 self_rc.perform_upload_task(task).await;
 Ok(())
 }
-.instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
+.instrument(info_span!(parent: None, "remote_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, %upload_task_id)),
 );

 // Loop back to process next task
@@ -1187,7 +1290,7 @@ impl RemoteTimelineClient {
 self.generation,
 )
 .measure_remote_op(
-self.tenant_id,
+self.tenant_shard_id.tenant_id,
 self.timeline_id,
 RemoteOpFileKind::Layer,
 RemoteOpKind::Upload,
@@ -1207,13 +1310,13 @@ impl RemoteTimelineClient {

 let res = upload::upload_index_part(
 &self.storage_impl,
-&self.tenant_id,
+&self.tenant_shard_id,
 &self.timeline_id,
 self.generation,
 index_part,
 )
 .measure_remote_op(
-self.tenant_id,
+self.tenant_shard_id.tenant_id,
 self.timeline_id,
 RemoteOpFileKind::Index,
 RemoteOpKind::Upload,
@@ -1229,20 +1332,22 @@ impl RemoteTimelineClient {
 }
 res
 }
-UploadOp::Delete(delete) => self
-.deletion_queue_client
-.push_layers(
-self.tenant_id,
-self.timeline_id,
-self.generation,
-delete.layers.clone(),
-)
-.await
-.map_err(|e| anyhow::anyhow!(e)),
-UploadOp::Barrier(_) => {
+UploadOp::Delete(delete) => {
+pausable_failpoint!("before-delete-layer-pausable");
+self.deletion_queue_client
+.push_layers(
+self.tenant_shard_id,
+self.timeline_id,
+self.generation,
+delete.layers.clone(),
+)
+.await
+.map_err(|e| anyhow::anyhow!(e))
+}
+unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => {
 // unreachable. Barrier operations are handled synchronously in
 // launch_queued_tasks
-warn!("unexpected Barrier operation in perform_upload_task");
+warn!("unexpected {unexpected:?} operation in perform_upload_task");
 break;
 }
 };
@@ -1336,7 +1441,7 @@ impl RemoteTimelineClient {
 upload_queue.num_inprogress_deletions -= 1;
 None
 }
-UploadOp::Barrier(_) => unreachable!(),
+UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
 };

 // Launch any queued tasks that were unblocked by this one.
@@ -1350,7 +1455,7 @@ impl RemoteTimelineClient {
 // data safety guarantees (see docs/rfcs/025-generation-numbers.md)
 self.deletion_queue_client
 .update_remote_consistent_lsn(
-self.tenant_id,
+self.tenant_shard_id,
 self.timeline_id,
 self.generation,
 lsn,
@@ -1391,7 +1496,7 @@ impl RemoteTimelineClient {
 reason: "should we track deletes? positive or negative sign?",
 },
 ),
-UploadOp::Barrier(_) => {
+UploadOp::Barrier(..) | UploadOp::Shutdown => {
 // we do not account these
 return None;
 }
@@ -1417,10 +1522,13 @@ impl RemoteTimelineClient {
 }

 /// Close the upload queue for new operations and cancel queued operations.
+///
+/// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
///
|
||||||
/// In-progress operations will still be running after this function returns.
|
/// In-progress operations will still be running after this function returns.
|
||||||
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
/// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
|
||||||
/// to wait for them to complete, after calling this function.
|
/// to wait for them to complete, after calling this function.
|
||||||
pub fn stop(&self) -> Result<(), StopError> {
|
pub(crate) fn stop(&self) -> Result<(), StopError> {
|
||||||
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
// Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
|
||||||
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
|
// into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
|
||||||
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
|
// The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
|
||||||
@@ -1458,6 +1566,15 @@ impl RemoteTimelineClient {
|
|||||||
queued_operations: VecDeque::default(),
|
queued_operations: VecDeque::default(),
|
||||||
#[cfg(feature = "testing")]
|
#[cfg(feature = "testing")]
|
||||||
dangling_files: HashMap::default(),
|
dangling_files: HashMap::default(),
|
||||||
|
shutting_down: false,
|
||||||
|
shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
|
||||||
|
// TODO: this is the only place where we cannot reasonably continue the
|
||||||
|
// tree
|
||||||
|
cancel: crate::PAGESERVER_SHUTDOWN_TOKEN
|
||||||
|
.get()
|
||||||
|
.cloned()
|
||||||
|
.unwrap_or_default()
|
||||||
|
.child_token(),
|
||||||
};
|
};
|
||||||
|
|
||||||
let upload_queue = std::mem::replace(
|
let upload_queue = std::mem::replace(
|
||||||
@@ -1503,24 +1620,32 @@ impl RemoteTimelineClient {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
|
pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
|
||||||
let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
|
let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
|
||||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
|
pub fn remote_timeline_path(
|
||||||
remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string()))
|
tenant_shard_id: &TenantShardId,
|
||||||
|
timeline_id: &TimelineId,
|
||||||
|
) -> RemotePath {
|
||||||
|
remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Note that the shard component of a remote layer path is _not_ always the same
|
||||||
|
/// as in the TenantShardId of the caller: tenants may reference layers from a different
|
||||||
|
/// ShardIndex. Use the ShardIndex from the layer's metadata.
|
||||||
pub fn remote_layer_path(
|
pub fn remote_layer_path(
|
||||||
tenant_id: &TenantId,
|
tenant_id: &TenantId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
|
shard: ShardIndex,
|
||||||
layer_file_name: &LayerFileName,
|
layer_file_name: &LayerFileName,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
) -> RemotePath {
|
) -> RemotePath {
|
||||||
// Generation-aware key format
|
// Generation-aware key format
|
||||||
let path = format!(
|
let path = format!(
|
||||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
"tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
|
||||||
|
shard.get_suffix(),
|
||||||
layer_file_name.file_name(),
|
layer_file_name.file_name(),
|
||||||
generation.get_suffix()
|
generation.get_suffix()
|
||||||
);
|
);
|
||||||
@@ -1528,13 +1653,20 @@ pub fn remote_layer_path(
|
|||||||
RemotePath::from_string(&path).expect("Failed to construct path")
|
RemotePath::from_string(&path).expect("Failed to construct path")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
|
||||||
|
RemotePath::from_string(&format!(
|
||||||
|
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{INITDB_PATH}"
|
||||||
|
))
|
||||||
|
.expect("Failed to construct path")
|
||||||
|
}
|
||||||
|
|
||||||
pub fn remote_index_path(
|
pub fn remote_index_path(
|
||||||
tenant_id: &TenantId,
|
tenant_shard_id: &TenantShardId,
|
||||||
timeline_id: &TimelineId,
|
timeline_id: &TimelineId,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
) -> RemotePath {
|
) -> RemotePath {
|
||||||
RemotePath::from_string(&format!(
|
RemotePath::from_string(&format!(
|
||||||
"tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
"tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
|
||||||
IndexPart::FILE_NAME,
|
IndexPart::FILE_NAME,
|
||||||
generation.get_suffix()
|
generation.get_suffix()
|
||||||
))
|
))
|
||||||
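The path helpers above compose remote object keys from a tenant (or tenant-shard) component, a fixed timelines segment, the timeline id, and optional shard and generation suffixes. The sketch below only illustrates the resulting key shapes; the concrete suffix spellings ("-0104", "-00000005"), the "timelines" segment name, and the layer file name are assumptions made for illustration, not taken from this diff.

```rust
fn main() {
    // Hypothetical values, purely to show the key shapes the helpers above produce.
    let tenant_id = "3aa8fcc61f6d357410b7de754b1d9001";
    let timeline_id = "11223344556677881122334455667788";
    let shard_suffix = "-0104"; // assumed spelling of ShardIndex::get_suffix()
    let generation_suffix = "-00000005"; // assumed spelling of Generation::get_suffix()
    let layer_name = "SOME_LAYER_FILE_NAME"; // placeholder

    // remote_layer_path: the shard suffix is attached to the tenant component.
    let layer_key = format!(
        "tenants/{tenant_id}{shard_suffix}/timelines/{timeline_id}/{layer_name}{generation_suffix}"
    );

    // remote_index_path: the whole tenant-shard id forms the tenant component.
    let index_key = format!(
        "tenants/{tenant_id}{shard_suffix}/timelines/{timeline_id}/index_part.json{generation_suffix}"
    );

    println!("{layer_key}\n{index_key}");
}
```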
@@ -1676,14 +1808,14 @@ mod tests {
 Arc::new(RemoteTimelineClient {
 conf: self.harness.conf,
 runtime: tokio::runtime::Handle::current(),
-tenant_id: self.harness.tenant_id,
+tenant_shard_id: self.harness.tenant_shard_id,
 timeline_id: TIMELINE_ID,
 generation,
 storage_impl: self.harness.remote_storage.clone(),
 deletion_queue_client: self.harness.deletion_queue.new_client(),
 upload_queue: Mutex::new(UploadQueue::Uninitialized),
 metrics: Arc::new(RemoteTimelineClientMetrics::new(
-&self.harness.tenant_id,
+&self.harness.tenant_shard_id,
 &TIMELINE_ID,
 )),
 })
@@ -1759,6 +1891,7 @@ mod tests {
 println!("remote_timeline_dir: {remote_timeline_dir}");
 
 let generation = harness.generation;
+let shard = harness.shard;
 
 // Create a couple of dummy files, schedule upload for them
 
@@ -1775,7 +1908,7 @@ mod tests {
 harness.conf,
 &timeline,
 name,
-LayerFileMetadata::new(contents.len() as u64, generation),
+LayerFileMetadata::new(contents.len() as u64, generation, shard),
 )
 }).collect::<Vec<_>>();
 
@@ -1924,7 +2057,7 @@ mod tests {
 harness.conf,
 &timeline,
 layer_file_name_1.clone(),
-LayerFileMetadata::new(content_1.len() as u64, harness.generation),
+LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
 );
 
 #[derive(Debug, PartialEq, Clone, Copy)]
@@ -2010,7 +2143,12 @@ mod tests {
 std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");
 
 let index_path = test_state.harness.remote_fs_dir.join(
-remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
+remote_index_path(
+&test_state.harness.tenant_shard_id,
+&TIMELINE_ID,
+generation,
+)
+.get_path(),
 );
 eprintln!("Writing {index_path}");
 std::fs::write(&index_path, index_part_bytes).unwrap();
@@ -8,10 +8,12 @@ use std::future::Future;
 use std::time::Duration;
 
 use anyhow::{anyhow, Context};
-use camino::Utf8Path;
-use tokio::fs;
-use tokio::io::AsyncWriteExt;
+use camino::{Utf8Path, Utf8PathBuf};
+use pageserver_api::shard::TenantShardId;
+use tokio::fs::{self, File, OpenOptions};
+use tokio::io::{AsyncSeekExt, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
+use tracing::warn;
 use utils::{backoff, crashsafe};
 
 use crate::config::PageServerConf;
@@ -19,14 +21,15 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
+use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;
 
 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
-parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
-FAILED_REMOTE_OP_RETRIES,
+parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
+FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
 };
 
 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
@@ -39,7 +42,7 @@ static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
 pub async fn download_layer_file<'a>(
 conf: &'static PageServerConf,
 storage: &'a GenericRemoteStorage,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 timeline_id: TimelineId,
 layer_file_name: &'a LayerFileName,
 layer_metadata: &'a LayerFileMetadata,
@@ -47,12 +50,13 @@ pub async fn download_layer_file<'a>(
 debug_assert_current_span_has_tenant_and_timeline_id();
 
 let local_path = conf
-.timeline_path(&tenant_id, &timeline_id)
+.timeline_path(&tenant_shard_id, &timeline_id)
 .join(layer_file_name.file_name());
 
 let remote_path = remote_layer_path(
-&tenant_id,
+&tenant_shard_id.tenant_id,
 &timeline_id,
+layer_metadata.shard,
 layer_file_name,
 layer_metadata.generation,
 );
@@ -71,12 +75,11 @@ pub async fn download_layer_file<'a>(
 
 let (mut destination_file, bytes_amount) = download_retry(
 || async {
-// TODO: this doesn't use the cached fd for some reason?
-let mut destination_file = fs::File::create(&temp_file_path)
+let destination_file = tokio::fs::File::create(&temp_file_path)
 .await
 .with_context(|| format!("create a destination file for layer '{temp_file_path}'"))
 .map_err(DownloadError::Other)?;
-let mut download = storage
+let download = storage
 .download(&remote_path)
 .await
 .with_context(|| {
@@ -86,9 +89,14 @@ pub async fn download_layer_file<'a>(
 })
 .map_err(DownloadError::Other)?;
 
+let mut destination_file =
+tokio::io::BufWriter::with_capacity(super::BUFFER_SIZE, destination_file);
+
+let mut reader = tokio_util::io::StreamReader::new(download.download_stream);
+
 let bytes_amount = tokio::time::timeout(
 MAX_DOWNLOAD_DURATION,
-tokio::io::copy(&mut download.download_stream, &mut destination_file),
+tokio::io::copy_buf(&mut reader, &mut destination_file),
 )
 .await
 .map_err(|e| DownloadError::Other(anyhow::anyhow!("Timed out {:?}", e)))?
@@ -99,6 +107,8 @@ pub async fn download_layer_file<'a>(
 })
 .map_err(DownloadError::Other)?;
 
+let destination_file = destination_file.into_inner();
+
 Ok((destination_file, bytes_amount))
 },
 &format!("download {remote_path:?}"),
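The hunk above replaces `tokio::io::copy` on the raw download stream with a `tokio_util::io::StreamReader` feeding a `BufWriter` via `copy_buf`. A minimal, self-contained sketch of that pattern, with an in-memory chunk stream standing in for the remote download stream (that substitution, the file name, and the buffer size are assumptions for the example):

```rust
use tokio::io::AsyncWriteExt;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Stand-in for a remote download stream: Result<Bytes, io::Error> chunks.
    let chunks = vec![
        Ok::<_, std::io::Error>(bytes::Bytes::from_static(b"first chunk, ")),
        Ok(bytes::Bytes::from_static(b"second chunk")),
    ];
    let stream = futures::stream::iter(chunks);

    // Adapt the stream to AsyncBufRead and copy it into a buffered destination file.
    let mut reader = tokio_util::io::StreamReader::new(stream);
    let file = tokio::fs::File::create("layer.download.tmp").await?;
    let mut writer = tokio::io::BufWriter::with_capacity(64 * 1024, file);

    let bytes_amount = tokio::io::copy_buf(&mut reader, &mut writer).await?;
    writer.flush().await?; // flush buffered data before handing the file back
    println!("wrote {bytes_amount} bytes");
    Ok(())
}
```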
@@ -169,10 +179,10 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
 /// List timelines of given tenant in remote storage
 pub async fn list_remote_timelines(
 storage: &GenericRemoteStorage,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 cancel: CancellationToken,
 ) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
-let remote_path = remote_timelines_path(&tenant_id);
+let remote_path = remote_timelines_path(&tenant_shard_id);
 
 fail::fail_point!("storage-sync-list-remote-timelines", |_| {
 anyhow::bail!("storage-sync-list-remote-timelines");
@@ -180,7 +190,7 @@ pub async fn list_remote_timelines(
 
 let listing = download_retry_forever(
 || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
-&format!("list timelines for {tenant_id}"),
+&format!("list timelines for {tenant_shard_id}"),
 cancel,
 )
 .await?;
@@ -190,7 +200,7 @@ pub async fn list_remote_timelines(
 
 for timeline_remote_storage_key in listing.prefixes {
 let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
-anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
+anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}")
 })?;
 
 match object_name.parse::<TimelineId>() {
@@ -211,25 +221,27 @@ pub async fn list_remote_timelines(
 
 async fn do_download_index_part(
 storage: &GenericRemoteStorage,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
 timeline_id: &TimelineId,
 index_generation: Generation,
 cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
-let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
+use futures::stream::StreamExt;
+
+let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);
 
 let index_part_bytes = download_retry_forever(
 || async {
-let mut index_part_download = storage.download(&remote_path).await?;
+let index_part_download = storage.download(&remote_path).await?;
 
 let mut index_part_bytes = Vec::new();
-tokio::io::copy(
-&mut index_part_download.download_stream,
-&mut index_part_bytes,
-)
-.await
-.with_context(|| format!("download index part at {remote_path:?}"))
-.map_err(DownloadError::Other)?;
+let mut stream = std::pin::pin!(index_part_download.download_stream);
+while let Some(chunk) = stream.next().await {
+let chunk = chunk
+.with_context(|| format!("download index part at {remote_path:?}"))
+.map_err(DownloadError::Other)?;
+index_part_bytes.extend_from_slice(&chunk[..]);
+}
 Ok(index_part_bytes)
 },
 &format!("download {remote_path:?}"),
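The rewritten `do_download_index_part` body above collects the download stream chunk by chunk with `StreamExt::next` instead of copying it as an `AsyncRead`. The same loop in isolation, with a fabricated two-chunk stream in place of the real download (that stream is an assumption for the example):

```rust
use futures::stream::StreamExt;

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // Fabricated download stream with two chunks.
    let mut stream = std::pin::pin!(futures::stream::iter(vec![
        Ok::<_, std::io::Error>(bytes::Bytes::from_static(b"{\"version\":")),
        Ok(bytes::Bytes::from_static(b"1}")),
    ]));

    let mut index_part_bytes = Vec::new();
    while let Some(chunk) = stream.next().await {
        let chunk = chunk?; // each chunk may carry a transport error
        index_part_bytes.extend_from_slice(&chunk[..]);
    }
    assert_eq!(index_part_bytes, b"{\"version\":1}");
    Ok(())
}
```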
@@ -252,7 +264,7 @@ async fn do_download_index_part(
 #[tracing::instrument(skip_all, fields(generation=?my_generation))]
 pub(super) async fn download_index_part(
 storage: &GenericRemoteStorage,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
 timeline_id: &TimelineId,
 my_generation: Generation,
 cancel: CancellationToken,
@@ -261,8 +273,14 @@ pub(super) async fn download_index_part(
 
 if my_generation.is_none() {
 // Operating without generations: just fetch the generation-less path
-return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
-.await;
+return do_download_index_part(
+storage,
+tenant_shard_id,
+timeline_id,
+my_generation,
+cancel,
+)
+.await;
 }
 
 // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
@@ -271,7 +289,7 @@ pub(super) async fn download_index_part(
 // This is an optimization to avoid doing the listing for the general case below.
 let res = do_download_index_part(
 storage,
-tenant_id,
+tenant_shard_id,
 timeline_id,
 my_generation,
 cancel.clone(),
@@ -298,7 +316,7 @@ pub(super) async fn download_index_part(
 // This is an optimization to avoid doing the listing for the general case below.
 let res = do_download_index_part(
 storage,
-tenant_id,
+tenant_shard_id,
 timeline_id,
 my_generation.previous(),
 cancel.clone(),
@@ -320,8 +338,9 @@ pub(super) async fn download_index_part(
 }
 
 // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
-// objects, and select the highest one with a generation <= my_generation.
-let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
+// objects, and select the highest one with a generation <= my_generation. Constructing the prefix is equivalent
+// to constructing a full index path with no generation, because the generation is a suffix.
+let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
 let indices = backoff::retry(
 || async { storage.list_files(Some(&index_prefix)).await },
 |_| false,
@@ -347,18 +366,93 @@ pub(super) async fn download_index_part(
 match max_previous_generation {
 Some(g) => {
 tracing::debug!("Found index_part in generation {g:?}");
-do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
+do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
 }
 None => {
 // Migration from legacy pre-generation state: we have a generation but no prior
 // attached pageservers did. Try to load from a no-generation path.
-tracing::info!("No index_part.json* found");
-do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
-.await
+tracing::debug!("No index_part.json* found");
+do_download_index_part(
+storage,
+tenant_shard_id,
+timeline_id,
+Generation::none(),
+cancel,
+)
+.await
 }
 }
 }
+
+pub(crate) async fn download_initdb_tar_zst(
+conf: &'static PageServerConf,
+storage: &GenericRemoteStorage,
+tenant_shard_id: &TenantShardId,
+timeline_id: &TimelineId,
+) -> Result<(Utf8PathBuf, File), DownloadError> {
+debug_assert_current_span_has_tenant_and_timeline_id();
+
+let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
+
+let timeline_path = conf.timelines_path(tenant_shard_id);
+
+if !timeline_path.exists() {
+tokio::fs::create_dir_all(&timeline_path)
+.await
+.with_context(|| format!("timeline dir creation {timeline_path}"))
+.map_err(DownloadError::Other)?;
+}
+let temp_path = timeline_path.join(format!(
+"{INITDB_PATH}.download-{timeline_id}.{TEMP_FILE_SUFFIX}"
+));
+
+let file = download_retry(
+|| async {
+let file = OpenOptions::new()
+.create(true)
+.truncate(true)
+.read(true)
+.write(true)
+.open(&temp_path)
+.await
+.with_context(|| format!("tempfile creation {temp_path}"))
+.map_err(DownloadError::Other)?;
+
+let download = storage.download(&remote_path).await?;
+let mut download = tokio_util::io::StreamReader::new(download.download_stream);
+let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);
+
+tokio::io::copy_buf(&mut download, &mut writer)
+.await
+.with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
+.map_err(DownloadError::Other)?;
+
+let mut file = writer.into_inner();
+
+file.seek(std::io::SeekFrom::Start(0))
+.await
+.with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
+.map_err(DownloadError::Other)?;
+
+Ok(file)
+},
+&format!("download {remote_path}"),
+)
+.await
+.map_err(|e| {
+// Do a best-effort attempt at deleting the temporary file upon encountering an error.
+// We don't have async here nor do we want to pile on any extra errors.
+if let Err(e) = std::fs::remove_file(&temp_path) {
+if e.kind() != std::io::ErrorKind::NotFound {
+warn!("error deleting temporary file {temp_path}: {e}");
+}
+}
+e
+})?;
+
+Ok((temp_path, file))
+}
 
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
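`download_initdb_tar_zst` above streams the archive through a `BufWriter` into a temp file and then rewinds the underlying `File` so the caller can read it from the start. A stripped-down sketch of that write-flush-rewind sequence; the file name and payload are placeholders, not the real archive handling:

```rust
use tokio::io::{AsyncReadExt, AsyncSeekExt, AsyncWriteExt};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    let file = tokio::fs::File::create("initdb.tar.zst.download.tmp").await?;
    let mut writer = tokio::io::BufWriter::with_capacity(8 * 1024, file);

    writer.write_all(b"archive bytes would be streamed here").await?;
    writer.flush().await?; // into_inner does not flush, so flush explicitly first

    // Recover the File and rewind it so the caller can read from the top.
    let mut file = writer.into_inner();
    file.seek(std::io::SeekFrom::Start(0)).await?;

    let mut check = String::new();
    file.read_to_string(&mut check).await?;
    assert!(check.starts_with("archive bytes"));
    Ok(())
}
```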
@@ -12,6 +12,7 @@ use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::upload_queue::UploadQueueInitialized;
 use crate::tenant::Generation;
+use pageserver_api::shard::ShardIndex;
 
 use utils::lsn::Lsn;
 
@@ -25,6 +26,8 @@ pub struct LayerFileMetadata {
 file_size: u64,
 
 pub(crate) generation: Generation,
+
+pub(crate) shard: ShardIndex,
 }
 
 impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
@@ -32,15 +35,17 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
 LayerFileMetadata {
 file_size: other.file_size,
 generation: other.generation,
+shard: other.shard,
 }
 }
 }
 
 impl LayerFileMetadata {
-pub fn new(file_size: u64, generation: Generation) -> Self {
+pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
 LayerFileMetadata {
 file_size,
 generation,
+shard,
 }
 }
 
@@ -128,6 +133,14 @@ impl IndexPart {
 pub fn get_disk_consistent_lsn(&self) -> Lsn {
 self.disk_consistent_lsn
 }
+
+pub fn from_s3_bytes(bytes: &[u8]) -> Result<Self, serde_json::Error> {
+serde_json::from_slice::<IndexPart>(bytes)
+}
+
+pub fn to_s3_bytes(&self) -> serde_json::Result<Vec<u8>> {
+serde_json::to_vec(self)
+}
 }
 
 impl TryFrom<&UploadQueueInitialized> for IndexPart {
|
|||||||
#[serde(default = "Generation::none")]
|
#[serde(default = "Generation::none")]
|
||||||
#[serde(skip_serializing_if = "Generation::is_none")]
|
#[serde(skip_serializing_if = "Generation::is_none")]
|
||||||
pub generation: Generation,
|
pub generation: Generation,
|
||||||
|
|
||||||
|
#[serde(default = "ShardIndex::unsharded")]
|
||||||
|
#[serde(skip_serializing_if = "ShardIndex::is_unsharded")]
|
||||||
|
pub shard: ShardIndex,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<LayerFileMetadata> for IndexLayerMetadata {
|
impl From<LayerFileMetadata> for IndexLayerMetadata {
|
||||||
@@ -160,6 +177,7 @@ impl From<LayerFileMetadata> for IndexLayerMetadata {
|
|||||||
IndexLayerMetadata {
|
IndexLayerMetadata {
|
||||||
file_size: other.file_size,
|
file_size: other.file_size,
|
||||||
generation: other.generation,
|
generation: other.generation,
|
||||||
|
shard: other.shard,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -187,13 +205,15 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
@@ -201,7 +221,7 @@ mod tests {
|
|||||||
deleted_at: None,
|
deleted_at: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -225,13 +245,15 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
@@ -239,7 +261,7 @@ mod tests {
|
|||||||
deleted_at: None,
|
deleted_at: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -264,13 +286,15 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
@@ -279,7 +303,7 @@ mod tests {
|
|||||||
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
|
||||||
};
|
};
|
||||||
|
|
||||||
let part = serde_json::from_str::<IndexPart>(example).unwrap();
|
let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
|
||||||
assert_eq!(part, expected);
|
assert_eq!(part, expected);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -323,7 +347,7 @@ mod tests {
|
|||||||
deleted_at: None,
|
deleted_at: None,
|
||||||
};
|
};
|
||||||
|
|
||||||
let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
|
let empty_layers_parsed = IndexPart::from_s3_bytes(empty_layers_json.as_bytes()).unwrap();
|
||||||
|
|
||||||
assert_eq!(empty_layers_parsed, expected);
|
assert_eq!(empty_layers_parsed, expected);
|
||||||
}
|
}
|
||||||
@@ -346,22 +370,24 @@ mod tests {
|
|||||||
layer_metadata: HashMap::from([
|
layer_metadata: HashMap::from([
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
|
||||||
file_size: 25600000,
|
file_size: 25600000,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
}),
|
}),
|
||||||
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
|
||||||
// serde_json should always parse this but this might be a double with jq for
|
// serde_json should always parse this but this might be a double with jq for
|
||||||
// example.
|
// example.
|
||||||
file_size: 9007199254741001,
|
file_size: 9007199254741001,
|
||||||
generation: Generation::none()
|
generation: Generation::none(),
|
||||||
|
shard: ShardIndex::unsharded()
|
||||||
})
|
})
|
||||||
]),
|
]),
|
||||||
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
|
||||||
metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
 deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
+"2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
 };
 
-let part = serde_json::from_str::<IndexPart>(example).unwrap();
+let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
 assert_eq!(part, expected);
 }
 }
@@ -3,13 +3,16 @@
 use anyhow::{bail, Context};
 use camino::Utf8Path;
 use fail::fail_point;
+use pageserver_api::shard::TenantShardId;
 use std::io::ErrorKind;
-use tokio::fs;
+use tokio::fs::{self, File};
 
 use super::Generation;
 use crate::{
 config::PageServerConf,
-tenant::remote_timeline_client::{index::IndexPart, remote_index_path, remote_path},
+tenant::remote_timeline_client::{
+index::IndexPart, remote_index_path, remote_initdb_archive_path, remote_path,
+},
 };
 use remote_storage::GenericRemoteStorage;
 use utils::id::{TenantId, TimelineId};
@@ -21,7 +24,7 @@ use tracing::info;
 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<'a>(
 storage: &'a GenericRemoteStorage,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
 timeline_id: &TimelineId,
 generation: Generation,
 index_part: &'a IndexPart,
@@ -33,16 +36,21 @@ pub(super) async fn upload_index_part<'a>(
 });
 pausable_failpoint!("before-upload-index-pausable");
 
-let index_part_bytes =
-serde_json::to_vec(&index_part).context("serialize index part file into bytes")?;
+let index_part_bytes = index_part
+.to_s3_bytes()
+.context("serialize index part file into bytes")?;
 let index_part_size = index_part_bytes.len();
-let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));
+let index_part_bytes = bytes::Bytes::from(index_part_bytes);
 
-let remote_path = remote_index_path(tenant_id, timeline_id, generation);
+let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
 storage
-.upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
+.upload_storage_object(
+futures::stream::once(futures::future::ready(Ok(index_part_bytes))),
+index_part_size,
+&remote_path,
+)
 .await
-.with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
+.with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }
 
 /// Attempts to upload given layer files.
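The index upload above now hands `upload_storage_object` a single-chunk stream built from `bytes::Bytes` instead of a boxed reader. Constructing and draining such a one-item stream looks like this; the error type and the final collection step are generic stand-ins, not the storage API's exact signature:

```rust
use futures::stream::StreamExt;

#[tokio::main]
async fn main() {
    let serialized = br#"{"version":4}"#.to_vec();
    let body = bytes::Bytes::from(serialized);
    let len = body.len();

    // One-chunk stream, like the argument passed to upload_storage_object above.
    let stream = futures::stream::once(futures::future::ready(Ok::<_, std::io::Error>(body)));

    // A consumer sees exactly one chunk whose length matches the advertised size.
    let chunks: Vec<_> = stream.collect().await;
    assert_eq!(chunks.len(), 1);
    assert_eq!(chunks[0].as_ref().unwrap().len(), len);
}
```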
@@ -96,10 +104,31 @@ pub(super) async fn upload_timeline_layer<'a>(
 let fs_size = usize::try_from(fs_size)
 .with_context(|| format!("convert {source_path:?} size {fs_size} usize"))?;
 
+let reader = tokio_util::io::ReaderStream::with_capacity(source_file, super::BUFFER_SIZE);
+
 storage
-.upload(source_file, fs_size, &storage_path, None)
+.upload(reader, fs_size, &storage_path, None)
 .await
 .with_context(|| format!("upload layer from local path '{source_path}'"))?;
 
 Ok(())
 }
+
+/// Uploads the given `initdb` data to the remote storage.
+pub(crate) async fn upload_initdb_dir(
+storage: &GenericRemoteStorage,
+tenant_id: &TenantId,
+timeline_id: &TimelineId,
+initdb_tar_zst: File,
+size: u64,
+) -> anyhow::Result<()> {
+tracing::trace!("uploading initdb dir");
+
+let file = tokio_util::io::ReaderStream::with_capacity(initdb_tar_zst, super::BUFFER_SIZE);
+
+let remote_path = remote_initdb_archive_path(tenant_id, timeline_id);
+storage
+.upload_storage_object(file, size as usize, &remote_path)
+.await
+.with_context(|| format!("upload initdb dir for '{tenant_id} / {timeline_id}'"))
+}
|||||||
use anyhow::{bail, Context};
|
use anyhow::{bail, Context};
|
||||||
use tokio::sync::oneshot::error::RecvError;
|
use tokio::sync::oneshot::error::RecvError;
|
||||||
use tokio::sync::Semaphore;
|
use tokio::sync::Semaphore;
|
||||||
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
||||||
use crate::context::RequestContext;
|
use crate::context::RequestContext;
|
||||||
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
use crate::pgdatadir_mapping::CalculateLogicalSizeError;
|
||||||
@@ -113,11 +114,12 @@ pub(super) async fn gather_inputs(
|
|||||||
max_retention_period: Option<u64>,
|
max_retention_period: Option<u64>,
|
||||||
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
|
||||||
cause: LogicalSizeCalculationCause,
|
cause: LogicalSizeCalculationCause,
|
||||||
|
cancel: &CancellationToken,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<ModelInputs> {
|
) -> anyhow::Result<ModelInputs> {
|
||||||
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
// refresh is needed to update gc related pitr_cutoff and horizon_cutoff
|
||||||
tenant
|
tenant
|
||||||
.refresh_gc_info(ctx)
|
.refresh_gc_info(cancel, ctx)
|
||||||
.await
|
.await
|
||||||
.context("Failed to refresh gc_info before gathering inputs")?;
|
.context("Failed to refresh gc_info before gathering inputs")?;
|
||||||
|
|
||||||
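`gather_inputs` above now takes a `&CancellationToken` and forwards it to `refresh_gc_info`, so long-running size calculations can be interrupted. The general pattern of handing a child token to async work and racing it against cancellation, in isolation (the surrounding task and timing are invented for the example):

```rust
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    let cancel = CancellationToken::new();
    let child = cancel.child_token(); // hand this to the long-running helper

    let work = tokio::spawn(async move {
        tokio::select! {
            _ = child.cancelled() => "cancelled",
            _ = tokio::time::sleep(std::time::Duration::from_secs(30)) => "finished",
        }
    });

    cancel.cancel(); // e.g. pageserver shutdown
    assert_eq!(work.await.unwrap(), "cancelled");
}
```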
@@ -2,9 +2,9 @@
 
 pub mod delta_layer;
 mod filename;
-mod image_layer;
+pub mod image_layer;
 mod inmemory_layer;
-mod layer;
+pub(crate) mod layer;
 mod layer_desc;
 
 use crate::context::{AccessStatsBehavior, RequestContext};
@@ -24,10 +24,7 @@ use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
 use utils::rate_limit::RateLimit;
 
-use utils::{
-id::{TenantId, TimelineId},
-lsn::Lsn,
-};
+use utils::{id::TimelineId, lsn::Lsn};
 
 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
@@ -304,12 +301,14 @@ pub trait AsLayerDesc {
 }
 
 pub mod tests {
+use pageserver_api::shard::TenantShardId;
+
 use super::*;
 
 impl From<DeltaFileName> for PersistentLayerDesc {
 fn from(value: DeltaFileName) -> Self {
 PersistentLayerDesc::new_delta(
-TenantId::from_array([0; 16]),
+TenantShardId::from([0; 18]),
 TimelineId::from_array([0; 16]),
 value.key_range,
 value.lsn_range,
@@ -321,7 +320,7 @@ pub mod tests {
 impl From<ImageFileName> for PersistentLayerDesc {
 fn from(value: ImageFileName) -> Self {
 PersistentLayerDesc::new_img(
-TenantId::from_array([0; 16]),
+TenantShardId::from([0; 18]),
 TimelineId::from_array([0; 16]),
 value.key_range,
 value.lsn,
@@ -42,6 +42,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::LayerAccessKind;
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -69,13 +70,13 @@ use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct Summary {
 /// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
-magic: u16,
-format_version: u16,
+pub magic: u16,
+pub format_version: u16,
 
-tenant_id: TenantId,
-timeline_id: TimelineId,
-key_range: Range<Key>,
-lsn_range: Range<Lsn>,
+pub tenant_id: TenantId,
+pub timeline_id: TimelineId,
+pub key_range: Range<Key>,
+pub lsn_range: Range<Lsn>,
 
 /// Block number where the 'index' part of the file begins.
 pub index_start_blk: u32,
@@ -86,7 +87,7 @@ pub struct Summary {
 impl From<&DeltaLayer> for Summary {
 fn from(layer: &DeltaLayer) -> Self {
 Self::expected(
-layer.desc.tenant_id,
+layer.desc.tenant_shard_id.tenant_id,
 layer.desc.timeline_id,
 layer.desc.key_range.clone(),
 layer.desc.lsn_range.clone(),
@@ -248,7 +249,7 @@ impl DeltaLayer {
 
 fn temp_path_for(
 conf: &PageServerConf,
-tenant_id: &TenantId,
+tenant_shard_id: &TenantShardId,
 timeline_id: &TimelineId,
 key_start: Key,
 lsn_range: &Range<Lsn>,
@@ -259,14 +260,15 @@ impl DeltaLayer {
 .map(char::from)
 .collect();
 
-conf.timeline_path(tenant_id, timeline_id).join(format!(
-"{}-XXX__{:016X}-{:016X}.{}.{}",
-key_start,
-u64::from(lsn_range.start),
-u64::from(lsn_range.end),
-rand_string,
-TEMP_FILE_SUFFIX,
-))
+conf.timeline_path(tenant_shard_id, timeline_id)
+.join(format!(
+"{}-XXX__{:016X}-{:016X}.{}.{}",
+key_start,
+u64::from(lsn_range.start),
+u64::from(lsn_range.end),
+rand_string,
+TEMP_FILE_SUFFIX,
+))
 }
 
 ///
@@ -289,7 +291,9 @@ impl DeltaLayer {
 async fn load_inner(&self, ctx: &RequestContext) -> Result<Arc<DeltaLayerInner>> {
 let path = self.path();
 
-let loaded = DeltaLayerInner::load(&path, None, ctx).await?;
+let loaded = DeltaLayerInner::load(&path, None, ctx)
+.await
+.and_then(|res| res)?;
 
 // not production code
 let actual_filename = path.file_name().unwrap().to_owned();
@@ -316,10 +320,14 @@ impl DeltaLayer {
 .metadata()
 .context("get file metadata to determine size")?;
 
+// TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
+// we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
+let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
+
 Ok(DeltaLayer {
 path: path.to_path_buf(),
 desc: PersistentLayerDesc::new_delta(
-summary.tenant_id,
+tenant_shard_id,
 summary.timeline_id,
 summary.key_range,
 summary.lsn_range,
@@ -351,7 +359,7 @@ struct DeltaLayerWriterInner {
 conf: &'static PageServerConf,
 pub path: Utf8PathBuf,
 timeline_id: TimelineId,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 
 key_start: Key,
 lsn_range: Range<Lsn>,
@@ -368,7 +376,7 @@ impl DeltaLayerWriterInner {
 async fn new(
 conf: &'static PageServerConf,
 timeline_id: TimelineId,
-tenant_id: TenantId,
+tenant_shard_id: TenantShardId,
 key_start: Key,
 lsn_range: Range<Lsn>,
 ) -> anyhow::Result<Self> {
@@ -378,7 +386,8 @@ impl DeltaLayerWriterInner {
 //
 // Note: This overwrites any existing file. There shouldn't be any.
 // FIXME: throw an error instead?
-let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
+let path =
+DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);
 
 let mut file = VirtualFile::create(&path).await?;
 // make room for the header block
@@ -393,7 +402,7 @@ impl DeltaLayerWriterInner {
 conf,
 path,
 timeline_id,
-tenant_id,
+tenant_shard_id,
 key_start,
 lsn_range,
 tree: tree_builder,
@@ -455,7 +464,7 @@ impl DeltaLayerWriterInner {
 let summary = Summary {
 magic: DELTA_FILE_MAGIC,
 format_version: STORAGE_FORMAT_VERSION,
-tenant_id: self.tenant_id,
+tenant_id: self.tenant_shard_id.tenant_id,
 timeline_id: self.timeline_id,
 key_range: self.key_start..key_end,
 lsn_range: self.lsn_range.clone(),
@@ -496,7 +505,7 @@ impl DeltaLayerWriterInner {
 // set inner.file here. The first read will have to re-open it.
 
 let desc = PersistentLayerDesc::new_delta(
-self.tenant_id,
+self.tenant_shard_id,
 self.timeline_id,
 self.key_start..key_end,
 self.lsn_range.clone(),
@@ -547,14 +556,20 @@ impl DeltaLayerWriter {
 pub async fn new(
 conf: &'static PageServerConf,
 timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
key_start: Key,
|
key_start: Key,
|
||||||
lsn_range: Range<Lsn>,
|
lsn_range: Range<Lsn>,
|
||||||
) -> anyhow::Result<Self> {
|
) -> anyhow::Result<Self> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: Some(
|
inner: Some(
|
||||||
DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
|
DeltaLayerWriterInner::new(
|
||||||
.await?,
|
conf,
|
||||||
|
timeline_id,
|
||||||
|
tenant_shard_id,
|
||||||
|
key_start,
|
||||||
|
lsn_range,
|
||||||
|
)
|
||||||
|
.await?,
|
||||||
),
|
),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
@@ -609,19 +624,84 @@ impl Drop for DeltaLayerWriter {
     }
 }

+#[derive(thiserror::Error, Debug)]
+pub enum RewriteSummaryError {
+    #[error("magic mismatch")]
+    MagicMismatch,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<std::io::Error> for RewriteSummaryError {
+    fn from(e: std::io::Error) -> Self {
+        Self::Other(anyhow::anyhow!(e))
+    }
+}
+
+impl DeltaLayer {
+    pub async fn rewrite_summary<F>(
+        path: &Utf8Path,
+        rewrite: F,
+        ctx: &RequestContext,
+    ) -> Result<(), RewriteSummaryError>
+    where
+        F: Fn(Summary) -> Summary,
+    {
+        let file = VirtualFile::open_with_options(
+            path,
+            &*std::fs::OpenOptions::new().read(true).write(true),
+        )
+        .await
+        .with_context(|| format!("Failed to open file '{}'", path))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0, ctx).await?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
+        let mut file = file.file;
+        if actual_summary.magic != DELTA_FILE_MAGIC {
+            return Err(RewriteSummaryError::MagicMismatch);
+        }
+
+        let new_summary = rewrite(actual_summary);
+
+        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
+        if buf.spilled() {
+            // The code in DeltaLayerWriterInner just warn!()s for this.
+            // It should probably error out as well.
+            return Err(RewriteSummaryError::Other(anyhow::anyhow!(
+                "Used more than one page size for summary buffer: {}",
+                buf.len()
+            )));
+        }
+        file.seek(SeekFrom::Start(0)).await?;
+        file.write_all(&buf).await?;
+        Ok(())
+    }
+}
+
 impl DeltaLayerInner {
+    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
+    /// - inner has the success or transient failure
+    /// - outer has the permanent failure
     pub(super) async fn load(
         path: &Utf8Path,
         summary: Option<Summary>,
         ctx: &RequestContext,
-    ) -> anyhow::Result<Self> {
-        let file = VirtualFile::open(path)
-            .await
-            .with_context(|| format!("Failed to open file '{path}'"))?;
+    ) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
+        let file = match VirtualFile::open(path).await {
+            Ok(file) => file,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
+        };
         let file = FileBlockReader::new(file);

-        let summary_blk = file.read_blk(0, ctx).await?;
-        let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
+        let summary_blk = match file.read_blk(0, ctx).await {
+            Ok(blk) => blk,
+            Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
+        };
+
+        // TODO: this should be an assertion instead; see ImageLayerInner::load
+        let actual_summary =
+            Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
+
         if let Some(mut expected_summary) = summary {
             // production code path
@@ -636,11 +716,11 @@ impl DeltaLayerInner {
             }
         }

-        Ok(DeltaLayerInner {
+        Ok(Ok(DeltaLayerInner {
             file,
             index_start_blk: actual_summary.index_start_blk,
             index_root_blk: actual_summary.index_root_blk,
-        })
+        }))
     }

     pub(super) async fn get_value_reconstruct_data(
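With the `Summary` fields now `pub` and the `rewrite_summary` helper above, a caller can patch the header block of an on-disk delta layer in place. The following is a minimal sketch, not code from this change: the function name and the way the new tenant id is obtained are made up, and the parameter types are assumed to be the ones the module already imports.

```rust
// Sketch only: rewrites the tenant_id stored in a delta layer's summary block.
// `retarget_layer_tenant` is hypothetical; rewrite_summary, Summary and its
// now-public fields come from the change above.
async fn retarget_layer_tenant(
    path: &Utf8Path,
    new_tenant_id: TenantId,
    ctx: &RequestContext,
) -> Result<(), RewriteSummaryError> {
    DeltaLayer::rewrite_summary(
        path,
        // struct update syntax works because the fields are now pub
        |summary| Summary {
            tenant_id: new_tenant_id,
            ..summary
        },
        ctx,
    )
    .await
}
```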
|
|||||||
@@ -41,6 +41,7 @@ use bytes::Bytes;
|
|||||||
use camino::{Utf8Path, Utf8PathBuf};
|
use camino::{Utf8Path, Utf8PathBuf};
|
||||||
use hex;
|
use hex;
|
||||||
use pageserver_api::models::LayerAccessKind;
|
use pageserver_api::models::LayerAccessKind;
|
||||||
|
use pageserver_api::shard::TenantShardId;
|
||||||
use rand::{distributions::Alphanumeric, Rng};
|
use rand::{distributions::Alphanumeric, Rng};
|
||||||
use serde::{Deserialize, Serialize};
|
use serde::{Deserialize, Serialize};
|
||||||
use std::fs::File;
|
use std::fs::File;
|
||||||
@@ -67,27 +68,27 @@ use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
|
|||||||
/// the 'index' starts at the block indicated by 'index_start_blk'
|
/// the 'index' starts at the block indicated by 'index_start_blk'
|
||||||
///
|
///
|
||||||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
|
||||||
pub(super) struct Summary {
|
pub struct Summary {
|
||||||
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
/// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
|
||||||
magic: u16,
|
pub magic: u16,
|
||||||
format_version: u16,
|
pub format_version: u16,
|
||||||
|
|
||||||
tenant_id: TenantId,
|
pub tenant_id: TenantId,
|
||||||
timeline_id: TimelineId,
|
pub timeline_id: TimelineId,
|
||||||
key_range: Range<Key>,
|
pub key_range: Range<Key>,
|
||||||
lsn: Lsn,
|
pub lsn: Lsn,
|
||||||
|
|
||||||
/// Block number where the 'index' part of the file begins.
|
/// Block number where the 'index' part of the file begins.
|
||||||
index_start_blk: u32,
|
pub index_start_blk: u32,
|
||||||
/// Block within the 'index', where the B-tree root page is stored
|
/// Block within the 'index', where the B-tree root page is stored
|
||||||
index_root_blk: u32,
|
pub index_root_blk: u32,
|
||||||
// the 'values' part starts after the summary header, on block 1.
|
// the 'values' part starts after the summary header, on block 1.
|
||||||
}
|
}
|
||||||
|
|
||||||
impl From<&ImageLayer> for Summary {
|
impl From<&ImageLayer> for Summary {
|
||||||
fn from(layer: &ImageLayer) -> Self {
|
fn from(layer: &ImageLayer) -> Self {
|
||||||
Self::expected(
|
Self::expected(
|
||||||
layer.desc.tenant_id,
|
layer.desc.tenant_shard_id.tenant_id,
|
||||||
layer.desc.timeline_id,
|
layer.desc.timeline_id,
|
||||||
layer.desc.key_range.clone(),
|
layer.desc.key_range.clone(),
|
||||||
layer.lsn,
|
layer.lsn,
|
||||||
@@ -217,7 +218,7 @@ impl ImageLayer {
|
|||||||
fn temp_path_for(
|
fn temp_path_for(
|
||||||
conf: &PageServerConf,
|
conf: &PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
fname: &ImageFileName,
|
fname: &ImageFileName,
|
||||||
) -> Utf8PathBuf {
|
) -> Utf8PathBuf {
|
||||||
let rand_string: String = rand::thread_rng()
|
let rand_string: String = rand::thread_rng()
|
||||||
@@ -226,7 +227,7 @@ impl ImageLayer {
|
|||||||
.map(char::from)
|
.map(char::from)
|
||||||
.collect();
|
.collect();
|
||||||
|
|
||||||
conf.timeline_path(&tenant_id, &timeline_id)
|
conf.timeline_path(&tenant_shard_id, &timeline_id)
|
||||||
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
.join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -249,7 +250,9 @@ impl ImageLayer {
|
|||||||
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
async fn load_inner(&self, ctx: &RequestContext) -> Result<ImageLayerInner> {
|
||||||
let path = self.path();
|
let path = self.path();
|
||||||
|
|
||||||
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx).await?;
|
let loaded = ImageLayerInner::load(&path, self.desc.image_layer_lsn(), None, ctx)
|
||||||
|
.await
|
||||||
|
.and_then(|res| res)?;
|
||||||
|
|
||||||
// not production code
|
// not production code
|
||||||
let actual_filename = path.file_name().unwrap().to_owned();
|
let actual_filename = path.file_name().unwrap().to_owned();
|
||||||
@@ -274,10 +277,15 @@ impl ImageLayer {
|
|||||||
let metadata = file
|
let metadata = file
|
||||||
.metadata()
|
.metadata()
|
||||||
.context("get file metadata to determine size")?;
|
.context("get file metadata to determine size")?;
|
||||||
|
|
||||||
|
// TODO(sharding): we should get TenantShardId from path.
|
||||||
|
// OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
|
||||||
|
let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
|
||||||
|
|
||||||
Ok(ImageLayer {
|
Ok(ImageLayer {
|
||||||
path: path.to_path_buf(),
|
path: path.to_path_buf(),
|
||||||
desc: PersistentLayerDesc::new_img(
|
desc: PersistentLayerDesc::new_img(
|
||||||
summary.tenant_id,
|
tenant_shard_id,
|
||||||
summary.timeline_id,
|
summary.timeline_id,
|
||||||
summary.key_range,
|
summary.key_range,
|
||||||
summary.lsn,
|
summary.lsn,
|
||||||
@@ -294,19 +302,87 @@ impl ImageLayer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[derive(thiserror::Error, Debug)]
|
||||||
|
pub enum RewriteSummaryError {
|
||||||
|
#[error("magic mismatch")]
|
||||||
|
MagicMismatch,
|
||||||
|
#[error(transparent)]
|
||||||
|
Other(#[from] anyhow::Error),
|
||||||
|
}
|
||||||
|
|
||||||
|
impl From<std::io::Error> for RewriteSummaryError {
|
||||||
|
fn from(e: std::io::Error) -> Self {
|
||||||
|
Self::Other(anyhow::anyhow!(e))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl ImageLayer {
|
||||||
|
pub async fn rewrite_summary<F>(
|
||||||
|
path: &Utf8Path,
|
||||||
|
rewrite: F,
|
||||||
|
ctx: &RequestContext,
|
||||||
|
) -> Result<(), RewriteSummaryError>
|
||||||
|
where
|
||||||
|
F: Fn(Summary) -> Summary,
|
||||||
|
{
|
||||||
|
let file = VirtualFile::open_with_options(
|
||||||
|
path,
|
||||||
|
&*std::fs::OpenOptions::new().read(true).write(true),
|
||||||
|
)
|
||||||
|
.await
|
||||||
|
.with_context(|| format!("Failed to open file '{}'", path))?;
|
||||||
|
let file = FileBlockReader::new(file);
|
||||||
|
let summary_blk = file.read_blk(0, ctx).await?;
|
||||||
|
let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
|
||||||
|
let mut file = file.file;
|
||||||
|
if actual_summary.magic != IMAGE_FILE_MAGIC {
|
||||||
|
return Err(RewriteSummaryError::MagicMismatch);
|
||||||
|
}
|
||||||
|
|
||||||
|
let new_summary = rewrite(actual_summary);
|
||||||
|
|
||||||
|
let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
|
||||||
|
Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
|
||||||
|
if buf.spilled() {
|
||||||
|
// The code in ImageLayerWriterInner just warn!()s for this.
|
||||||
|
// It should probably error out as well.
|
||||||
|
return Err(RewriteSummaryError::Other(anyhow::anyhow!(
|
||||||
|
"Used more than one page size for summary buffer: {}",
|
||||||
|
buf.len()
|
||||||
|
)));
|
||||||
|
}
|
||||||
|
file.seek(SeekFrom::Start(0)).await?;
|
||||||
|
file.write_all(&buf).await?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
impl ImageLayerInner {
|
impl ImageLayerInner {
|
||||||
|
/// Returns nested result following Result<Result<_, OpErr>, Critical>:
|
||||||
|
/// - inner has the success or transient failure
|
||||||
|
/// - outer has the permanent failure
|
||||||
pub(super) async fn load(
|
pub(super) async fn load(
|
||||||
path: &Utf8Path,
|
path: &Utf8Path,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
summary: Option<Summary>,
|
summary: Option<Summary>,
|
||||||
ctx: &RequestContext,
|
ctx: &RequestContext,
|
||||||
) -> anyhow::Result<Self> {
|
) -> Result<Result<Self, anyhow::Error>, anyhow::Error> {
|
||||||
let file = VirtualFile::open(path)
|
let file = match VirtualFile::open(path).await {
|
||||||
.await
|
Ok(file) => file,
|
||||||
.with_context(|| format!("Failed to open file '{}'", path))?;
|
Err(e) => return Ok(Err(anyhow::Error::new(e).context("open layer file"))),
|
||||||
|
};
|
||||||
let file = FileBlockReader::new(file);
|
let file = FileBlockReader::new(file);
|
||||||
let summary_blk = file.read_blk(0, ctx).await?;
|
let summary_blk = match file.read_blk(0, ctx).await {
|
||||||
let actual_summary = Summary::des_prefix(summary_blk.as_ref())?;
|
Ok(blk) => blk,
|
||||||
|
Err(e) => return Ok(Err(anyhow::Error::new(e).context("read first block"))),
|
||||||
|
};
|
||||||
|
|
||||||
|
// length is the only way how this could fail, so it's not actually likely at all unless
|
||||||
|
// read_blk returns wrong sized block.
|
||||||
|
//
|
||||||
|
// TODO: confirm and make this into assertion
|
||||||
|
let actual_summary =
|
||||||
|
Summary::des_prefix(summary_blk.as_ref()).context("deserialize first block")?;
|
||||||
|
|
||||||
if let Some(mut expected_summary) = summary {
|
if let Some(mut expected_summary) = summary {
|
||||||
// production code path
|
// production code path
|
||||||
@@ -322,12 +398,12 @@ impl ImageLayerInner {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(ImageLayerInner {
|
Ok(Ok(ImageLayerInner {
|
||||||
index_start_blk: actual_summary.index_start_blk,
|
index_start_blk: actual_summary.index_start_blk,
|
||||||
index_root_blk: actual_summary.index_root_blk,
|
index_root_blk: actual_summary.index_root_blk,
|
||||||
lsn,
|
lsn,
|
||||||
file,
|
file,
|
||||||
})
|
}))
|
||||||
}
|
}
|
||||||
|
|
||||||
pub(super) async fn get_value_reconstruct_data(
|
pub(super) async fn get_value_reconstruct_data(
|
||||||
@@ -385,7 +461,7 @@ struct ImageLayerWriterInner {
|
|||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
path: Utf8PathBuf,
|
path: Utf8PathBuf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
key_range: Range<Key>,
|
key_range: Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
|
|
||||||
@@ -400,7 +476,7 @@ impl ImageLayerWriterInner {
|
|||||||
async fn new(
|
async fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
key_range: &Range<Key>,
|
key_range: &Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
) -> anyhow::Result<Self> {
|
) -> anyhow::Result<Self> {
|
||||||
@@ -409,7 +485,7 @@ impl ImageLayerWriterInner {
|
|||||||
let path = ImageLayer::temp_path_for(
|
let path = ImageLayer::temp_path_for(
|
||||||
conf,
|
conf,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
&ImageFileName {
|
&ImageFileName {
|
||||||
key_range: key_range.clone(),
|
key_range: key_range.clone(),
|
||||||
lsn,
|
lsn,
|
||||||
@@ -433,7 +509,7 @@ impl ImageLayerWriterInner {
|
|||||||
conf,
|
conf,
|
||||||
path,
|
path,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
key_range: key_range.clone(),
|
key_range: key_range.clone(),
|
||||||
lsn,
|
lsn,
|
||||||
tree: tree_builder,
|
tree: tree_builder,
|
||||||
@@ -480,7 +556,7 @@ impl ImageLayerWriterInner {
|
|||||||
let summary = Summary {
|
let summary = Summary {
|
||||||
magic: IMAGE_FILE_MAGIC,
|
magic: IMAGE_FILE_MAGIC,
|
||||||
format_version: STORAGE_FORMAT_VERSION,
|
format_version: STORAGE_FORMAT_VERSION,
|
||||||
tenant_id: self.tenant_id,
|
tenant_id: self.tenant_shard_id.tenant_id,
|
||||||
timeline_id: self.timeline_id,
|
timeline_id: self.timeline_id,
|
||||||
key_range: self.key_range.clone(),
|
key_range: self.key_range.clone(),
|
||||||
lsn: self.lsn,
|
lsn: self.lsn,
|
||||||
@@ -506,7 +582,7 @@ impl ImageLayerWriterInner {
|
|||||||
.context("get metadata to determine file size")?;
|
.context("get metadata to determine file size")?;
|
||||||
|
|
||||||
let desc = PersistentLayerDesc::new_img(
|
let desc = PersistentLayerDesc::new_img(
|
||||||
self.tenant_id,
|
self.tenant_shard_id,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.key_range.clone(),
|
self.key_range.clone(),
|
||||||
self.lsn,
|
self.lsn,
|
||||||
@@ -562,13 +638,14 @@ impl ImageLayerWriter {
|
|||||||
pub async fn new(
|
pub async fn new(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
key_range: &Range<Key>,
|
key_range: &Range<Key>,
|
||||||
lsn: Lsn,
|
lsn: Lsn,
|
||||||
) -> anyhow::Result<ImageLayerWriter> {
|
) -> anyhow::Result<ImageLayerWriter> {
|
||||||
Ok(Self {
|
Ok(Self {
|
||||||
inner: Some(
|
inner: Some(
|
||||||
ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
|
ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
|
||||||
|
.await?,
|
||||||
),
|
),
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -14,15 +14,11 @@ use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
+use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 use tracing::*;
-use utils::{
-    bin_ser::BeSer,
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-    vec_map::VecMap,
-};
+use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
@@ -33,7 +29,7 @@ use super::{DeltaLayerWriter, ResidentLayer};
|
|||||||
|
|
||||||
pub struct InMemoryLayer {
|
pub struct InMemoryLayer {
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
|
|
||||||
/// This layer contains all the changes from 'start_lsn'. The
|
/// This layer contains all the changes from 'start_lsn'. The
|
||||||
@@ -226,17 +222,17 @@ impl InMemoryLayer {
|
|||||||
pub async fn create(
|
pub async fn create(
|
||||||
conf: &'static PageServerConf,
|
conf: &'static PageServerConf,
|
||||||
timeline_id: TimelineId,
|
timeline_id: TimelineId,
|
||||||
tenant_id: TenantId,
|
tenant_shard_id: TenantShardId,
|
||||||
start_lsn: Lsn,
|
start_lsn: Lsn,
|
||||||
) -> Result<InMemoryLayer> {
|
) -> Result<InMemoryLayer> {
|
||||||
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");
|
||||||
|
|
||||||
let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
|
let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;
|
||||||
|
|
||||||
Ok(InMemoryLayer {
|
Ok(InMemoryLayer {
|
||||||
conf,
|
conf,
|
||||||
timeline_id,
|
timeline_id,
|
||||||
tenant_id,
|
tenant_shard_id,
|
||||||
start_lsn,
|
start_lsn,
|
||||||
end_lsn: OnceLock::new(),
|
end_lsn: OnceLock::new(),
|
||||||
inner: RwLock::new(InMemoryLayerInner {
|
inner: RwLock::new(InMemoryLayerInner {
|
||||||
@@ -335,7 +331,7 @@ impl InMemoryLayer {
|
|||||||
let mut delta_layer_writer = DeltaLayerWriter::new(
|
let mut delta_layer_writer = DeltaLayerWriter::new(
|
||||||
self.conf,
|
self.conf,
|
||||||
self.timeline_id,
|
self.timeline_id,
|
||||||
self.tenant_id,
|
self.tenant_shard_id,
|
||||||
Key::MIN,
|
Key::MIN,
|
||||||
self.start_lsn..end_lsn,
|
self.start_lsn..end_lsn,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -3,6 +3,7 @@ use camino::{Utf8Path, Utf8PathBuf};
|
|||||||
use pageserver_api::models::{
|
use pageserver_api::models::{
|
||||||
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
|
||||||
};
|
};
|
||||||
|
use pageserver_api::shard::ShardIndex;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
|
||||||
use std::sync::{Arc, Weak};
|
use std::sync::{Arc, Weak};
|
||||||
@@ -81,7 +82,7 @@ impl Layer {
|
|||||||
metadata: LayerFileMetadata,
|
metadata: LayerFileMetadata,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let desc = PersistentLayerDesc::from_filename(
|
let desc = PersistentLayerDesc::from_filename(
|
||||||
timeline.tenant_id,
|
timeline.tenant_shard_id,
|
||||||
timeline.timeline_id,
|
timeline.timeline_id,
|
||||||
file_name,
|
file_name,
|
||||||
metadata.file_size(),
|
metadata.file_size(),
|
||||||
@@ -96,6 +97,7 @@ impl Layer {
|
|||||||
desc,
|
desc,
|
||||||
None,
|
None,
|
||||||
metadata.generation,
|
metadata.generation,
|
||||||
|
metadata.shard,
|
||||||
)));
|
)));
|
||||||
|
|
||||||
debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
|
debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
|
||||||
@@ -111,7 +113,7 @@ impl Layer {
|
|||||||
metadata: LayerFileMetadata,
|
metadata: LayerFileMetadata,
|
||||||
) -> ResidentLayer {
|
) -> ResidentLayer {
|
||||||
let desc = PersistentLayerDesc::from_filename(
|
let desc = PersistentLayerDesc::from_filename(
|
||||||
timeline.tenant_id,
|
timeline.tenant_shard_id,
|
||||||
timeline.timeline_id,
|
timeline.timeline_id,
|
||||||
file_name,
|
file_name,
|
||||||
metadata.file_size(),
|
metadata.file_size(),
|
||||||
@@ -136,6 +138,7 @@ impl Layer {
|
|||||||
desc,
|
desc,
|
||||||
Some(inner),
|
Some(inner),
|
||||||
metadata.generation,
|
metadata.generation,
|
||||||
|
metadata.shard,
|
||||||
)
|
)
|
||||||
}));
|
}));
|
||||||
|
|
||||||
@@ -179,6 +182,7 @@ impl Layer {
|
|||||||
desc,
|
desc,
|
||||||
Some(inner),
|
Some(inner),
|
||||||
timeline.generation,
|
timeline.generation,
|
||||||
|
timeline.get_shard_index(),
|
||||||
)
|
)
|
||||||
}));
|
}));
|
||||||
|
|
||||||
@@ -218,14 +222,18 @@ impl Layer {
|
|||||||
///
|
///
|
||||||
/// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
|
/// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
|
||||||
/// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
|
/// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
|
||||||
pub(crate) fn garbage_collect_on_drop(&self) {
|
pub(crate) fn delete_on_drop(&self) {
|
||||||
self.0.garbage_collect_on_drop();
|
self.0.delete_on_drop();
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return data needed to reconstruct given page at LSN.
|
/// Return data needed to reconstruct given page at LSN.
|
||||||
///
|
///
|
||||||
/// It is up to the caller to collect more data from the previous layer and
|
/// It is up to the caller to collect more data from the previous layer and
|
||||||
/// perform WAL redo, if necessary.
|
/// perform WAL redo, if necessary.
|
||||||
|
///
|
||||||
|
/// # Cancellation-Safety
|
||||||
|
///
|
||||||
|
/// This method is cancellation-safe.
|
||||||
pub(crate) async fn get_value_reconstruct_data(
|
pub(crate) async fn get_value_reconstruct_data(
|
||||||
&self,
|
&self,
|
||||||
key: Key,
|
key: Key,
|
||||||
@@ -322,6 +330,24 @@ impl Layer {
|
|||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Waits until this layer has been dropped (and if needed, local file deletion and remote
|
||||||
|
/// deletion scheduling has completed).
|
||||||
|
///
|
||||||
|
/// Does not start local deletion, use [`Self::delete_on_drop`] for that
|
||||||
|
/// separatedly.
|
||||||
|
#[cfg(feature = "testing")]
|
||||||
|
pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
|
||||||
|
let mut rx = self.0.status.subscribe();
|
||||||
|
|
||||||
|
async move {
|
||||||
|
loop {
|
||||||
|
if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
|
/// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
|
||||||
@@ -397,8 +423,8 @@ struct LayerInner {
|
|||||||
/// Initialization and deinitialization are done while holding a permit.
|
/// Initialization and deinitialization are done while holding a permit.
|
||||||
inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
|
inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,
|
||||||
|
|
||||||
/// Do we want to garbage collect this when `LayerInner` is dropped
|
/// Do we want to delete locally and remotely this when `LayerInner` is dropped
|
||||||
wanted_garbage_collected: AtomicBool,
|
wanted_deleted: AtomicBool,
|
||||||
|
|
||||||
/// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
|
/// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
|
||||||
/// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
|
/// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
|
||||||
@@ -412,10 +438,6 @@ struct LayerInner {
|
|||||||
version: AtomicUsize,
|
version: AtomicUsize,
|
||||||
|
|
||||||
/// Allow subscribing to when the layer actually gets evicted.
|
/// Allow subscribing to when the layer actually gets evicted.
|
||||||
///
|
|
||||||
/// If in future we need to implement "wait until layer instances are gone and done", carrying
|
|
||||||
/// this over to the gc spawn_blocking from LayerInner::drop will do the trick, and adding a
|
|
||||||
/// method for "wait_gc" which will wait to this being closed.
|
|
||||||
status: tokio::sync::broadcast::Sender<Status>,
|
status: tokio::sync::broadcast::Sender<Status>,
|
||||||
|
|
||||||
/// Counter for exponential backoff with the download
|
/// Counter for exponential backoff with the download
|
||||||
@@ -426,6 +448,15 @@ struct LayerInner {
|
|||||||
/// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
|
/// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
|
||||||
/// for created layers from [`Timeline::generation`].
|
/// for created layers from [`Timeline::generation`].
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
|
|
||||||
|
/// The shard of this Layer.
|
||||||
|
///
|
||||||
|
/// For layers created in this process, this will always be the [`ShardIndex`] of the
|
||||||
|
/// current `ShardIdentity`` (TODO: add link once it's introduced).
|
||||||
|
///
|
||||||
|
/// For loaded layers, this may be some other value if the tenant has undergone
|
||||||
|
/// a shard split since the layer was originally written.
|
||||||
|
shard: ShardIndex,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl std::fmt::Display for LayerInner {
|
impl std::fmt::Display for LayerInner {
|
||||||
@@ -448,24 +479,28 @@ enum Status {

 impl Drop for LayerInner {
     fn drop(&mut self) {
-        if !*self.wanted_garbage_collected.get_mut() {
+        if !*self.wanted_deleted.get_mut() {
             // should we try to evict if the last wish was for eviction?
             // feels like there's some hazard of overcrowding near shutdown near by, but we don't
             // run drops during shutdown (yet)
             return;
         }

-        let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_id, timeline_id = %self.layer_desc().timeline_id);
+        let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);

         let path = std::mem::take(&mut self.path);
         let file_name = self.layer_desc().filename();
-        let gen = self.generation;
         let file_size = self.layer_desc().file_size;
         let timeline = self.timeline.clone();
+        let meta = self.metadata();
+        let status = self.status.clone();
+
         crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
             let _g = span.entered();

+            // carry this until we are finished for [`Layer::wait_drop`] support
+            let _status = status;
+
             let removed = match std::fs::remove_file(path) {
                 Ok(()) => true,
                 Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
@@ -478,8 +513,8 @@ impl Drop for LayerInner {
                     false
                 }
                 Err(e) => {
-                    tracing::error!("failed to remove garbage collected layer: {e}");
-                    LAYER_IMPL_METRICS.inc_gc_removes_failed();
+                    tracing::error!("failed to remove wanted deleted layer: {e}");
+                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
                     false
                 }
             };
@@ -489,7 +524,7 @@ impl Drop for LayerInner {
                     timeline.metrics.resident_physical_size_sub(file_size);
                 }
                 if let Some(remote_client) = timeline.remote_client.as_ref() {
-                    let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, gen)]);
+                    let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);

                     if let Err(e) = res {
                         // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
@@ -501,15 +536,15 @@ impl Drop for LayerInner {
                         } else {
                             tracing::warn!("scheduling deletion on drop failed: {e:#}");
                         }
-                        LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed);
+                        LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
                     } else {
-                        LAYER_IMPL_METRICS.inc_completed_gcs();
+                        LAYER_IMPL_METRICS.inc_completed_deletes();
                     }
                 }
             } else {
                 // no need to nag that timeline is gone: under normal situation on
                 // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
-                LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone);
+                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
             }
         });
     }
@@ -523,9 +558,10 @@ impl LayerInner {
|
|||||||
desc: PersistentLayerDesc,
|
desc: PersistentLayerDesc,
|
||||||
downloaded: Option<Arc<DownloadedLayer>>,
|
downloaded: Option<Arc<DownloadedLayer>>,
|
||||||
generation: Generation,
|
generation: Generation,
|
||||||
|
shard: ShardIndex,
|
||||||
) -> Self {
|
) -> Self {
|
||||||
let path = conf
|
let path = conf
|
||||||
.timeline_path(&timeline.tenant_id, &timeline.timeline_id)
|
.timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
|
||||||
.join(desc.filename().to_string());
|
.join(desc.filename().to_string());
|
||||||
|
|
||||||
let (inner, version) = if let Some(inner) = downloaded {
|
let (inner, version) = if let Some(inner) = downloaded {
|
||||||
@@ -543,26 +579,24 @@ impl LayerInner {
|
|||||||
timeline: Arc::downgrade(timeline),
|
timeline: Arc::downgrade(timeline),
|
||||||
have_remote_client: timeline.remote_client.is_some(),
|
have_remote_client: timeline.remote_client.is_some(),
|
||||||
access_stats,
|
access_stats,
|
||||||
wanted_garbage_collected: AtomicBool::new(false),
|
wanted_deleted: AtomicBool::new(false),
|
||||||
wanted_evicted: AtomicBool::new(false),
|
wanted_evicted: AtomicBool::new(false),
|
||||||
inner,
|
inner,
|
||||||
version: AtomicUsize::new(version),
|
version: AtomicUsize::new(version),
|
||||||
status: tokio::sync::broadcast::channel(1).0,
|
status: tokio::sync::broadcast::channel(1).0,
|
||||||
consecutive_failures: AtomicUsize::new(0),
|
consecutive_failures: AtomicUsize::new(0),
|
||||||
generation,
|
generation,
|
||||||
|
shard,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn garbage_collect_on_drop(&self) {
|
fn delete_on_drop(&self) {
|
||||||
let res = self.wanted_garbage_collected.compare_exchange(
|
let res =
|
||||||
false,
|
self.wanted_deleted
|
||||||
true,
|
.compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
|
||||||
Ordering::Release,
|
|
||||||
Ordering::Relaxed,
|
|
||||||
);
|
|
||||||
|
|
||||||
if res.is_ok() {
|
if res.is_ok() {
|
||||||
LAYER_IMPL_METRICS.inc_started_gcs();
|
LAYER_IMPL_METRICS.inc_started_deletes();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -630,6 +664,10 @@ impl LayerInner {
|
|||||||
// disable any scheduled but not yet running eviction deletions for this
|
// disable any scheduled but not yet running eviction deletions for this
|
||||||
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
|
let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
|
||||||
|
|
||||||
|
// count cancellations, which currently remain largely unexpected
|
||||||
|
let init_cancelled =
|
||||||
|
scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
|
||||||
|
|
||||||
// no need to make the evict_and_wait wait for the actual download to complete
|
// no need to make the evict_and_wait wait for the actual download to complete
|
||||||
drop(self.status.send(Status::Downloaded));
|
drop(self.status.send(Status::Downloaded));
|
||||||
|
|
||||||
@@ -638,6 +676,8 @@ impl LayerInner {
|
|||||||
.upgrade()
|
.upgrade()
|
||||||
.ok_or_else(|| DownloadError::TimelineShutdown)?;
|
.ok_or_else(|| DownloadError::TimelineShutdown)?;
|
||||||
|
|
||||||
|
// FIXME: grab a gate
|
||||||
|
|
||||||
let can_ever_evict = timeline.remote_client.as_ref().is_some();
|
let can_ever_evict = timeline.remote_client.as_ref().is_some();
|
||||||
|
|
||||||
// check if we really need to be downloaded; could have been already downloaded by a
|
// check if we really need to be downloaded; could have been already downloaded by a
|
||||||
@@ -698,6 +738,8 @@ impl LayerInner {
|
|||||||
tracing::info!(waiters, "completing the on-demand download for other tasks");
|
tracing::info!(waiters, "completing the on-demand download for other tasks");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
scopeguard::ScopeGuard::into_inner(init_cancelled);
|
||||||
|
|
||||||
Ok((ResidentOrWantedEvicted::Resident(res), permit))
|
Ok((ResidentOrWantedEvicted::Resident(res), permit))
|
||||||
};
|
};
|
||||||
|
|
||||||
@@ -795,10 +837,11 @@ impl LayerInner {
|
|||||||
crate::task_mgr::spawn(
|
crate::task_mgr::spawn(
|
||||||
&tokio::runtime::Handle::current(),
|
&tokio::runtime::Handle::current(),
|
||||||
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
crate::task_mgr::TaskKind::RemoteDownloadTask,
|
||||||
Some(self.desc.tenant_id),
|
Some(self.desc.tenant_shard_id),
|
||||||
Some(self.desc.timeline_id),
|
Some(self.desc.timeline_id),
|
||||||
&task_name,
|
&task_name,
|
||||||
false,
|
false,
|
||||||
|
timeline.cancel.child_token(),
|
||||||
async move {
|
async move {
|
||||||
|
|
||||||
let client = timeline
|
let client = timeline
|
||||||
@@ -818,6 +861,21 @@ impl LayerInner {
                             Ok(())
                         }
                         Err(e) => {
+                            let consecutive_failures =
+                                this.consecutive_failures.fetch_add(1, Ordering::Relaxed);
+
+                            let backoff = utils::backoff::exponential_backoff_duration_seconds(
+                                consecutive_failures.min(u32::MAX as usize) as u32,
+                                1.5,
+                                60.0,
+                            );
+                            let backoff = std::time::Duration::from_secs_f64(backoff);
+
+                            tokio::select! {
+                                _ = tokio::time::sleep(backoff) => {},
+                                _ = crate::task_mgr::shutdown_token().cancelled_owned() => {},
+                            };
+
                             Err(e)
                         }
                     };
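For reference, the sleep duration in the retry branch above follows a capped exponential: roughly base^n seconds bounded by a maximum. A small sketch of that shape is shown below; the real `utils::backoff::exponential_backoff_duration_seconds` may differ in details such as how the first attempts are treated.

```rust
// Sketch only: approximates the capped exponential backoff used above.
fn backoff_seconds(consecutive_failures: u32, base: f64, max_seconds: f64) -> f64 {
    // grows as base^n and is clamped to the configured ceiling
    base.powi(consecutive_failures as i32).min(max_seconds)
}

// With base = 1.5 and max = 60.0 as in the download task above:
// backoff_seconds(1, 1.5, 60.0) == 1.5
// backoff_seconds(5, 1.5, 60.0) ~= 7.6
// backoff_seconds(11, 1.5, 60.0) == 60.0 (capped)
```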
@@ -826,14 +884,13 @@ impl LayerInner {
|
|||||||
match res {
|
match res {
|
||||||
(Ok(()), _) => {
|
(Ok(()), _) => {
|
||||||
// our caller is cancellation safe so this is fine; if someone
|
// our caller is cancellation safe so this is fine; if someone
|
||||||
// else requests the layer, they'll find it already downloaded
|
// else requests the layer, they'll find it already downloaded.
|
||||||
// or redownload.
|
|
||||||
//
|
//
|
||||||
// however, could be that we should consider marking the layer
|
// See counter [`LayerImplMetrics::inc_init_needed_no_download`]
|
||||||
// for eviction? alas, cannot: because only DownloadedLayer
|
//
|
||||||
// will handle that.
|
// FIXME(#6028): however, could be that we should consider marking the
|
||||||
tracing::info!("layer file download completed after requester had cancelled");
|
// layer for eviction? alas, cannot: because only DownloadedLayer will
|
||||||
LAYER_IMPL_METRICS.inc_download_completed_without_requester();
|
// handle that.
|
||||||
},
|
},
|
||||||
(Err(e), _) => {
|
(Err(e), _) => {
|
||||||
// our caller is cancellation safe, but we might be racing with
|
// our caller is cancellation safe, but we might be racing with
|
||||||
@@ -866,21 +923,7 @@ impl LayerInner {
|
|||||||
|
|
||||||
Ok(permit)
|
Ok(permit)
|
||||||
}
|
}
|
||||||
Ok((Err(e), _permit)) => {
|
Ok((Err(_), _permit)) => Err(DownloadError::DownloadFailed),
|
||||||
// FIXME: this should be with the spawned task and be cancellation sensitive
|
|
||||||
let consecutive_failures =
|
|
||||||
self.consecutive_failures.fetch_add(1, Ordering::Relaxed);
|
|
||||||
tracing::error!(consecutive_failures, "layer file download failed: {e:#}");
|
|
||||||
let backoff = utils::backoff::exponential_backoff_duration_seconds(
|
|
||||||
consecutive_failures.min(u32::MAX as usize) as u32,
|
|
||||||
1.5,
|
|
||||||
60.0,
|
|
||||||
);
|
|
||||||
let backoff = std::time::Duration::from_secs_f64(backoff);
|
|
||||||
|
|
||||||
tokio::time::sleep(backoff).await;
|
|
||||||
Err(DownloadError::DownloadFailed)
|
|
||||||
}
|
|
||||||
Err(_gone) => Err(DownloadError::DownloadCancelled),
|
Err(_gone) => Err(DownloadError::DownloadCancelled),
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -950,14 +993,17 @@ impl LayerInner {
|
|||||||
|
|
||||||
/// `DownloadedLayer` is being dropped, so it calls this method.
|
/// `DownloadedLayer` is being dropped, so it calls this method.
|
||||||
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
|
fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
|
||||||
let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
|
let delete = self.wanted_deleted.load(Ordering::Acquire);
|
||||||
let evict = self.wanted_evicted.load(Ordering::Acquire);
|
let evict = self.wanted_evicted.load(Ordering::Acquire);
|
||||||
let can_evict = self.have_remote_client;
|
let can_evict = self.have_remote_client;
|
||||||
|
|
||||||
if gc {
|
if delete {
|
||||||
// do nothing now, only in LayerInner::drop
|
// do nothing now, only in LayerInner::drop -- this was originally implemented because
|
||||||
|
// we could had already scheduled the deletion at the time.
|
||||||
|
//
|
||||||
|
// FIXME: this is not true anymore, we can safely evict wanted deleted files.
|
||||||
} else if can_evict && evict {
|
} else if can_evict && evict {
|
||||||
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);
|
let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);
|
||||||
|
|
||||||
// downgrade for queueing, in case there's a tear down already ongoing we should not
|
// downgrade for queueing, in case there's a tear down already ongoing we should not
|
||||||
// hold it alive.
|
// hold it alive.
|
||||||
@@ -970,7 +1016,7 @@ impl LayerInner {
|
|||||||
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
|
crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
|
||||||
let _g = span.entered();
|
let _g = span.entered();
|
||||||
|
|
||||||
// if LayerInner is already dropped here, do nothing because the garbage collection
|
// if LayerInner is already dropped here, do nothing because the delete on drop
|
||||||
// has already ran while we were in queue
|
// has already ran while we were in queue
|
||||||
let Some(this) = this.upgrade() else {
|
let Some(this) = this.upgrade() else {
|
||||||
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
|
||||||
@@ -1074,7 +1120,7 @@ impl LayerInner {
|
|||||||
}
|
}
|
||||||
|
|
||||||
fn metadata(&self) -> LayerFileMetadata {
|
fn metadata(&self) -> LayerFileMetadata {
|
||||||
LayerFileMetadata::new(self.desc.file_size, self.generation)
|
LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -1189,41 +1235,50 @@ impl DownloadedLayer {

         let res = if owner.desc.is_delta {
             let summary = Some(delta_layer::Summary::expected(
-                owner.desc.tenant_id,
+                owner.desc.tenant_shard_id.tenant_id,
                 owner.desc.timeline_id,
                 owner.desc.key_range.clone(),
                 owner.desc.lsn_range.clone(),
             ));
             delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
                 .await
-                .map(LayerKind::Delta)
+                .map(|res| res.map(LayerKind::Delta))
         } else {
             let lsn = owner.desc.image_layer_lsn();
             let summary = Some(image_layer::Summary::expected(
-                owner.desc.tenant_id,
+                owner.desc.tenant_shard_id.tenant_id,
                 owner.desc.timeline_id,
                 owner.desc.key_range.clone(),
                 lsn,
             ));
             image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
                 .await
-                .map(LayerKind::Image)
-        }
-        // this will be a permanent failure
-        .context("load layer");
+                .map(|res| res.map(LayerKind::Image))
+        };

-        if let Err(e) = res.as_ref() {
-            LAYER_IMPL_METRICS.inc_permanent_loading_failures();
-            // TODO(#5815): we are not logging all errors, so temporarily log them here as well
-            tracing::error!("layer loading failed permanently: {e:#}");
+        match res {
+            Ok(Ok(layer)) => Ok(Ok(layer)),
+            Ok(Err(transient)) => Err(transient),
+            Err(permanent) => {
+                LAYER_IMPL_METRICS.inc_permanent_loading_failures();
+                // TODO(#5815): we are not logging all errors, so temporarily log them **once**
+                // here as well
+                let permanent = permanent.context("load layer");
+                tracing::error!("layer loading failed permanently: {permanent:#}");
+                Ok(Err(permanent))
+            }
         }
-        res
     };
-    self.kind.get_or_init(init).await.as_ref().map_err(|e| {
-        // errors are not clonabled, cannot but stringify
-        // test_broken_timeline matches this string
-        anyhow::anyhow!("layer loading failed: {e:#}")
-    })
+    self.kind
+        .get_or_try_init(init)
+        // return transient errors using `?`
+        .await?
+        .as_ref()
+        .map_err(|e| {
+            // errors are not clonabled, cannot but stringify
+            // test_broken_timeline matches this string
+            anyhow::anyhow!("layer loading failed: {e:#}")
+        })
 }

 async fn get_value_reconstruct_data(
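The switch from `get_or_init` to `get_or_try_init` above means a transient failure (the outer `Err`) is returned to the caller and the once-cell stays uninitialized, so a later access retries the load, while a permanent failure is stored in the cell as the inner `Err`. The crate's `heavier_once_cell` is internal, but the same error-propagating lazy-init pattern exists on `tokio::sync::OnceCell`; a minimal sketch under that assumption, with a made-up config-loading example:

```rust
// Sketch only: error-propagating lazy init, analogous to the
// heavier_once_cell::OnceCell::get_or_try_init call above.
use tokio::sync::OnceCell;

async fn load_config(cell: &OnceCell<String>, path: &str) -> Result<&String, std::io::Error> {
    cell.get_or_try_init(|| async {
        // if this read fails, the cell stays uninitialized and a later call retries
        tokio::fs::read_to_string(path).await
    })
    .await
}
```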
@@ -1352,35 +1407,37 @@ impl From<ResidentLayer> for Layer {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
-use metrics::{IntCounter, IntCounterVec};
+use metrics::IntCounter;

-struct LayerImplMetrics {
+pub(crate) struct LayerImplMetrics {
     started_evictions: IntCounter,
     completed_evictions: IntCounter,
-    cancelled_evictions: IntCounterVec,
+    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,

-    started_gcs: IntCounter,
-    completed_gcs: IntCounter,
-    failed_gcs: IntCounterVec,
+    started_deletes: IntCounter,
+    completed_deletes: IntCounter,
+    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,

-    rare_counters: IntCounterVec,
+    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
+    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
 }

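The counters are now pre-resolved per enum variant into an `enum_map::EnumMap`, so the hot path indexes an array (`failed_deletes[reason].inc()`) instead of doing a label lookup on an `IntCounterVec` every time. A minimal sketch of that pattern follows; the label strings and the helper name are placeholders, not the real ones used in this change.

```rust
// Sketch only: the per-variant counter pattern used above.
use enum_map::{Enum, EnumMap};

#[derive(Enum, Clone, Copy)]
enum DeleteFailed {
    TimelineGone,
    DeleteSchedulingFailed,
}

impl DeleteFailed {
    fn as_str(self) -> &'static str {
        match self {
            DeleteFailed::TimelineGone => "timeline_gone",
            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
        }
    }
}

// Resolve one IntCounter per variant once, so incrementing a counter later is
// a plain array index with no label lookup or unwrap on the hot path.
fn per_variant_counters(vec: &metrics::IntCounterVec) -> EnumMap<DeleteFailed, metrics::IntCounter> {
    EnumMap::from_array(std::array::from_fn(|i| {
        let reason = DeleteFailed::from_usize(i);
        vec.with_label_values(&[reason.as_str()])
    }))
}
```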
impl Default for LayerImplMetrics {
|
impl Default for LayerImplMetrics {
|
||||||
fn default() -> Self {
|
fn default() -> Self {
|
||||||
let evictions = metrics::register_int_counter_vec!(
|
use enum_map::Enum;
|
||||||
"pageserver_layer_evictions_count",
|
|
||||||
"Evictions started and completed in the Layer implementation",
|
// reminder: these will be pageserver_layer_* with "_total" suffix
|
||||||
&["state"]
|
|
||||||
|
let started_evictions = metrics::register_int_counter!(
|
||||||
|
"pageserver_layer_started_evictions",
|
||||||
|
"Evictions started in the Layer implementation"
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let completed_evictions = metrics::register_int_counter!(
|
||||||
|
"pageserver_layer_completed_evictions",
|
||||||
|
"Evictions completed in the Layer implementation"
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let started_evictions = evictions
|
|
||||||
.get_metric_with_label_values(&["started"])
|
|
||||||
.unwrap();
|
|
||||||
let completed_evictions = evictions
|
|
||||||
.get_metric_with_label_values(&["completed"])
|
|
||||||
.unwrap();
|
|
||||||
|
|
||||||
let cancelled_evictions = metrics::register_int_counter_vec!(
|
let cancelled_evictions = metrics::register_int_counter_vec!(
|
||||||
"pageserver_layer_cancelled_evictions_count",
|
"pageserver_layer_cancelled_evictions_count",
|
||||||
@@ -1389,23 +1446,36 @@ impl Default for LayerImplMetrics {
|
|||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
let gcs = metrics::register_int_counter_vec!(
|
let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
|
||||||
"pageserver_layer_gcs_count",
|
let reason = EvictionCancelled::from_usize(i);
|
||||||
"Garbage collections started and completed in the Layer implementation",
|
let s = reason.as_str();
|
||||||
&["state"]
|
cancelled_evictions.with_label_values(&[s])
|
||||||
|
}));
|
||||||
|
|
||||||
|
let started_deletes = metrics::register_int_counter!(
|
||||||
|
"pageserver_layer_started_deletes",
|
||||||
|
"Deletions on drop pending in the Layer implementation"
|
||||||
|
)
|
||||||
|
.unwrap();
|
||||||
|
let completed_deletes = metrics::register_int_counter!(
|
||||||
|
"pageserver_layer_completed_deletes",
|
||||||
|
"Deletions on drop completed in the Layer implementation"
|
||||||
)
|
)
|
||||||
.unwrap();
|
.unwrap();
|
||||||
|
|
||||||
-        let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap();
-        let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap();
-
-        let failed_gcs = metrics::register_int_counter_vec!(
-            "pageserver_layer_failed_gcs_count",
-            "Different reasons for garbage collections to have failed",
+        let failed_deletes = metrics::register_int_counter_vec!(
+            "pageserver_layer_failed_deletes_count",
+            "Different reasons for deletions on drop to have failed",
             &["reason"]
         )
         .unwrap();

+        let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let reason = DeleteFailed::from_usize(i);
+            let s = reason.as_str();
+            failed_deletes.with_label_values(&[s])
+        }));
+
         let rare_counters = metrics::register_int_counter_vec!(
             "pageserver_layer_assumed_rare_count",
             "Times unexpected or assumed rare event happened",
@@ -1413,16 +1483,29 @@ impl Default for LayerImplMetrics {
         )
         .unwrap();

+        let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
+            let event = RareEvent::from_usize(i);
+            let s = event.as_str();
+            rare_counters.with_label_values(&[s])
+        }));
+
+        let inits_cancelled = metrics::register_int_counter!(
+            "pageserver_layer_inits_cancelled_count",
+            "Times Layer initialization was cancelled",
+        )
+        .unwrap();
+
         Self {
             started_evictions,
             completed_evictions,
             cancelled_evictions,

-            started_gcs,
-            completed_gcs,
-            failed_gcs,
+            started_deletes,
+            completed_deletes,
+            failed_deletes,

             rare_counters,
+            inits_cancelled,
         }
     }
 }
@@ -1435,57 +1518,33 @@ impl LayerImplMetrics {
         self.completed_evictions.inc();
     }
     fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
-        self.cancelled_evictions
-            .get_metric_with_label_values(&[reason.as_str()])
-            .unwrap()
-            .inc()
+        self.cancelled_evictions[reason].inc()
     }

-    fn inc_started_gcs(&self) {
-        self.started_gcs.inc();
+    fn inc_started_deletes(&self) {
+        self.started_deletes.inc();
     }
-    fn inc_completed_gcs(&self) {
-        self.completed_gcs.inc();
+    fn inc_completed_deletes(&self) {
+        self.completed_deletes.inc();
     }
-    fn inc_gcs_failed(&self, reason: GcFailed) {
-        self.failed_gcs
-            .get_metric_with_label_values(&[reason.as_str()])
-            .unwrap()
-            .inc();
+    fn inc_deletes_failed(&self, reason: DeleteFailed) {
+        self.failed_deletes[reason].inc();
     }

-    /// Counted separatedly from failed gcs because we will complete the gc attempt regardless of
-    /// failure to delete local file.
-    fn inc_gc_removes_failed(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["gc_remove_failed"])
-            .unwrap()
-            .inc();
+    /// Counted separatedly from failed layer deletes because we will complete the layer deletion
+    /// attempt regardless of failure to delete local file.
+    fn inc_delete_removes_failed(&self) {
+        self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
     }

-    /// Expected rare because requires a race with `evict_blocking` and
-    /// `get_or_maybe_download`.
+    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
     fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["retried_gomd"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
     }

-    /// Expected rare because cancellations are unexpected
-    fn inc_download_completed_without_requester(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["download_completed_without"])
-            .unwrap()
-            .inc();
-    }
-
-    /// Expected rare because cancellations are unexpected
+    /// Expected rare because cancellations are unexpected, and failures are unexpected
     fn inc_download_failed_without_requester(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["download_failed_without"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
     }

     /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
@@ -1493,37 +1552,30 @@ impl LayerImplMetrics {
     /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
     /// Option.
     fn inc_raced_wanted_evicted_accesses(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["raced_wanted_evicted"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
     }

-    /// These are only expected for [`Self::inc_download_completed_without_requester`] amount when
+    /// These are only expected for [`Self::inc_init_cancelled`] amount when
     /// running with remote storage.
     fn inc_init_needed_no_download(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["init_needed_no_download"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::InitWithoutDownload].inc();
     }

     /// Expected rare because all layer files should be readable and good
     fn inc_permanent_loading_failures(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["permanent_loading_failure"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
     }

     fn inc_broadcast_lagged(&self) {
-        self.rare_counters
-            .get_metric_with_label_values(&["broadcast_lagged"])
-            .unwrap()
-            .inc();
+        self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
+    }
+
+    fn inc_init_cancelled(&self) {
+        self.inits_cancelled.inc()
     }
 }

+#[derive(enum_map::Enum)]
 enum EvictionCancelled {
     LayerGone,
     TimelineGone,
@@ -1552,19 +1604,47 @@ impl EvictionCancelled {
     }
 }

-enum GcFailed {
+#[derive(enum_map::Enum)]
+enum DeleteFailed {
     TimelineGone,
     DeleteSchedulingFailed,
 }

-impl GcFailed {
+impl DeleteFailed {
     fn as_str(&self) -> &'static str {
         match self {
-            GcFailed::TimelineGone => "timeline_gone",
-            GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
+            DeleteFailed::TimelineGone => "timeline_gone",
+            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
         }
     }
 }

-static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
+#[derive(enum_map::Enum)]
+enum RareEvent {
+    RemoveOnDropFailed,
+    RetriedGetOrMaybeDownload,
+    DownloadFailedWithoutRequester,
+    UpgradedWantedEvicted,
+    InitWithoutDownload,
+    PermanentLoadingFailure,
+    EvictAndWaitLagged,
+}
+
+impl RareEvent {
+    fn as_str(&self) -> &'static str {
+        use RareEvent::*;
+
+        match self {
+            RemoveOnDropFailed => "remove_on_drop_failed",
+            RetriedGetOrMaybeDownload => "retried_gomd",
+            DownloadFailedWithoutRequester => "download_failed_without",
+            UpgradedWantedEvicted => "raced_wanted_evicted",
+            InitWithoutDownload => "init_needed_no_download",
+            PermanentLoadingFailure => "permanent_loading_failure",
+            EvictAndWaitLagged => "broadcast_lagged",
+        }
+    }
+}
+
+pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
     once_cell::sync::Lazy::new(LayerImplMetrics::default);

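The `Default` impl above registers one labeled counter vector and then pre-resolves a child counter per enum variant into an `enum_map::EnumMap`, so the hot path becomes an array index plus `inc()` instead of a label lookup and `unwrap()`. A minimal standalone sketch of the same pattern, using the upstream `prometheus` crate directly rather than the pageserver's `metrics` facade (all names below are illustrative, not taken from the diff):

    use enum_map::{Enum, EnumMap};
    use prometheus::{register_int_counter_vec, IntCounter};

    #[derive(Enum, Clone, Copy)]
    enum Outcome {
        Ok,
        Failed,
    }

    impl Outcome {
        fn as_str(self) -> &'static str {
            match self {
                Outcome::Ok => "ok",
                Outcome::Failed => "failed",
            }
        }
    }

    struct Counters {
        per_outcome: EnumMap<Outcome, IntCounter>,
    }

    impl Default for Counters {
        fn default() -> Self {
            let vec = register_int_counter_vec!(
                "example_outcomes_total",
                "Outcomes of an example operation",
                &["outcome"]
            )
            .unwrap();
            // Resolve every label value once, up front; afterwards incrementing
            // is just `counters.per_outcome[Outcome::Failed].inc()`.
            let per_outcome = EnumMap::from_array(std::array::from_fn(|i| {
                let outcome = Outcome::from_usize(i);
                vec.with_label_values(&[outcome.as_str()])
            }));
            Self { per_outcome }
        }
    }

The `as_str` mapping keeps the exported Prometheus label names stable even if the enum variants are later renamed, which is the same reason the diff keeps `RareEvent::as_str` next to the enum.
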
@@ -1,9 +1,7 @@
 use core::fmt::Display;
+use pageserver_api::shard::TenantShardId;
 use std::ops::Range;
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::{id::TimelineId, lsn::Lsn};

 use crate::repository::Key;

@@ -11,12 +9,15 @@ use super::{DeltaFileName, ImageFileName, LayerFileName};

 use serde::{Deserialize, Serialize};

+#[cfg(test)]
+use utils::id::TenantId;
+
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct PersistentLayerDesc {
-    pub tenant_id: TenantId,
+    pub tenant_shard_id: TenantShardId,
     pub timeline_id: TimelineId,
     /// Range of keys that this layer covers
     pub key_range: Range<Key>,
@@ -56,7 +57,7 @@ impl PersistentLayerDesc {
     #[cfg(test)]
     pub fn new_test(key_range: Range<Key>) -> Self {
         Self {
-            tenant_id: TenantId::generate(),
+            tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
             timeline_id: TimelineId::generate(),
             key_range,
             lsn_range: Lsn(0)..Lsn(1),
@@ -66,14 +67,14 @@ impl PersistentLayerDesc {
     }

     pub fn new_img(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key_range: Range<Key>,
         lsn: Lsn,
         file_size: u64,
     ) -> Self {
         Self {
-            tenant_id,
+            tenant_shard_id,
             timeline_id,
             key_range,
             lsn_range: Self::image_layer_lsn_range(lsn),
@@ -83,14 +84,14 @@ impl PersistentLayerDesc {
     }

     pub fn new_delta(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         key_range: Range<Key>,
         lsn_range: Range<Lsn>,
         file_size: u64,
     ) -> Self {
         Self {
-            tenant_id,
+            tenant_shard_id,
             timeline_id,
             key_range,
             lsn_range,
@@ -100,18 +101,22 @@ impl PersistentLayerDesc {
     }

     pub fn from_filename(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
         timeline_id: TimelineId,
         filename: LayerFileName,
         file_size: u64,
     ) -> Self {
         match filename {
             LayerFileName::Image(i) => {
-                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
-            }
-            LayerFileName::Delta(d) => {
-                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
+                Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
             }
+            LayerFileName::Delta(d) => Self::new_delta(
+                tenant_shard_id,
+                timeline_id,
+                d.key_range,
+                d.lsn_range,
+                file_size,
+            ),
         }
     }

@@ -172,10 +177,6 @@ impl PersistentLayerDesc {
         self.timeline_id
     }

-    pub fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
     /// Does this layer only contain some data for the key-range (incremental),
     /// or does it contain a version of every page? This is important to know
     /// for garbage collecting old layers: an incremental layer depends on
@@ -192,7 +193,7 @@ impl PersistentLayerDesc {
         if self.is_delta {
             println!(
                 "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
-                self.tenant_id,
+                self.tenant_shard_id,
                 self.timeline_id,
                 self.key_range.start,
                 self.key_range.end,
@@ -204,7 +205,7 @@ impl PersistentLayerDesc {
         } else {
             println!(
                 "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-                self.tenant_id,
+                self.tenant_shard_id,
                 self.timeline_id,
                 self.key_range.start,
                 self.key_range.end,

@@ -44,6 +44,7 @@ pub(crate) enum BackgroundLoopKind {
     Eviction,
     ConsumptionMetricsCollectMetrics,
     ConsumptionMetricsSyntheticSizeWorker,
+    InitialLogicalSizeCalculation,
 }

 impl BackgroundLoopKind {
@@ -53,31 +54,21 @@ impl BackgroundLoopKind {
     }
 }

-pub(crate) enum RateLimitError {
-    Cancelled,
-}
-
-pub(crate) async fn concurrent_background_tasks_rate_limit(
+pub(crate) async fn concurrent_background_tasks_rate_limit_permit(
     loop_kind: BackgroundLoopKind,
     _ctx: &RequestContext,
-    cancel: &CancellationToken,
-) -> Result<impl Drop, RateLimitError> {
+) -> impl Drop {
     crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_START_COUNT
         .with_label_values(&[loop_kind.as_static_str()])
         .inc();

     scopeguard::defer!(
         crate::metrics::BACKGROUND_LOOP_SEMAPHORE_WAIT_FINISH_COUNT.with_label_values(&[loop_kind.as_static_str()]).inc();
     );
-    tokio::select! {
-        permit = CONCURRENT_BACKGROUND_TASKS.acquire() => {
-            match permit {
-                Ok(permit) => Ok(permit),
-                Err(_closed) => unreachable!("we never close the semaphore"),
-            }
-        },
-        _ = cancel.cancelled() => {
-            Err(RateLimitError::Cancelled)
-        }
+    match CONCURRENT_BACKGROUND_TASKS.acquire().await {
+        Ok(permit) => permit,
+        Err(_closed) => unreachable!("we never close the semaphore"),
     }
 }

@@ -86,14 +77,15 @@ pub fn start_background_loops(
     tenant: &Arc<Tenant>,
     background_jobs_can_start: Option<&completion::Barrier>,
 ) {
-    let tenant_id = tenant.tenant_id;
+    let tenant_shard_id = tenant.tenant_shard_id;
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::Compaction,
-        Some(tenant_id),
+        Some(tenant_shard_id),
         None,
-        &format!("compactor for tenant {tenant_id}"),
+        &format!("compactor for tenant {tenant_shard_id}"),
         false,
+        tenant.cancel.child_token(),
         {
             let tenant = Arc::clone(tenant);
             let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -104,7 +96,7 @@ pub fn start_background_loops(
                 _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
             };
             compaction_loop(tenant, cancel)
-                .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
+                .instrument(info_span!("compaction_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                 .await;
             Ok(())
         }
@@ -113,10 +105,11 @@ pub fn start_background_loops(
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
         TaskKind::GarbageCollector,
-        Some(tenant_id),
+        Some(tenant_shard_id),
         None,
-        &format!("garbage collector for tenant {tenant_id}"),
+        &format!("garbage collector for tenant {tenant_shard_id}"),
         false,
+        tenant.cancel.child_token(),
         {
             let tenant = Arc::clone(tenant);
             let background_jobs_can_start = background_jobs_can_start.cloned();
@@ -127,7 +120,7 @@ pub fn start_background_loops(
                 _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
             };
             gc_loop(tenant, cancel)
-                .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
+                .instrument(info_span!("gc_loop", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug()))
                 .await;
             Ok(())
         }
@@ -180,16 +173,16 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             // Run compaction
             if let Err(e) = tenant.compaction_iteration(&cancel, &ctx).await {
                 let wait_duration = backoff::exponential_backoff_duration_seconds(
-                    error_run_count,
+                    error_run_count + 1,
                     1.0,
                     MAX_BACKOFF_SECS,
                 );
                 error_run_count += 1;
+                let wait_duration = Duration::from_secs_f64(wait_duration);
                 error!(
-                    "Compaction failed {error_run_count} times, retrying in {:?}: {e:?}",
-                    wait_duration
+                    "Compaction failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
                 );
-                Duration::from_secs_f64(wait_duration)
+                wait_duration
             } else {
                 error_run_count = 0;
                 period
@@ -198,6 +191,10 @@ async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {

         warn_when_period_overrun(started_at.elapsed(), period, BackgroundLoopKind::Compaction);

+        // Perhaps we did no work and the walredo process has been idle for some time:
+        // give it a chance to shut down to avoid leaving walredo process running indefinitely.
+        tenant.walredo_mgr.maybe_quiesce(period * 10);
+
         // Sleep
         if tokio::time::timeout(sleep_duration, cancel.cancelled())
             .await
@@ -257,20 +254,20 @@ async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
             } else {
                 // Run gc
                 let res = tenant
-                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx)
+                    .gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &cancel, &ctx)
                     .await;
                 if let Err(e) = res {
                     let wait_duration = backoff::exponential_backoff_duration_seconds(
-                        error_run_count,
+                        error_run_count + 1,
                         1.0,
                         MAX_BACKOFF_SECS,
                     );
                     error_run_count += 1;
+                    let wait_duration = Duration::from_secs_f64(wait_duration);
                     error!(
-                        "Gc failed {error_run_count} times, retrying in {:?}: {e:?}",
-                        wait_duration
+                        "Gc failed {error_run_count} times, retrying in {wait_duration:?}: {e:?}",
                     );
-                    Duration::from_secs_f64(wait_duration)
+                    wait_duration
                 } else {
                     error_run_count = 0;
                     period

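In the hunk above, the rate-limit helper stops taking a `CancellationToken` and simply waits on the global semaphore, handing back the permit as `impl Drop`; callers that want to abandon the wait now race the acquisition against their own token, which is exactly what the eviction task hunks further down do. A standalone sketch of that call-site shape, using a plain `tokio::sync::Semaphore` as a stand-in for the pageserver's background-task limit (function names are illustrative):

    use std::sync::Arc;
    use tokio::sync::{OwnedSemaphorePermit, Semaphore};
    use tokio_util::sync::CancellationToken;

    /// Stand-in for the helper: always waits, never fails, and hands back a
    /// guard that releases the slot when dropped.
    async fn acquire_background_permit(limit: Arc<Semaphore>) -> OwnedSemaphorePermit {
        match limit.acquire_owned().await {
            Ok(permit) => permit,
            Err(_closed) => unreachable!("we never close the semaphore"),
        }
    }

    /// One loop iteration: the caller, not the helper, decides what cancellation means.
    async fn one_iteration(limit: Arc<Semaphore>, cancel: &CancellationToken) -> bool {
        let _permit = tokio::select! {
            permit = acquire_background_permit(limit) => permit,
            _ = cancel.cancelled() => return false, // bail out of this iteration
        };
        // ... the rate-limited work runs while `_permit` is held ...
        true
    }

Moving the cancellation decision to the caller keeps the helper infallible and removes the `RateLimitError` type entirely.
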
File diff suppressed because it is too large.

@@ -4,13 +4,11 @@ use std::{
 };

 use anyhow::Context;
-use pageserver_api::models::TimelineState;
+use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
+use tokio_util::sync::CancellationToken;
 use tracing::{debug, error, info, instrument, warn, Instrument, Span};
-use utils::{
-    crashsafe, fs_ext,
-    id::{TenantId, TimelineId},
-};
+use utils::{crashsafe, fs_ext, id::TimelineId};

 use crate::{
     config::PageServerConf,
@@ -24,7 +22,6 @@ use crate::{
         },
         CreateTimelineCause, DeleteTimelineError, Tenant,
     },
-    InitializationOrder,
 };

 use super::{Timeline, TimelineResources};
@@ -47,7 +44,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
     // Shut down the layer flush task before the remote client, as one depends on the other
     task_mgr::shutdown_tasks(
         Some(TaskKind::LayerFlushTask),
-        Some(timeline.tenant_id),
+        Some(timeline.tenant_shard_id),
         Some(timeline.timeline_id),
     )
     .await;
@@ -73,7 +70,12 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
     // NB: This and other delete_timeline calls do not run as a task_mgr task,
     // so, they are not affected by this shutdown_tasks() call.
     info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
+    task_mgr::shutdown_tasks(
+        None,
+        Some(timeline.tenant_shard_id),
+        Some(timeline.timeline_id),
+    )
+    .await;

     fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
         Err(anyhow::anyhow!(
@@ -110,40 +112,11 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
     Ok(())
 }

-// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued.
-// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json
-// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote"
-// and "this timeline is deleted we should continue with removal of local state". So to avoid the ambiguity we use a mark file.
-// After index part is deleted presence of this mark file indentifies that it was a deletion intention.
-// So we can just remove the mark file.
-async fn create_delete_mark(
-    conf: &PageServerConf,
-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-) -> Result<(), DeleteTimelineError> {
-    fail::fail_point!("timeline-delete-before-delete-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: timeline-delete-before-delete-mark"
-        ))?
-    });
-    let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id);
-
-    // Note: we're ok to replace existing file.
-    let _ = std::fs::OpenOptions::new()
-        .write(true)
-        .create(true)
-        .open(&marker_path)
-        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
-
-    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
-    Ok(())
-}
-
-/// Grab the layer_removal_cs lock, and actually perform the deletion.
+/// Grab the compaction and gc locks, and actually perform the deletion.
 ///
-/// This lock prevents prevents GC or compaction from running at the same time.
-/// The GC task doesn't register itself with the timeline it's operating on,
-/// so it might still be running even though we called `shutdown_tasks`.
+/// The locks prevent GC or compaction from running at the same time. The background tasks do not
+/// register themselves with the timeline it's operating on, so it might still be running even
+/// though we called `shutdown_tasks`.
 ///
 /// Note that there are still other race conditions between
 /// GC, compaction and timeline deletion. See
@@ -151,19 +124,24 @@ async fn create_delete_mark(
 ///
 /// No timeout here, GC & Compaction should be responsive to the
 /// `TimelineState::Stopping` change.
-async fn delete_local_layer_files(
+// pub(super): documentation link
+pub(super) async fn delete_local_layer_files(
     conf: &PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     timeline: &Timeline,
 ) -> anyhow::Result<()> {
-    info!("waiting for layer_removal_cs.lock()");
-    let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-    info!("got layer_removal_cs.lock(), deleting layer files");
+    let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
+    let guards = crate::timed(
+        guards,
+        "acquire gc and compaction locks",
+        std::time::Duration::from_secs(5),
+    )
+    .await;

     // NB: storage_sync upload tasks that reference these layers have been cancelled
     // by the caller.

-    let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
+    let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);

     fail::fail_point!("timeline-delete-before-rm", |_| {
         Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
@@ -179,8 +157,8 @@ async fn delete_local_layer_files(
             // because of a previous failure/cancellation at/after
             // failpoint timeline-delete-after-rm.
             //
-            // It can also happen if we race with tenant detach, because,
-            // it doesn't grab the layer_removal_cs lock.
+            // ErrorKind::NotFound can also happen if we race with tenant detach, because,
+            // no locks are shared.
             //
             // For now, log and continue.
             // warn! level is technically not appropriate for the
@@ -199,7 +177,7 @@ async fn delete_local_layer_files(
         return Ok(());
     }

-    let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
+    let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);

     for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
         #[cfg(feature = "testing")]
@@ -248,8 +226,8 @@ async fn delete_local_layer_files(
             .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
     }

-    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
-    drop(layer_removal_guard);
+    info!("finished deleting layer files, releasing locks");
+    drop(guards);

     fail::fail_point!("timeline-delete-after-rm", |_| {
         Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -274,11 +252,11 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
 // (nothing can fail after its deletion)
 async fn cleanup_remaining_timeline_fs_traces(
     conf: &PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
     timeline_id: TimelineId,
 ) -> anyhow::Result<()> {
     // Remove local metadata
-    tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
+    tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
         .await
         .or_else(fs_ext::ignore_not_found)
         .context("remove metadata")?;
@@ -290,7 +268,7 @@ async fn cleanup_remaining_timeline_fs_traces(
     });

     // Remove timeline dir
-    tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
+    tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
         .await
         .or_else(fs_ext::ignore_not_found)
         .context("timeline dir")?;
@@ -305,13 +283,15 @@ async fn cleanup_remaining_timeline_fs_traces(
     // to be reordered later and thus missed if a crash occurs.
     // Note that we dont need to sync after mark file is removed
     // because we can tolerate the case when mark file reappears on startup.
-    let timeline_path = conf.timelines_path(&tenant_id);
+    let timeline_path = conf.timelines_path(&tenant_shard_id);
     crashsafe::fsync_async(timeline_path)
         .await
         .context("fsync_pre_mark_remove")?;

     // Remove delete mark
-    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
+    // TODO: once we are confident that no more exist in the field, remove this
+    // line. It cleans up a legacy marker file that might in rare cases be present.
+    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
         .await
         .or_else(fs_ext::ignore_not_found)
         .context("remove delete mark")
@@ -377,7 +357,7 @@ impl DeleteTimelineFlow {
     // NB: If this fails half-way through, and is retried, the retry will go through
     // all the same steps again. Make sure the code here is idempotent, and don't
     // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
+    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
     pub async fn run(
         tenant: &Arc<Tenant>,
         timeline_id: TimelineId,
@@ -391,8 +371,6 @@ impl DeleteTimelineFlow {

         set_deleted_in_remote_index(&timeline).await?;

-        create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?;
-
         fail::fail_point!("timeline-delete-before-schedule", |_| {
             Err(anyhow::anyhow!(
                 "failpoint: timeline-delete-before-schedule"
@@ -429,7 +407,7 @@ impl DeleteTimelineFlow {
         local_metadata: &TimelineMetadata,
         remote_client: Option<RemoteTimelineClient>,
         deletion_queue_client: DeletionQueueClient,
-        init_order: Option<&InitializationOrder>,
+        cancel: CancellationToken,
     ) -> anyhow::Result<()> {
         // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
         // RemoteTimelineClient is the only functioning part.
@@ -442,10 +420,10 @@ impl DeleteTimelineFlow {
                 remote_client,
                 deletion_queue_client,
             },
-            init_order,
             // Important. We dont pass ancestor above because it can be missing.
             // Thus we need to skip the validation here.
             CreateTimelineCause::Delete,
+            cancel,
         )
         .context("create_timeline_struct")?;

@@ -464,10 +442,6 @@ impl DeleteTimelineFlow {

         guard.mark_in_progress()?;

-        // Note that delete mark can be missing on resume
-        // because we create delete mark after we set deleted_at in the index part.
-        create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?;
-
         Self::schedule_background(guard, tenant.conf, tenant, timeline);

         Ok(())
@@ -479,7 +453,8 @@ impl DeleteTimelineFlow {
         timeline_id: TimelineId,
     ) -> anyhow::Result<()> {
         let r =
-            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
+            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
+                .await;
         info!("Done");
         r
     }
@@ -550,16 +525,17 @@ impl DeleteTimelineFlow {
         tenant: Arc<Tenant>,
         timeline: Arc<Timeline>,
     ) {
-        let tenant_id = timeline.tenant_id;
+        let tenant_shard_id = timeline.tenant_shard_id;
         let timeline_id = timeline.timeline_id;

         task_mgr::spawn(
             task_mgr::BACKGROUND_RUNTIME.handle(),
             TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
+            Some(tenant_shard_id),
             Some(timeline_id),
             "timeline_delete",
             false,
+            tenant.cancel.child_token(),
             async move {
                 if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await {
                     error!("Error: {err:#}");
@@ -569,7 +545,7 @@ impl DeleteTimelineFlow {
             }
             .instrument({
                 let span =
-                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
+                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
                 span.follows_from(Span::current());
                 span
             }),
@@ -582,13 +558,14 @@ impl DeleteTimelineFlow {
         tenant: &Tenant,
         timeline: &Timeline,
     ) -> Result<(), DeleteTimelineError> {
-        delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
+        delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;

         delete_remote_layers_and_index(timeline).await?;

         pausable_failpoint!("in_progress_delete");

-        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
+        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
+            .await?;

         remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

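The deletion path above now takes the gc and compaction locks with `tokio::join!` and wraps the wait in `crate::timed(...)` with a five second threshold. That helper's implementation is not part of this compare; purely as a hedged sketch of the shape such a wrapper could take (hypothetical code, not the pageserver's `crate::timed`):

    use std::future::Future;
    use std::time::Duration;
    use tracing::info;

    /// Hypothetical sketch of a "timed" wrapper: await `fut`, but log once if it
    /// has not finished within `warn_after`, then keep waiting for the result.
    async fn timed_sketch<F: Future>(fut: F, what: &str, warn_after: Duration) -> F::Output {
        tokio::pin!(fut);
        match tokio::time::timeout(warn_after, &mut fut).await {
            Ok(output) => output,
            Err(_elapsed) => {
                info!("{what} is taking longer than {warn_after:?}, still waiting");
                fut.await
            }
        }
    }

In the diff the wrapped future is the joined pair of lock acquisitions, so a slow GC or compaction pass surfaces in the logs instead of silently stalling the delete.
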
@@ -30,7 +30,7 @@ use crate::{
     task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
     tenant::{
         config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
-        tasks::{BackgroundLoopKind, RateLimitError},
+        tasks::BackgroundLoopKind,
         timeline::EvictionError,
         LogicalSizeCalculationCause, Tenant,
     },
@@ -60,10 +60,14 @@ impl Timeline {
         task_mgr::spawn(
             BACKGROUND_RUNTIME.handle(),
             TaskKind::Eviction,
-            Some(self.tenant_id),
+            Some(self.tenant_shard_id),
             Some(self.timeline_id),
-            &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
+            &format!(
+                "layer eviction for {}/{}",
+                self.tenant_shard_id, self.timeline_id
+            ),
             false,
+            self.cancel.child_token(),
             async move {
                 let cancel = task_mgr::shutdown_token();
                 tokio::select! {
@@ -77,7 +81,7 @@ impl Timeline {
         );
     }

-    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
+    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
     async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
         use crate::tenant::tasks::random_init_delay;
         {
@@ -155,15 +159,14 @@ impl Timeline {
     ) -> ControlFlow<()> {
         let now = SystemTime::now();

-        let _permit = match crate::tenant::tasks::concurrent_background_tasks_rate_limit(
+        let acquire_permit = crate::tenant::tasks::concurrent_background_tasks_rate_limit_permit(
             BackgroundLoopKind::Eviction,
             ctx,
-            cancel,
-        )
-        .await
-        {
-            Ok(permit) => permit,
-            Err(RateLimitError::Cancelled) => return ControlFlow::Break(()),
+        );
+
+        let _permit = tokio::select! {
+            permit = acquire_permit => permit,
+            _ = cancel.cancelled() => return ControlFlow::Break(()),
         };

         // If we evict layers but keep cached values derived from those layers, then
@@ -209,11 +212,21 @@ impl Timeline {
         // Gather layers for eviction.
         // NB: all the checks can be invalidated as soon as we release the layer map lock.
         // We don't want to hold the layer map lock during eviction.

         // So, we just need to deal with this.
-        let candidates: Vec<_> = {
+        let remote_client = match self.remote_client.as_ref() {
+            Some(c) => c,
+            None => {
+                error!("no remote storage configured, cannot evict layers");
+                return ControlFlow::Continue(());
+            }
+        };
+
+        let mut js = tokio::task::JoinSet::new();
+        {
             let guard = self.layers.read().await;
             let layers = guard.layer_map();
-            let mut candidates = Vec::new();
             for hist_layer in layers.iter_historic_layers() {
                 let hist_layer = guard.get_from_desc(&hist_layer);

@@ -259,55 +272,49 @@ impl Timeline {
                         continue;
                     }
                 };
+                let layer = guard.drop_eviction_guard();
                 if no_activity_for > p.threshold {
-                    candidates.push(guard.drop_eviction_guard())
+                    let remote_client = remote_client.clone();
+                    // this could cause a lot of allocations in some cases
+                    js.spawn(async move { layer.evict_and_wait(&remote_client).await });
+                    stats.candidates += 1;
                 }
             }
-            candidates
-        };
-        stats.candidates = candidates.len();
-
-        let remote_client = match self.remote_client.as_ref() {
-            None => {
-                error!(
-                    num_candidates = candidates.len(),
-                    "no remote storage configured, cannot evict layers"
-                );
-                return ControlFlow::Continue(());
-            }
-            Some(c) => c,
         };

-        let results = match self.evict_layer_batch(remote_client, &candidates).await {
-            Err(pre_err) => {
-                stats.errors += candidates.len();
-                error!("could not do any evictions: {pre_err:#}");
-                return ControlFlow::Continue(());
+        let join_all = async move {
+            while let Some(next) = js.join_next().await {
+                match next {
+                    Ok(Ok(())) => stats.evicted += 1,
+                    Ok(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
+                        stats.not_evictable += 1;
+                    }
+                    Err(je) if je.is_cancelled() => unreachable!("not used"),
+                    Err(je) if je.is_panic() => {
+                        /* already logged */
+                        stats.errors += 1;
+                    }
+                    Err(je) => tracing::error!("unknown JoinError: {je:?}"),
+                }
             }
-            Ok(results) => results,
+            stats
         };
-        assert_eq!(results.len(), candidates.len());
-        for result in results {
-            match result {
-                None => {
-                    stats.skipped_for_shutdown += 1;
-                }
-                Some(Ok(())) => {
-                    stats.evicted += 1;
-                }
-                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                    // compaction/gc removed the file while we were waiting on layer_removal_cs
-                    stats.not_evictable += 1;
+
+        tokio::select! {
+            stats = join_all => {
+                if stats.candidates == stats.not_evictable {
+                    debug!(stats=?stats, "eviction iteration complete");
+                } else if stats.errors > 0 || stats.not_evictable > 0 {
+                    warn!(stats=?stats, "eviction iteration complete");
+                } else {
+                    info!(stats=?stats, "eviction iteration complete");
                 }
             }
+            _ = cancel.cancelled() => {
+                // just drop the joinset to "abort"
+            }
         }
-        if stats.candidates == stats.not_evictable {
-            debug!(stats=?stats, "eviction iteration complete");
-        } else if stats.errors > 0 || stats.not_evictable > 0 {
-            warn!(stats=?stats, "eviction iteration complete");
-        } else {
-            info!(stats=?stats, "eviction iteration complete");
-        }
+
         ControlFlow::Continue(())
     }

@@ -341,7 +348,7 @@ impl Timeline {
         // Make one of the tenant's timelines draw the short straw and run the calculation.
         // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
+        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id, true) {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
@@ -351,7 +358,7 @@ impl Timeline {
         match state.last_layer_access_imitation {
             Some(ts) if ts.elapsed() < inter_imitate_period => { /* no need to run */ }
             _ => {
-                self.imitate_synthetic_size_calculation_worker(&tenant, ctx, cancel)
+                self.imitate_synthetic_size_calculation_worker(&tenant, cancel, ctx)
                     .await;
                 state.last_layer_access_imitation = Some(tokio::time::Instant::now());
             }
@@ -417,8 +424,8 @@ impl Timeline {
     async fn imitate_synthetic_size_calculation_worker(
         &self,
         tenant: &Arc<Tenant>,
-        ctx: &RequestContext,
         cancel: &CancellationToken,
+        ctx: &RequestContext,
     ) {
         if self.conf.metric_collection_endpoint.is_none() {
             // We don't start the consumption metrics task if this is not set in the config.
@@ -457,6 +464,7 @@ impl Timeline {
             None,
             &mut throwaway_cache,
             LogicalSizeCalculationCause::EvictionTaskImitation,
+            cancel,
             ctx,
         )
         .instrument(info_span!("gather_inputs"));

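The rewritten eviction iteration above spawns one `evict_and_wait` per candidate into a `tokio::task::JoinSet`, drains it while aggregating stats, and races the drain against the cancellation token; the "just drop the joinset to abort" comment relies on the fact that dropping a `JoinSet` aborts whatever tasks are still running. A standalone sketch of that drain-or-cancel shape, with placeholder work instead of layer eviction:

    use tokio::task::JoinSet;
    use tokio_util::sync::CancellationToken;

    async fn drain_or_cancel(cancel: CancellationToken) -> (usize, usize) {
        let mut js = JoinSet::new();
        for i in 0..4u64 {
            // Placeholder work; in the pageserver this would be layer.evict_and_wait(...).
            js.spawn(async move { i % 2 == 0 });
        }

        let join_all = async move {
            let (mut ok, mut not_ok) = (0, 0);
            while let Some(next) = js.join_next().await {
                match next {
                    Ok(true) => ok += 1,
                    Ok(false) => not_ok += 1,
                    Err(join_err) => eprintln!("task panicked or was aborted: {join_err}"),
                }
            }
            (ok, not_ok)
        };

        tokio::select! {
            stats = join_all => stats,
            _ = cancel.cancelled() => {
                // `js` is owned by `join_all`, which is dropped here; dropping the
                // JoinSet aborts the remaining tasks.
                (0, 0)
            }
        }
    }

Compared with the old `evict_layer_batch` call, this lets each candidate make progress independently while still giving the loop a single cancellation point.
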
@@ -13,6 +13,7 @@ use crate::{
 };
 use anyhow::Context;
 use camino::Utf8Path;
+use pageserver_api::shard::ShardIndex;
 use std::{collections::HashMap, str::FromStr};
 use utils::lsn::Lsn;

@@ -107,6 +108,7 @@ pub(super) fn reconcile(
     index_part: Option<&IndexPart>,
     disk_consistent_lsn: Lsn,
     generation: Generation,
+    shard: ShardIndex,
 ) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
     use Decision::*;

@@ -118,10 +120,13 @@ pub(super) fn reconcile(
         .map(|(name, file_size)| {
             (
                 name,
-                // The generation here will be corrected to match IndexPart in the merge below, unless
+                // The generation and shard here will be corrected to match IndexPart in the merge below, unless
                 // it is not in IndexPart, in which case using our current generation makes sense
                 // because it will be uploaded in this generation.
-                (Some(LayerFileMetadata::new(file_size, generation)), None),
+                (
+                    Some(LayerFileMetadata::new(file_size, generation, shard)),
+                    None,
+                ),
             )
         })
         .collect::<Collected>();

@@ -1,8 +1,9 @@
 use anyhow::{bail, ensure, Context, Result};
+use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
 use utils::{
-    id::{TenantId, TimelineId},
+    id::TimelineId,
     lsn::{AtomicLsn, Lsn},
 };

@@ -73,7 +74,7 @@ impl LayerManager {
         last_record_lsn: Lsn,
         conf: &'static PageServerConf,
         timeline_id: TimelineId,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
     ) -> Result<Arc<InMemoryLayer>> {
         ensure!(lsn.is_aligned());

@@ -109,7 +110,8 @@ impl LayerManager {
             lsn
         );

-        let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?;
+        let new_layer =
+            InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
         let layer = Arc::new(new_layer);

         self.layer_map.open_layer = Some(layer.clone());
@@ -190,7 +192,6 @@ impl LayerManager {
     /// Called when compaction is completed.
     pub(crate) fn finish_compact_l0(
         &mut self,
-        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
         compact_from: &[Layer],
         compact_to: &[ResidentLayer],
         metrics: &TimelineMetrics,
@@ -201,25 +202,16 @@ impl LayerManager {
             metrics.record_new_file_metrics(l.layer_desc().file_size);
         }
         for l in compact_from {
-            Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr);
+            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
         }
         updates.flush();
     }

-    /// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
-    pub(crate) fn finish_gc_timeline(
-        &mut self,
-        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
-        gc_layers: Vec<Layer>,
-    ) {
+    /// Called when garbage collect has selected the layers to be removed.
+    pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
         let mut updates = self.layer_map.batch_update();
         for doomed_layer in gc_layers {
-            Self::delete_historic_layer(
-                layer_removal_cs,
-                &doomed_layer,
-                &mut updates,
-                &mut self.layer_fmgr,
-            );
+            Self::delete_historic_layer(doomed_layer, &mut updates, &mut self.layer_fmgr);
         }
         updates.flush()
     }
@@ -238,7 +230,6 @@ impl LayerManager {
     /// Remote storage is not affected by this operation.
     fn delete_historic_layer(
         // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
         layer: &Layer,
         updates: &mut BatchedUpdates<'_>,
         mapping: &mut LayerFileManager<Layer>,
@@ -252,7 +243,7 @@ impl LayerManager {
         // map index without actually rebuilding the index.
         updates.remove_historic(desc);
         mapping.remove(layer);
-        layer.garbage_collect_on_drop();
+        layer.delete_on_drop();
     }

     pub(crate) fn contains(&self, layer: &Layer) -> bool {

@@ -1,11 +1,10 @@
 use anyhow::Context;
-use once_cell::sync::OnceCell;

-use tokio::sync::Semaphore;
+use once_cell::sync::OnceCell;
+use tokio_util::sync::CancellationToken;
 use utils::lsn::Lsn;

-use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
-use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};

 /// Internal structure to hold all data needed for logical size calculation.
 ///
@@ -23,10 +22,17 @@ pub(super) struct LogicalSize {
     ///
     /// NOTE: size at a given LSN is constant, but after a restart we will calculate
     /// the initial size at a different LSN.
-    pub initial_logical_size: OnceCell<u64>,
+    pub initial_logical_size: OnceCell<(
+        u64,
+        crate::metrics::initial_logical_size::FinishedCalculationGuard,
+    )>,

-    /// Semaphore to track ongoing calculation of `initial_logical_size`.
-    pub initial_size_computation: Arc<tokio::sync::Semaphore>,
+    /// Cancellation for the best-effort logical size calculation.
+    ///
+    /// The token is kept in a once-cell so that we can error out if a higher priority
+    /// request comes in *before* we have started the normal logical size calculation.
+    pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
+        OnceCell<CancellationToken>,

     /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
     pub initial_part_end: Option<Lsn>,
@@ -52,25 +58,57 @@ pub(super) struct LogicalSize {
     /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
     /// to modify this, it will also keep the prometheus metric in sync.
     pub size_added_after_initial: AtomicI64,
+
+    /// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`].
+    pub(super) did_return_approximate_to_walreceiver: AtomicBool,
 }

 /// Normalized current size, that the data in pageserver occupies.
 #[derive(Debug, Clone, Copy)]
-pub(super) enum CurrentLogicalSize {
+pub(crate) enum CurrentLogicalSize {
     /// The size is not yet calculated to the end, this is an intermediate result,
     /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
     /// yet total logical size cannot be below 0.
-    Approximate(u64),
+    Approximate(Approximate),
     // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
     // available for observation without any calculations.
-    Exact(u64),
+    Exact(Exact),
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub(crate) enum Accuracy {
+    Approximate,
+    Exact,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct Approximate(u64);
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct Exact(u64);
+
+impl From<&Approximate> for u64 {
+    fn from(value: &Approximate) -> Self {
+        value.0
+    }
+}
+
+impl From<&Exact> for u64 {
+    fn from(val: &Exact) -> Self {
+        val.0
+    }
 }

 impl CurrentLogicalSize {
-    pub(super) fn size(&self) -> u64 {
-        *match self {
-            Self::Approximate(size) => size,
-            Self::Exact(size) => size,
+    pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
+        match self {
+            Self::Approximate(size) => size.into(),
+            Self::Exact(size) => size.into(),
+        }
+    }
+
+    pub(crate) fn accuracy(&self) -> Accuracy {
+        match self {
+            Self::Approximate(_) => Accuracy::Approximate,
+            Self::Exact(_) => Accuracy::Exact,
         }
     }
 }
@@ -78,36 +116,42 @@ impl CurrentLogicalSize {
 impl LogicalSize {
     pub(super) fn empty_initial() -> Self {
         Self {
-            initial_logical_size: OnceCell::with_value(0),
-            // initial_logical_size already computed, so, don't admit any calculations
-            initial_size_computation: Arc::new(Semaphore::new(0)),
+            initial_logical_size: OnceCell::with_value((0, {
+                crate::metrics::initial_logical_size::START_CALCULATION
+                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
+                    .calculation_result_saved()
+            })),
+            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
             initial_part_end: None,
             size_added_after_initial: AtomicI64::new(0),
+            did_return_approximate_to_walreceiver: AtomicBool::new(false),
         }
     }

     pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
         Self {
             initial_logical_size: OnceCell::new(),
-            initial_size_computation: Arc::new(Semaphore::new(1)),
+            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
             initial_part_end: Some(compute_to),
             size_added_after_initial: AtomicI64::new(0),
+            did_return_approximate_to_walreceiver: AtomicBool::new(false),
         }
     }

-    pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
+    pub(super) fn current_size(&self) -> CurrentLogicalSize {
         let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
         //                  ^^^ keep this type explicit so that the casts in this function break if
         //                      we change the type.
         match self.initial_logical_size.get() {
-            Some(initial_size) => {
-                initial_size.checked_add_signed(size_increment)
+            Some((initial_size, _)) => {
+                CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
                     .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .map(CurrentLogicalSize::Exact)
+                    .unwrap()))
             }
             None => {
                 let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
+                CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment))
             }
         }
     }
@@ -121,7 +165,7 @@ impl LogicalSize {
     /// available for re-use. This doesn't contain the incremental part.
     pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
         match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
+            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
            _ => None,
         }
     }
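
Aside: to see the new `Approximate`/`Exact` wrapper API in isolation, here is a standalone reproduction of the pattern from the hunk above. It re-declares simplified versions of the types (without the pageserver's visibility modifiers, `From` impls, and metrics guard) purely so the accessor behavior can be exercised outside the crate.

```rust
// Newtype wrappers force callers to acknowledge whether a size is exact or approximate.
#[derive(Debug, Clone, Copy)]
struct Approximate(u64);
#[derive(Debug, Clone, Copy)]
struct Exact(u64);

#[derive(Debug, Clone, Copy)]
enum CurrentLogicalSize {
    Approximate(Approximate),
    Exact(Exact),
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Accuracy {
    Approximate,
    Exact,
}

impl CurrentLogicalSize {
    // The long name is deliberate: callers must visibly opt in to ignoring accuracy.
    fn size_dont_care_about_accuracy(&self) -> u64 {
        match self {
            Self::Approximate(Approximate(v)) | Self::Exact(Exact(v)) => *v,
        }
    }

    fn accuracy(&self) -> Accuracy {
        match self {
            Self::Approximate(_) => Accuracy::Approximate,
            Self::Exact(_) => Accuracy::Exact,
        }
    }
}

fn main() {
    let size = CurrentLogicalSize::Approximate(Approximate(42));
    assert_eq!(size.size_dont_care_about_accuracy(), 42);
    assert_eq!(size.accuracy(), Accuracy::Approximate);

    let exact = CurrentLogicalSize::Exact(Exact(1024));
    assert_eq!(exact.accuracy(), Accuracy::Exact);
}
```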
@@ -43,37 +43,52 @@ impl<'t> UninitializedTimeline<'t> {
     /// The caller is responsible for activating the timeline (function `.activate()`).
     pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
         let timeline_id = self.timeline_id;
-        let tenant_id = self.owning_tenant.tenant_id;
+        let tenant_shard_id = self.owning_tenant.tenant_shard_id;

-        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
-            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
-        })?;
+        if self.raw_timeline.is_none() {
+            return Err(anyhow::anyhow!(
+                "No timeline for initialization found for {tenant_shard_id}/{timeline_id}"
+            ));
+        }

         // Check that the caller initialized disk_consistent_lsn
-        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
+        let new_disk_consistent_lsn = self
+            .raw_timeline
+            .as_ref()
+            .expect("checked above")
+            .0
+            .get_disk_consistent_lsn();

         anyhow::ensure!(
             new_disk_consistent_lsn.is_valid(),
-            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
+            "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
         );

         let mut timelines = self.owning_tenant.timelines.lock().unwrap();
         match timelines.entry(timeline_id) {
             Entry::Occupied(_) => anyhow::bail!(
-                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
+                "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map"
             ),
             Entry::Vacant(v) => {
+                // after taking here should be no fallible operations, because the drop guard will not
+                // cleanup after and would block for example the tenant deletion
+                let (new_timeline, uninit_mark) =
+                    self.raw_timeline.take().expect("already checked");
+
+                // this is the mutual exclusion between different retries to create the timeline;
+                // this should be an assertion.
                 uninit_mark.remove_uninit_mark().with_context(|| {
                     format!(
-                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
+                        "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}"
                     )
                 })?;
                 v.insert(Arc::clone(&new_timeline));

                 new_timeline.maybe_spawn_flush_loop();
+
+                Ok(new_timeline)
             }
         }
-
-        Ok(new_timeline)
     }

     /// Prepares timeline data by loading it from the basebackup archive.
@@ -119,7 +134,7 @@ impl<'t> UninitializedTimeline<'t> {
             .with_context(|| {
                 format!(
                     "No raw timeline {}/{} found",
-                    self.owning_tenant.tenant_id, self.timeline_id
+                    self.owning_tenant.tenant_shard_id, self.timeline_id
                 )
             })?
             .0)
@@ -129,7 +144,7 @@ impl<'t> UninitializedTimeline<'t> {
 impl Drop for UninitializedTimeline<'_> {
     fn drop(&mut self) {
         if let Some((_, uninit_mark)) = self.raw_timeline.take() {
-            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered();
+            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
             error!("Timeline got dropped without initializing, cleaning its files");
             cleanup_timeline_directory(uninit_mark);
         }
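
Aside: `finish_creation` above now defers `raw_timeline.take()` until every fallible check has passed, so the `Drop` impl can still clean up the on-disk files if an earlier step bails out. A stripped-down sketch of that take-late pattern follows; `Guarded`, `finish`, and the resource string are invented names for the illustration, not pageserver code.

```rust
// `Guarded` stands in for the guarded value: the resource stays in the Option
// until all fallible steps pass, so Drop can clean up on early returns.
struct Guarded {
    resource: Option<String>,
}

impl Guarded {
    fn finish(mut self) -> Result<String, String> {
        if self.resource.is_none() {
            return Err("nothing to finish".into());
        }
        // ...further fallible checks would run here while `resource` is still held...
        let resource = self.resource.take().expect("checked above");
        Ok(resource)
    }
}

impl Drop for Guarded {
    fn drop(&mut self) {
        // Only fires if `finish` bailed out before taking the resource.
        if let Some(leftover) = self.resource.take() {
            eprintln!("cleaning up abandoned resource: {leftover}");
        }
    }
}

fn main() {
    let ok = Guarded { resource: Some("timeline-files".to_string()) };
    assert_eq!(ok.finish().unwrap(), "timeline-files");

    let empty = Guarded { resource: None };
    assert!(empty.finish().is_err());
}
```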
@@ -30,6 +30,7 @@ use crate::tenant::timeline::walreceiver::connection_manager::{
     connection_manager_loop_step, ConnectionManagerState,
 };

+use pageserver_api::shard::TenantShardId;
 use std::future::Future;
 use std::num::NonZeroU64;
 use std::ops::ControlFlow;
@@ -41,7 +42,7 @@ use tokio::sync::watch;
 use tokio_util::sync::CancellationToken;
 use tracing::*;

-use utils::id::TenantTimelineId;
+use utils::id::TimelineId;

 use self::connection_manager::ConnectionManagerStatus;

@@ -60,7 +61,8 @@ pub struct WalReceiverConf {
 }

 pub struct WalReceiver {
-    timeline: TenantTimelineId,
+    tenant_shard_id: TenantShardId,
+    timeline_id: TimelineId,
     manager_status: Arc<std::sync::RwLock<Option<ConnectionManagerStatus>>>,
 }

@@ -71,7 +73,7 @@ impl WalReceiver {
         mut broker_client: BrokerClientChannel,
         ctx: &RequestContext,
     ) -> Self {
-        let tenant_id = timeline.tenant_id;
+        let tenant_shard_id = timeline.tenant_shard_id;
         let timeline_id = timeline.timeline_id;
         let walreceiver_ctx =
             ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
@@ -81,10 +83,11 @@ impl WalReceiver {
         task_mgr::spawn(
             WALRECEIVER_RUNTIME.handle(),
             TaskKind::WalReceiverManager,
-            Some(tenant_id),
+            Some(timeline.tenant_shard_id),
             Some(timeline_id),
-            &format!("walreceiver for timeline {tenant_id}/{timeline_id}"),
+            &format!("walreceiver for timeline {tenant_shard_id}/{timeline_id}"),
             false,
+            timeline.cancel.child_token(),
             async move {
                 debug_assert_current_span_has_tenant_and_timeline_id();
                 debug!("WAL receiver manager started, connecting to broker");
@@ -117,11 +120,12 @@ impl WalReceiver {
                 *loop_status.write().unwrap() = None;
                 Ok(())
             }
-            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_id, timeline_id = %timeline_id))
+            .instrument(info_span!(parent: None, "wal_connection_manager", tenant_id = %tenant_shard_id.tenant_id, shard_id = %tenant_shard_id.shard_slug(), timeline_id = %timeline_id))
         );

         Self {
-            timeline: TenantTimelineId::new(tenant_id, timeline_id),
+            tenant_shard_id,
+            timeline_id,
             manager_status,
         }
     }
@@ -129,8 +133,8 @@ impl WalReceiver {
     pub async fn stop(self) {
         task_mgr::shutdown_tasks(
             Some(TaskKind::WalReceiverManager),
-            Some(self.timeline.tenant_id),
-            Some(self.timeline.timeline_id),
+            Some(self.tenant_shard_id),
+            Some(self.timeline_id),
         )
         .await;
     }
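
Aside: the extra `timeline.cancel.child_token()` argument to `task_mgr::spawn` wires the walreceiver task into the timeline's cancellation tree. The standalone demo below shows the propagation behavior of `tokio_util`'s child tokens that this relies on (assuming the `tokio` and `tokio-util` crates are available); the task body and message are placeholders, not the actual walreceiver logic.

```rust
use tokio_util::sync::CancellationToken;

#[tokio::main]
async fn main() {
    let timeline_cancel = CancellationToken::new(); // parent: the timeline's token in the real code
    let task_token = timeline_cancel.child_token(); // what gets handed to the spawned task

    let task = tokio::spawn(async move {
        // The spawned task only ever watches its own child token.
        task_token.cancelled().await;
        "walreceiver manager observed shutdown"
    });

    // Cancelling the parent propagates to all child tokens and unblocks the task.
    timeline_cancel.cancel();
    assert_eq!(task.await.unwrap(), "walreceiver manager observed shutdown");
}
```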
@@ -75,7 +75,7 @@ pub(super) async fn connection_manager_loop_step(
     }

     let id = TenantTimelineId {
-        tenant_id: connection_manager_state.timeline.tenant_id,
+        tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
         timeline_id: connection_manager_state.timeline.timeline_id,
     };

@@ -388,7 +388,7 @@ struct BrokerSkTimeline {
 impl ConnectionManagerState {
     pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
         let id = TenantTimelineId {
-            tenant_id: timeline.tenant_id,
+            tenant_id: timeline.tenant_shard_id.tenant_id,
             timeline_id: timeline.timeline_id,
         };
         Self {
@@ -163,10 +163,11 @@ pub(super) async fn handle_walreceiver_connection(
     task_mgr::spawn(
         WALRECEIVER_RUNTIME.handle(),
         TaskKind::WalReceiverConnectionPoller,
-        Some(timeline.tenant_id),
+        Some(timeline.tenant_shard_id),
         Some(timeline.timeline_id),
         "walreceiver connection",
         false,
+        cancellation.clone(),
         async move {
             debug_assert_current_span_has_tenant_and_timeline_id();

@@ -396,11 +397,15 @@ pub(super) async fn handle_walreceiver_connection(

     // Send the replication feedback message.
     // Regular standby_status_update fields are put into this message.
-    let (timeline_logical_size, _) = timeline
-        .get_current_logical_size(&ctx)
-        .context("Status update creation failed to get current logical size")?;
+    let current_timeline_size = timeline
+        .get_current_logical_size(
+            crate::tenant::timeline::GetLogicalSizePriority::User,
+            &ctx,
+        )
+        // FIXME: https://github.com/neondatabase/neon/issues/5963
+        .size_dont_care_about_accuracy();
     let status_update = PageserverFeedback {
-        current_timeline_size: timeline_logical_size,
+        current_timeline_size,
         last_received_lsn,
         disk_consistent_lsn,
         remote_consistent_lsn,
Some files were not shown because too many files have changed in this diff.