Switch lwlock to pthread_rwlock_t

fmt
Don't disconnect if pagerserver didn't change
2026-05-17 05:00:38 +00:00 · 2023-12-05 11:54:13 -08:00 · 2023-12-05 11:54:13 -08:00 · 2023-12-05 11:54:13 -08:00 · 2023-12-05 18:49:24 +02:00 · 2023-12-05 16:11:15 +00:00
160 changed files with 6847 additions and 3317 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -404,7 +404,7 @@ jobs:
        uses: ./.github/actions/save-coverage-data

  regress-tests:
-    needs: [ check-permissions, build-neon ]
+    needs: [ check-permissions, build-neon, tag ]
    runs-on: [ self-hosted, gen3, large ]
    container:
      image: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/rust:pinned
@@ -436,6 +436,7 @@ jobs:
        env:
          TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR_NEW }}
          CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
+          BUILD_TAG: ${{ needs.tag.outputs.build-tag }}

      - name: Merge and upload coverage data
        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
--- a/Cargo.lock
+++ b/Cargo.lock
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -27,7 +27,6 @@ members = [
    "libs/postgres_ffi/wal_craft",
    "libs/vm_monitor",
    "libs/walproposer",
-    "libs/nostarve_queue",
 ]

 [workspace.package]
@@ -38,7 +37,6 @@ license = "Apache-2.0"
 [workspace.dependencies]
 anyhow = { version = "1.0", features = ["backtrace"] }
 arc-swap = "1.6"
-async-channel = "1.9.0"
 async-compression = { version = "0.4.0", features = ["tokio", "gzip", "zstd"] }
 azure_core = "0.16"
 azure_identity = "0.16"
@@ -47,12 +45,11 @@ azure_storage_blobs = "0.16"
 flate2 = "1.0.26"
 async-stream = "0.3"
 async-trait = "0.1"
-aws-config = { version = "0.56", default-features = false, features=["rustls"] }
-aws-sdk-s3 = "0.29"
-aws-smithy-http = "0.56"
-aws-smithy-async = { version = "0.56", default-features = false, features=["rt-tokio"] }
-aws-credential-types = "0.56"
-aws-types = "0.56"
+aws-config = { version = "1.0", default-features = false, features=["rustls"] }
+aws-sdk-s3 = "1.0"
+aws-smithy-async = { version = "1.0", default-features = false, features=["rt-tokio"] }
+aws-smithy-types = "1.0"
+aws-credential-types = "1.0"
 axum = { version = "0.6.20", features = ["ws"] }
 base64 = "0.13.0"
 bincode = "1.3"
@@ -91,6 +88,7 @@ humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.11"
 inotify = "0.10.2"
+ipnet = "2.9.0"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
@@ -124,14 +122,17 @@ rustls-pemfile = "1"
 rustls-split = "0.3"
 scopeguard = "1.1"
 sysinfo = "0.29.2"
+sd-notify = "0.4.1"
 sentry = { version = "0.31", default-features = false, features = ["backtrace", "contexts", "panic", "rustls", "reqwest" ] }
 serde = { version = "1.0", features = ["derive"] }
 serde_json = "1"
+serde_path_to_error = "0.1"
 serde_with = "2.0"
 serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
+smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
@@ -193,7 +194,6 @@ tracing-utils = { version = "0.1", path = "./libs/tracing-utils/" }
 utils = { version = "0.1", path = "./libs/utils/" }
 vm_monitor = { version = "0.1", path = "./libs/vm_monitor/" }
 walproposer = { version = "0.1", path = "./libs/walproposer/" }
-nostarve_queue = { path = "./libs/nostarve_queue" }

 ## Common library dependency
 workspace_hack = { version = "0.1", path = "./workspace_hack/" }
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -387,18 +387,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        export TIMESCALEDB_VERSION=2.10.1 \
-        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
-        ;; \
-      *) \
-        echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
-    esac && \
-    apt-get update && \
+RUN apt-get update && \
    apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
-    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \
+    echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
@@ -714,6 +706,23 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.3.tar.gz -
    cargo pgrx install --release && \
    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control

+#########################################################################################
+#
+# Layer "wal2json-build"
+# Compile "wal2json" extension
+#
+#########################################################################################
+
+FROM build-deps AS wal2json-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+ENV PATH "/usr/local/pgsql/bin/:$PATH"
+RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.gz && \
+    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
+    mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
+    make -j $(getconf _NPROCESSORS_ONLN) && \
+    make -j $(getconf _NPROCESSORS_ONLN) install
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -750,6 +759,7 @@ COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=wal2json-pg-build /usr/local/pgsql /usr/local/pgsql
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
--- a/README.md
+++ b/README.md
@@ -149,6 +149,9 @@ tenant 9ef87a5bf0d92544f6fafeeb3239695c successfully created on the pageserver
 Created an initial timeline 'de200bd42b49cc1814412c7e592dd6e9' at Lsn 0/16B5A50 for tenant: 9ef87a5bf0d92544f6fafeeb3239695c
 Setting tenant 9ef87a5bf0d92544f6fafeeb3239695c as a default one

+# create postgres compute node
+> cargo neon endpoint create main
+
 # start postgres compute node
 > cargo neon endpoint start main
 Starting new endpoint main (PostgreSQL v14) on timeline de200bd42b49cc1814412c7e592dd6e9 ...
@@ -185,8 +188,11 @@ Created timeline 'b3b863fa45fa9e57e615f9f2d944e601' at Lsn 0/16F9A00 for tenant:
 (L) main [de200bd42b49cc1814412c7e592dd6e9]
 (L) ┗━ @0/16F9A00: migration_check [b3b863fa45fa9e57e615f9f2d944e601]

+# create postgres on that branch
+> cargo neon endpoint create migration_check --branch-name migration_check
+
 # start postgres on that branch
-> cargo neon endpoint start migration_check --branch-name migration_check
+> cargo neon endpoint start migration_check
 Starting new endpoint migration_check (PostgreSQL v14) on timeline b3b863fa45fa9e57e615f9f2d944e601 ...
 Starting postgres at 'postgresql://cloud_admin@127.0.0.1:55434/postgres'

--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -274,7 +274,13 @@ fn main() -> Result<()> {
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
-            drop(state);
+            // Notify others that Postgres failed to start. In case of configuring the
+            // empty compute, it's likely that API handler is still waiting for compute
+            // state change. With this we will notify it that compute is in Failed state,
+            // so control plane will know about it earlier and record proper error instead
+            // of timeout.
+            compute.state_changed.notify_all();
+            drop(state); // unlock
            delay_exit = true;
            None
        }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -22,7 +22,7 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

 use remote_storage::{DownloadError, RemotePath};
@@ -277,6 +277,17 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }

 impl ComputeNode {
+    /// Check that compute node has corresponding feature enabled.
+    pub fn has_feature(&self, feature: ComputeFeature) -> bool {
+        let state = self.state.lock().unwrap();
+
+        if let Some(s) = state.pspec.as_ref() {
+            s.spec.features.contains(&feature)
+        } else {
+            false
+        }
+    }
+
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
        state.status = status;
@@ -728,7 +739,12 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
+        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
+        config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
+        // temporarily reset max_cluster_size in config
+        // to avoid the possibility of hitting the limit, while we are reconfiguring:
+        // creating new extensions, roles, etc...
+        config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
@@ -749,6 +765,10 @@ impl ComputeNode {
        // 'Close' connection
        drop(client);

+        // reset max_cluster_size in config back to original value and reload config
+        config::compute_ctl_temp_override_remove(pgdata_path)?;
+        self.pg_reload_conf()?;
+
        let unknown_op = "unknown".to_string();
        let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
        info!(
@@ -809,7 +829,17 @@ impl ComputeNode {

        let config_time = Utc::now();
        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
+            let pgdata_path = Path::new(&self.pgdata);
+            // temporarily reset max_cluster_size in config
+            // to avoid the possibility of hitting the limit, while we are applying config:
+            // creating new extensions, roles, etc...
+            config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+            self.pg_reload_conf()?;
+
            self.apply_config(&compute_state)?;
+
+            config::compute_ctl_temp_override_remove(pgdata_path)?;
+            self.pg_reload_conf()?;
        }

        let startup_end_time = Utc::now();
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -93,5 +93,25 @@ pub fn write_postgres_conf(
        writeln!(file, "neon.extension_server_port={}", port)?;
    }

+    // This is essential to keep this line at the end of the file,
+    // because it is intended to override any settings above.
+    writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
+
+    Ok(())
+}
+
+/// create file compute_ctl_temp_override.conf in pgdata_dir
+/// add provided options to this file
+pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
+    let path = pgdata_path.join("compute_ctl_temp_override.conf");
+    let mut file = File::create(path)?;
+    write!(file, "{}", options)?;
+    Ok(())
+}
+
+/// remove file compute_ctl_temp_override.conf in pgdata_dir
+pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
+    let path = pgdata_path.join("compute_ctl_temp_override.conf");
+    std::fs::remove_file(path)?;
    Ok(())
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -227,7 +227,7 @@ async fn handle_configure_request(

        let parsed_spec = match ParsedSpec::try_from(spec) {
            Ok(ps) => ps,
-            Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
+            Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
        };

        // XXX: wrap state update under lock in code blocks. Otherwise,
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -156,17 +156,17 @@ paths:
                description: Error text or 'OK' if download succeeded.
                example: "OK"
        400:
-        description: Request is invalid.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
        500:
-        description: Extension download request failed.
-        content:
-          application/json:
-            schema:
-              $ref: "#/components/schemas/GenericError"
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -118,19 +118,6 @@ pub fn get_spec_from_control_plane(
    spec
 }

-/// It takes cluster specification and does the following:
-/// - Serialize cluster config and put it into `postgresql.conf` completely rewriting the file.
-/// - Update `pg_hba.conf` to allow external connections.
-pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
-    // File `postgresql.conf` is no longer included into `basebackup`, so just
-    // always write all config into it creating new file.
-    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
-
-    update_pg_hba(pgdata_path)?;
-
-    Ok(())
-}
-
 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
    // XXX: consider making it a part of spec.json
@@ -687,6 +674,9 @@ pub fn handle_extension_neon(client: &mut Client) -> Result<()> {
    info!("create neon extension with query: {}", query);
    client.simple_query(query)?;

+    query = "UPDATE pg_extension SET extrelocatable = true WHERE extname = 'neon'";
+    client.simple_query(query)?;
+
    query = "ALTER EXTENSION neon SET SCHEMA neon";
    info!("alter neon extension schema with query: {}", query);
    client.simple_query(query)?;
--- a/control_plane/src/bin/attachment_service.rs
+++ b/control_plane/src/bin/attachment_service.rs
@@ -9,6 +9,7 @@ use clap::Parser;
 use hex::FromHex;
 use hyper::StatusCode;
 use hyper::{Body, Request, Response};
+use pageserver_api::shard::TenantShardId;
 use serde::{Deserialize, Serialize};
 use std::path::{Path, PathBuf};
 use std::{collections::HashMap, sync::Arc};
@@ -173,7 +174,8 @@ async fn handle_re_attach(mut req: Request<Body>) -> Result<Response<Body>, ApiE
        if state.pageserver == Some(reattach_req.node_id) {
            state.generation += 1;
            response.tenants.push(ReAttachResponseTenant {
-                id: *t,
+                // TODO(sharding): make this shard-aware
+                id: TenantShardId::unsharded(*t),
                gen: state.generation,
            });
        }
@@ -196,7 +198,8 @@ async fn handle_validate(mut req: Request<Body>) -> Result<Response<Body>, ApiEr
    };

    for req_tenant in validate_req.tenants {
-        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id) {
+        // TODO(sharding): make this shard-aware
+        if let Some(tenant_state) = locked.tenants.get(&req_tenant.id.tenant_id) {
            let valid = tenant_state.generation == req_tenant.gen;
            response.tenants.push(ValidateResponseTenant {
                id: req_tenant.id,
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -415,6 +415,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                None,
                None,
                Some(pg_version),
+                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;
            let last_record_lsn = timeline_info.last_record_lsn;
@@ -495,6 +496,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                None,
                None,
                Some(pg_version),
+                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;

@@ -582,6 +584,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                start_lsn,
                Some(ancestor_timeline_id),
                None,
+                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;

@@ -608,11 +611,9 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
    };
    let mut cplane = ComputeControlPlane::load(env.clone())?;

-    // All subcommands take an optional --tenant-id option
-    let tenant_id = get_tenant_id(sub_args, env)?;
-
    match sub_name {
        "list" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let timeline_infos = get_timeline_infos(env, &tenant_id).unwrap_or_else(|e| {
                eprintln!("Failed to load timeline info: {}", e);
                HashMap::new()
@@ -672,6 +673,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            println!("{table}");
        }
        "create" => {
+            let tenant_id = get_tenant_id(sub_args, env)?;
            let branch_name = sub_args
                .get_one::<String>("branch-name")
                .map(|s| s.as_str())
@@ -716,6 +718,18 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
            };

+            match (mode, hot_standby) {
+                (ComputeMode::Static(_), true) => {
+                    bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
+                }
+                (ComputeMode::Primary, true) => {
+                    bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
+                }
+                _ => {}
+            }
+
+            cplane.check_conflicting_endpoints(mode, tenant_id, timeline_id)?;
+
            cplane.new_endpoint(
                &endpoint_id,
                tenant_id,
@@ -728,8 +742,6 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
            )?;
        }
        "start" => {
-            let pg_port: Option<u16> = sub_args.get_one::<u16>("pg-port").copied();
-            let http_port: Option<u16> = sub_args.get_one::<u16>("http-port").copied();
            let endpoint_id = sub_args
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;
@@ -758,80 +770,28 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                    env.safekeepers.iter().map(|sk| sk.id).collect()
                };

-            let endpoint = cplane.endpoints.get(endpoint_id.as_str());
+            let endpoint = cplane
+                .endpoints
+                .get(endpoint_id.as_str())
+                .ok_or_else(|| anyhow::anyhow!("endpoint {endpoint_id} not found"))?;
+
+            cplane.check_conflicting_endpoints(
+                endpoint.mode,
+                endpoint.tenant_id,
+                endpoint.timeline_id,
+            )?;

            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
-                let claims = Claims::new(Some(tenant_id), Scope::Tenant);
+                let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);

                Some(env.generate_auth_token(&claims)?)
            } else {
                None
            };

-            let hot_standby = sub_args
-                .get_one::<bool>("hot-standby")
-                .copied()
-                .unwrap_or(false);
-
-            if let Some(endpoint) = endpoint {
-                match (&endpoint.mode, hot_standby) {
-                    (ComputeMode::Static(_), true) => {
-                        bail!("Cannot start a node in hot standby mode when it is already configured as a static replica")
-                    }
-                    (ComputeMode::Primary, true) => {
-                        bail!("Cannot start a node as a hot standby replica, it is already configured as primary node")
-                    }
-                    _ => {}
-                }
-                println!("Starting existing endpoint {endpoint_id}...");
-                endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
-            } else {
-                let branch_name = sub_args
-                    .get_one::<String>("branch-name")
-                    .map(|s| s.as_str())
-                    .unwrap_or(DEFAULT_BRANCH_NAME);
-                let timeline_id = env
-                    .get_branch_timeline_id(branch_name, tenant_id)
-                    .ok_or_else(|| {
-                        anyhow!("Found no timeline id for branch name '{branch_name}'")
-                    })?;
-                let lsn = sub_args
-                    .get_one::<String>("lsn")
-                    .map(|lsn_str| Lsn::from_str(lsn_str))
-                    .transpose()
-                    .context("Failed to parse Lsn from the request")?;
-                let pg_version = sub_args
-                    .get_one::<u32>("pg-version")
-                    .copied()
-                    .context("Failed to `pg-version` from the argument string")?;
-
-                let mode = match (lsn, hot_standby) {
-                    (Some(lsn), false) => ComputeMode::Static(lsn),
-                    (None, true) => ComputeMode::Replica,
-                    (None, false) => ComputeMode::Primary,
-                    (Some(_), true) => anyhow::bail!("cannot specify both lsn and hot-standby"),
-                };
-
-                // when used with custom port this results in non obvious behaviour
-                // port is remembered from first start command, i e
-                // start --port X
-                // stop
-                // start <-- will also use port X even without explicit port argument
-                println!("Starting new endpoint {endpoint_id} (PostgreSQL v{pg_version}) on timeline {timeline_id} ...");
-
-                let ep = cplane.new_endpoint(
-                    endpoint_id,
-                    tenant_id,
-                    timeline_id,
-                    pg_port,
-                    http_port,
-                    pg_version,
-                    mode,
-                    pageserver_id,
-                )?;
-                ep.start(&auth_token, safekeepers, remote_ext_config)?;
-            }
+            println!("Starting existing endpoint {endpoint_id}...");
+            endpoint.start(&auth_token, safekeepers, remote_ext_config)?;
        }
        "reconfigure" => {
            let endpoint_id = sub_args
@@ -1437,15 +1397,7 @@ fn cli() -> Command {
                .subcommand(Command::new("start")
                    .about("Start postgres.\n If the endpoint doesn't exist yet, it is created.")
                    .arg(endpoint_id_arg.clone())
-                    .arg(tenant_id_arg.clone())
-                    .arg(branch_name_arg.clone())
-                    .arg(timeline_id_arg.clone())
-                    .arg(lsn_arg)
-                    .arg(pg_port_arg)
-                    .arg(http_port_arg)
                    .arg(endpoint_pageserver_id_arg.clone())
-                    .arg(pg_version_arg)
-                    .arg(hot_standby_arg)
                    .arg(safekeepers_arg)
                    .arg(remote_ext_config_args)
                )
@@ -1458,7 +1410,6 @@ fn cli() -> Command {
                .subcommand(
                    Command::new("stop")
                    .arg(endpoint_id_arg)
-                    .arg(tenant_id_arg.clone())
                    .arg(
                        Arg::new("destroy")
                            .help("Also delete data directory (now optional, should be default in future)")
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -125,6 +125,7 @@ impl ComputeControlPlane {
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
        let pageserver =
            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
+
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
@@ -169,6 +170,30 @@ impl ComputeControlPlane {

        Ok(ep)
    }
+
+    pub fn check_conflicting_endpoints(
+        &self,
+        mode: ComputeMode,
+        tenant_id: TenantId,
+        timeline_id: TimelineId,
+    ) -> Result<()> {
+        if matches!(mode, ComputeMode::Primary) {
+            // this check is not complete, as you could have a concurrent attempt at
+            // creating another primary, both reading the state before checking it here,
+            // but it's better than nothing.
+            let mut duplicates = self.endpoints.iter().filter(|(_k, v)| {
+                v.tenant_id == tenant_id
+                    && v.timeline_id == timeline_id
+                    && v.mode == mode
+                    && v.status() != "stopped"
+            });
+
+            if let Some((key, _)) = duplicates.next() {
+                bail!("attempting to create a duplicate primary endpoint on tenant {tenant_id}, timeline {timeline_id}: endpoint {key:?} exists already. please don't do this, it is not supported.");
+            }
+        }
+        Ok(())
+    }
 }

 ///////////////////////////////////////////////////////////////////////////////
@@ -494,6 +519,7 @@ impl Endpoint {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
+            features: vec![],
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -11,6 +11,7 @@ use std::io::{BufReader, Write};
 use std::num::NonZeroU64;
 use std::path::PathBuf;
 use std::process::{Child, Command};
+use std::time::Duration;
 use std::{io, result};

 use anyhow::{bail, Context};
@@ -21,7 +22,7 @@ use pageserver_api::models::{
 use pageserver_api::shard::TenantShardId;
 use postgres_backend::AuthType;
 use postgres_connection::{parse_host_port, PgConnectionConfig};
-use reqwest::blocking::{Client, ClientBuilder, RequestBuilder, Response};
+use reqwest::blocking::{Client, RequestBuilder, Response};
 use reqwest::{IntoUrl, Method};
 use thiserror::Error;
 use utils::auth::{Claims, Scope};
@@ -99,7 +100,7 @@ impl PageServerNode {
            pg_connection_config: PgConnectionConfig::new_host_port(host, port),
            conf: conf.clone(),
            env: env.clone(),
-            http_client: ClientBuilder::new().timeout(None).build().unwrap(),
+            http_client: Client::new(),
            http_base_url: format!("http://{}/v1", conf.listen_http_addr),
        }
    }
@@ -522,19 +523,24 @@ impl PageServerNode {
        &self,
        tenant_id: TenantId,
        config: LocationConfig,
+        flush_ms: Option<Duration>,
    ) -> anyhow::Result<()> {
        let req_body = TenantLocationConfigRequest { tenant_id, config };

-        self.http_request(
-            Method::PUT,
-            format!(
-                "{}/tenant/{}/location_config",
-                self.http_base_url, tenant_id
-            ),
-        )?
-        .json(&req_body)
-        .send()?
-        .error_from_body()?;
+        let path = format!(
+            "{}/tenant/{}/location_config",
+            self.http_base_url, tenant_id
+        );
+        let path = if let Some(flush_ms) = flush_ms {
+            format!("{}?flush_ms={}", path, flush_ms.as_millis())
+        } else {
+            path
+        };
+
+        self.http_request(Method::PUT, path)?
+            .json(&req_body)
+            .send()?
+            .error_from_body()?;

        Ok(())
    }
@@ -559,6 +565,7 @@ impl PageServerNode {
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
+        existing_initdb_timeline_id: Option<TimelineId>,
    ) -> anyhow::Result<TimelineInfo> {
        // If timeline ID was not specified, generate one
        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
@@ -572,6 +579,7 @@ impl PageServerNode {
            ancestor_start_lsn,
            ancestor_timeline_id,
            pg_version,
+            existing_initdb_timeline_id,
        })
        .send()?
        .error_from_body()?
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -14,7 +14,6 @@ use pageserver_api::models::{
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
-    generation::Generation,
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };
@@ -93,6 +92,22 @@ pub fn migrate_tenant(
    // Get a new generation
    let attachment_service = AttachmentService::from_env(env);

+    fn build_location_config(
+        mode: LocationConfigMode,
+        generation: Option<u32>,
+        secondary_conf: Option<LocationConfigSecondary>,
+    ) -> LocationConfig {
+        LocationConfig {
+            mode,
+            generation,
+            secondary_conf,
+            tenant_conf: TenantConfig::default(),
+            shard_number: 0,
+            shard_count: 0,
+            shard_stripe_size: 0,
+        }
+    }
+
    let previous = attachment_service.inspect(tenant_id)?;
    let mut baseline_lsns = None;
    if let Some((generation, origin_ps_id)) = &previous {
@@ -101,40 +116,26 @@ pub fn migrate_tenant(
        if origin_ps_id == &dest_ps.conf.id {
            println!("🔁 Already attached to {origin_ps_id}, freshening...");
            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
-            let dest_conf = LocationConfig {
-                mode: LocationConfigMode::AttachedSingle,
-                generation: gen.map(Generation::new),
-                secondary_conf: None,
-                tenant_conf: TenantConfig::default(),
-            };
-            dest_ps.location_config(tenant_id, dest_conf)?;
+            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+            dest_ps.location_config(tenant_id, dest_conf, None)?;
            println!("✅ Migration complete");
            return Ok(());
        }

        println!("🔁 Switching origin pageserver {origin_ps_id} to stale mode");

-        let stale_conf = LocationConfig {
-            mode: LocationConfigMode::AttachedStale,
-            generation: Some(Generation::new(*generation)),
-            secondary_conf: None,
-            tenant_conf: TenantConfig::default(),
-        };
-        origin_ps.location_config(tenant_id, stale_conf)?;
+        let stale_conf =
+            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
+        origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;

        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
    }

    let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
-    let dest_conf = LocationConfig {
-        mode: LocationConfigMode::AttachedMulti,
-        generation: gen.map(Generation::new),
-        secondary_conf: None,
-        tenant_conf: TenantConfig::default(),
-    };
+    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);

    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
-    dest_ps.location_config(tenant_id, dest_conf)?;
+    dest_ps.location_config(tenant_id, dest_conf, None)?;

    if let Some(baseline) = baseline_lsns {
        println!("🕑 Waiting for LSN to catch up...");
@@ -170,31 +171,25 @@ pub fn migrate_tenant(
        }

        // Downgrade to a secondary location
-        let secondary_conf = LocationConfig {
-            mode: LocationConfigMode::Secondary,
-            generation: None,
-            secondary_conf: Some(LocationConfigSecondary { warm: true }),
-            tenant_conf: TenantConfig::default(),
-        };
+        let secondary_conf = build_location_config(
+            LocationConfigMode::Secondary,
+            None,
+            Some(LocationConfigSecondary { warm: true }),
+        );

        println!(
            "💤 Switching to secondary mode on pageserver {}",
            other_ps.conf.id
        );
-        other_ps.location_config(tenant_id, secondary_conf)?;
+        other_ps.location_config(tenant_id, secondary_conf, None)?;
    }

    println!(
        "🔁 Switching to AttachedSingle mode on pageserver {}",
        dest_ps.conf.id
    );
-    let dest_conf = LocationConfig {
-        mode: LocationConfigMode::AttachedSingle,
-        generation: gen.map(Generation::new),
-        secondary_conf: None,
-        tenant_conf: TenantConfig::default(),
-    };
-    dest_ps.location_config(tenant_id, dest_conf)?;
+    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
+    dest_ps.location_config(tenant_id, dest_conf, None)?;

    println!("✅ Migration complete");

--- a/docs/rfcs/029-pageserver-wal-disaster-recovery.md
+++ b/docs/rfcs/029-pageserver-wal-disaster-recovery.md
@@ -0,0 +1,205 @@
+# Name
+
+Created on: 2023-09-08
+Author: Arpad Müller
+
+## Summary
+
+Enable the pageserver to recover from data corruption events by implementing
+a feature to re-apply historic WAL records in parallel to the already occurring
+WAL replay.
+
+The feature is outside of the user-visible backup and history story, and only
+serves as a second-level backup for the case that there is a bug in the
+pageservers that corrupted the served pages.
+
+The RFC proposes the addition of two new features:
+* recover a broken branch from WAL (downtime is allowed)
+* a test recovery system to recover random branches to make sure recovery works
+
+## Motivation
+
+The historic WAL is currently stored in S3 even after it has been replayed by
+the pageserver and thus been integrated into the pageserver's storage system.
+This is done to defend from data corruption failures inside the pageservers.
+
+However, application of this WAL in the disaster recovery setting is currently
+very manual and we want to automate this to make it easier.
+
+### Use cases
+
+There are various use cases for this feature, like:
+
+* The main motivation is replaying in the instance of pageservers corrupting
+  data.
+* We might want to, beyond the user-visible history features, through our
+  support channels and upon customer request, in select instances, recover
+  historic versions beyond the range of history that we officially support.
+* Running the recovery process in the background for random tenant timelines
+  to figure out if there was a corruption of data (we would compare with what
+  the pageserver stores for the "official" timeline).
+* Using the WAL to arrive at historic pages we can then back up to S3 so that
+  WAL itself can be discarded, or at least not used for future replays.
+  Again, this sounds a lot like what the pageserver is already doing, but the
+  point is to provide a fallback to the service provided by the pageserver.
+
+## Design
+
+### Design constraints
+
+The main design constraint is that the feature needs to be *simple* enough that
+the number of bugs are as low, and reliability as high as possible: the main
+goal of this endeavour is to achieve higher correctness than the pageserver.
+
+For the background process, we cannot afford a downtime of the timeline that is
+being cloned, as we don't want to restrict ourselves to offline tenants only.
+In the scenario where we want to recover from disasters or roll back to a
+historic lsn through support staff, downtimes are more affordable, and
+inevitable if the original had been subject to the corruption. Ideally, the
+two code paths would share code, so the solution would be designed for not
+requiring downtimes.
+
+### API endpoint changes
+
+This RFC proposes two API endpoint changes in the safekeeper and the
+pageserver.
+
+Remember, the pageserver timeline API creation endpoint is to this URL:
+
+```
+/v1/tenant/{tenant_id}/timeline/
+```
+
+Where `{tenant_id}` is the ID of the tenant the timeline is created for,
+and specified as part of the URL. The timeline ID is passed via the POST
+request body as the only required parameter `new_timeline_id`.
+
+This proposal adds one optional parameter called
+`existing_initdb_timeline_id` to the request's json body. If the parameter
+is not specified, behaviour should be as existing, so the pageserver runs
+initdb.
+If the parameter is specified, it is expected to point to a timeline ID.
+In fact that ID might match `new_timeline_id`, what's important is that
+S3 storage contains a matching initdb under the URL matching the given
+tenant and timeline.
+
+Having both `ancestor_timeline_id` and `existing_initdb_timeline_id`
+specified is illegal and will yield in an HTTP error. This feature is
+only meant for the "main" branch that doesn't have any ancestors
+of its own, as only here initdb is relevant.
+
+For the safekeeper, we propose the addition of the following copy endpoint:
+
+```
+/v1/tenant/{tenant_id}/timeline/{source_timeline_id}/copy
+```
+it is meant for POST requests with json, and the two URL parameters
+`tenant_id` and `source_timeline_id`. The json request body contains
+the two required parameters `target_timeline_id` and `until_lsn`.
+
+After invoking, the copy endpoint starts a copy process of the WAL from
+the source ID to the target ID. The lsn is updated according to the
+progress of the API call.
+
+### Higher level features
+
+We want the API changes to support the following higher level features:
+
+* recovery-after-corruption DR of the main timeline of a tenant. This
+  feature allows for downtime.
+* test DR of the main timeline into a special copy timeline. this feature
+  is meant to run against selected production tenants in the background,
+  without the user noticing, so it does not allow for downtime.
+
+The recovery-after-corruption DR only needs the pageserver changes.
+It works as follows:
+
+* delete the timeline from the pageservers via timeline deletion API
+* re-create it via timeline creation API (same ID as before) and set
+  `existing_initdb_timeline_id` to the same timeline ID
+
+The test DR requires also the copy primitive and works as follows:
+
+* copy the WAL of the timeline to a new place
+* create a new timeline for the tenant
+
+## Non Goals
+
+At the danger of being repetitive, the main goal of this feature is to be a
+backup method, so reliability is very important. This implies that other
+aspects like performance or space reduction are less important.
+
+### Corrupt WAL
+
+The process suggested by this RFC assumes that the WAL is free of corruption.
+In some instances, corruption can make it into WAL, like for example when
+higher level components like postgres or the application first read corrupt
+data, and then execute a write with data derived from that earlier read. That
+written data might then contain the corruption.
+
+Common use cases can hit this quite easily. For example, an application reads
+some counter, increments it, and then writes the new counter value to the
+database.
+On a lower level, the compute might put FPIs (Full Page Images) into the WAL,
+which have corrupt data for rows unrelated to the write operation at hand.
+
+Separating corrupt writes from non-corrupt ones is a hard problem in general,
+and if the application was involved in making the corrupt write, a recovery
+would also involve the application. Therefore, corruption that has made it into
+the WAL is outside of the scope of this feature. However, the WAL replay can be
+issued to right before the point in time where the corruption occured. Then the
+data loss is isolated to post-corruption writes only.
+
+## Impacted components (e.g. pageserver, safekeeper, console, etc)
+
+Most changes would happen to the pageservers.
+For the higher level features, maybe other components like the console would
+be involved.
+
+We need to make sure that the shadow timelines are not subject to the usual
+limits and billing we apply to existing timelines.
+
+## Proposed implementation
+
+The first problem to keep in mind is the reproducability of `initdb`.
+So an initial step would be to upload `initdb` snapshots to S3.
+
+After that, we'd have the endpoint spawn a background process which
+performs the replay of the WAL to that new timeline. This process should
+follow the existing workflows as closely as possible, just using the
+WAL records of a different timeline.
+
+The timeline created will be in a special state that solely looks for WAL
+entries of the timeline it is trying to copy. Once the target LSN is reached,
+it turns into a normal timeline that also accepts writes to its own
+timeline ID.
+
+### Scalability
+
+For now we want to run this entire process on a single node, and as
+it is by nature linear, it's hard to parallelize. However, for the
+verification workloads, we can easily start the WAL replay in parallel
+for different points in time. This is valuable especially for tenants
+with large WAL records.
+
+Compare this with the tricks to make addition circuits execute with
+lower latency by making them perform the addition for both possible
+values of the carry bit, and then, in a second step, taking the
+result for the carry bit that was actually obtained.
+
+The other scalability dimension to consider is the WAL length, which
+is a growing question as tenants accumulate changes. There are
+possible approaches to this, including creating snapshots of the
+page files and uploading them to S3, but if we do this for every single
+branch, we lose the cheap branching property.
+
+### Implementation by component
+
+The proposed changes for the various components of the neon architecture
+are written up in this notion page:
+
+https://www.notion.so/neondatabase/Pageserver-disaster-recovery-one-pager-4ecfb5df16ce4f6bbfc3817ed1a6cbb2
+
+### Unresolved questions
+
+none known (outside of the mentioned ones).
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -26,6 +26,13 @@ pub struct ComputeSpec {
    // but we don't use it for anything. Serde will ignore missing fields when
    // deserializing it.
    pub operation_uuid: Option<String>,
+
+    /// Compute features to enable. These feature flags are provided, when we
+    /// know all the details about client's compute, so they cannot be used
+    /// to change `Empty` compute behavior.
+    #[serde(default)]
+    pub features: Vec<ComputeFeature>,
+
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
@@ -68,6 +75,19 @@ pub struct ComputeSpec {
    pub remote_extensions: Option<RemoteExtSpec>,
 }

+/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
+#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
+#[serde(rename_all = "snake_case")]
+pub enum ComputeFeature {
+    // XXX: Add more feature flags here.
+
+    // This is a special feature flag that is used to represent unknown feature flags.
+    // Basically all unknown to enum flags are represented as this one. See unit test
+    // `parse_unknown_features()` for more details.
+    #[serde(other)]
+    UnknownFeature,
+}
+
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -229,7 +249,10 @@ mod tests {
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
+
+        // Features list defaults to empty vector.
+        assert!(spec.features.is_empty());
    }

    #[test]
@@ -241,4 +264,22 @@ mod tests {
        ob.insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
    }
+
+    #[test]
+    fn parse_unknown_features() {
+        // Test that unknown feature flags do not cause any errors.
+        let file = File::open("tests/cluster_spec.json").unwrap();
+        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
+        let ob = json.as_object_mut().unwrap();
+
+        // Add unknown feature flags.
+        let features = vec!["foo_bar_feature", "baz_feature"];
+        ob.insert("features".into(), features.into());
+
+        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
+
+        assert!(spec.features.len() == 2);
+        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
+        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
+    }
 }
--- a/libs/nostarve_queue/Cargo.toml
+++ b/libs/nostarve_queue/Cargo.toml
@@ -1,14 +0,0 @@
-[package]
-name = "nostarve_queue"
-version = "0.1.0"
-edition.workspace = true
-license.workspace = true
-
-[dependencies]
-scopeguard.workspace = true
-tracing.workspace = true
-
-[dev-dependencies]
-futures.workspace = true
-rand.workspace = true
-tokio = { workspace = true, features = ["rt", "rt-multi-thread", "time"] }
--- a/libs/nostarve_queue/src/lib.rs
+++ b/libs/nostarve_queue/src/lib.rs
@@ -1,316 +0,0 @@
-//! Synchronization primitive to prevent starvation among concurrent tasks that do the same work.
-
-use std::{
-    collections::VecDeque,
-    fmt,
-    future::poll_fn,
-    sync::Mutex,
-    task::{Poll, Waker},
-};
-
-pub struct Queue<T> {
-    inner: Mutex<Inner<T>>,
-}
-
-struct Inner<T> {
-    waiters: VecDeque<usize>,
-    free: VecDeque<usize>,
-    slots: Vec<Option<(Option<Waker>, Option<T>)>>,
-}
-
-#[derive(Clone, Copy)]
-pub struct Position<'q, T> {
-    idx: usize,
-    queue: &'q Queue<T>,
-}
-
-impl<T> fmt::Debug for Position<'_, T> {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
-        f.debug_struct("Position").field("idx", &self.idx).finish()
-    }
-}
-
-impl<T> Inner<T> {
-    #[cfg(not(test))]
-    #[inline]
-    fn integrity_check(&self) {}
-
-    #[cfg(test)]
-    fn integrity_check(&self) {
-        use std::collections::HashSet;
-        let waiters = self.waiters.iter().copied().collect::<HashSet<_>>();
-        let free = self.free.iter().copied().collect::<HashSet<_>>();
-        for (slot_idx, slot) in self.slots.iter().enumerate() {
-            match slot {
-                None => {
-                    assert!(!waiters.contains(&slot_idx));
-                    assert!(free.contains(&slot_idx));
-                }
-                Some((None, None)) => {
-                    assert!(waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-                Some((Some(_), Some(_))) => {
-                    assert!(!waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-                Some((Some(_), None)) => {
-                    assert!(waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-                Some((None, Some(_))) => {
-                    assert!(!waiters.contains(&slot_idx));
-                    assert!(!free.contains(&slot_idx));
-                }
-            }
-        }
-    }
-}
-
-impl<T> Queue<T> {
-    pub fn new(size: usize) -> Self {
-        Queue {
-            inner: Mutex::new(Inner {
-                waiters: VecDeque::new(),
-                free: (0..size).collect(),
-                slots: {
-                    let mut v = Vec::with_capacity(size);
-                    v.resize_with(size, || None);
-                    v
-                },
-            }),
-        }
-    }
-    pub fn begin(&self) -> Result<Position<T>, ()> {
-        #[cfg(test)]
-        tracing::trace!("get in line locking inner");
-        let mut inner = self.inner.lock().unwrap();
-        inner.integrity_check();
-        let my_waitslot_idx = inner
-            .free
-            .pop_front()
-            .expect("can't happen, len(slots) = len(waiters");
-        inner.waiters.push_back(my_waitslot_idx);
-        let prev = inner.slots[my_waitslot_idx].replace((None, None));
-        assert!(prev.is_none());
-        inner.integrity_check();
-        Ok(Position {
-            idx: my_waitslot_idx,
-            queue: &self,
-        })
-    }
-}
-
-impl<'q, T> Position<'q, T> {
-    pub fn complete_and_wait(self, datum: T) -> impl std::future::Future<Output = T> + 'q {
-        #[cfg(test)]
-        tracing::trace!("found victim locking waiters");
-        let mut inner = self.queue.inner.lock().unwrap();
-        inner.integrity_check();
-        let winner_idx = inner.waiters.pop_front().expect("we put ourselves in");
-        #[cfg(test)]
-        tracing::trace!(winner_idx, "putting victim into next waiters slot");
-        let winner_slot = inner.slots[winner_idx].as_mut().unwrap();
-        let prev = winner_slot.1.replace(datum);
-        assert!(
-            prev.is_none(),
-            "ensure we didn't mess up this simple ring buffer structure"
-        );
-        if let Some(waker) = winner_slot.0.take() {
-            #[cfg(test)]
-            tracing::trace!(winner_idx, "waking up winner");
-            waker.wake()
-        }
-        inner.integrity_check();
-        drop(inner); // the poll_fn locks it again
-
-        let mut poll_num = 0;
-        let mut drop_guard = Some(scopeguard::guard((), |()| {
-            panic!("must not drop this future until Ready");
-        }));
-
-        // take the victim that was found by someone else
-        poll_fn(move |cx| {
-            let my_waitslot_idx = self.idx;
-            poll_num += 1;
-            #[cfg(test)]
-            tracing::trace!(poll_num, "poll_fn locking waiters");
-            let mut inner = self.queue.inner.lock().unwrap();
-            inner.integrity_check();
-            let my_waitslot = inner.slots[self.idx].as_mut().unwrap();
-            // assert!(
-            //     poll_num <= 2,
-            //     "once we place the waker in the slot, next wakeup should have a result: {}",
-            //     my_waitslot.1.is_some()
-            // );
-            if let Some(res) = my_waitslot.1.take() {
-                #[cfg(test)]
-                tracing::trace!(poll_num, "have cache slot");
-                // above .take() resets the waiters slot to None
-                debug_assert!(my_waitslot.0.is_none());
-                debug_assert!(my_waitslot.1.is_none());
-                inner.slots[my_waitslot_idx] = None;
-                inner.free.push_back(my_waitslot_idx);
-                let _ = scopeguard::ScopeGuard::into_inner(drop_guard.take().unwrap());
-                inner.integrity_check();
-                return Poll::Ready(res);
-            }
-            // assert_eq!(poll_num, 1);
-            if !my_waitslot
-                .0
-                .as_ref()
-                .map(|existing| cx.waker().will_wake(existing))
-                .unwrap_or(false)
-            {
-                let prev = my_waitslot.0.replace(cx.waker().clone());
-                #[cfg(test)]
-                tracing::trace!(poll_num, prev_is_some = prev.is_some(), "updating waker");
-            }
-            inner.integrity_check();
-            #[cfg(test)]
-            tracing::trace!(poll_num, "waiting to be woken up");
-            Poll::Pending
-        })
-    }
-}
-
-#[cfg(test)]
-mod test {
-    use std::{
-        sync::{
-            atomic::{AtomicBool, Ordering},
-            Arc,
-        },
-        task::Poll,
-        time::Duration,
-    };
-
-    use rand::RngCore;
-
-    #[tokio::test]
-    async fn in_order_completion_and_wait() {
-        let queue = super::Queue::new(2);
-
-        let q1 = queue.begin().unwrap();
-        let q2 = queue.begin().unwrap();
-
-        assert_eq!(q1.complete_and_wait(23).await, 23);
-        assert_eq!(q2.complete_and_wait(42).await, 42);
-    }
-
-    #[tokio::test]
-    async fn out_of_order_completion_and_wait() {
-        let queue = super::Queue::new(2);
-
-        let q1 = queue.begin().unwrap();
-        let q2 = queue.begin().unwrap();
-
-        let mut q2compfut = q2.complete_and_wait(23);
-
-        match futures::poll!(&mut q2compfut) {
-            Poll::Pending => {}
-            Poll::Ready(_) => panic!("should not be ready yet, it's queued after q1"),
-        }
-
-        let q1res = q1.complete_and_wait(42).await;
-        assert_eq!(q1res, 23);
-
-        let q2res = q2compfut.await;
-        assert_eq!(q2res, 42);
-    }
-
-    #[tokio::test]
-    async fn in_order_completion_out_of_order_wait() {
-        let queue = super::Queue::new(2);
-
-        let q1 = queue.begin().unwrap();
-        let q2 = queue.begin().unwrap();
-
-        let mut q1compfut = q1.complete_and_wait(23);
-
-        let mut q2compfut = q2.complete_and_wait(42);
-
-        match futures::poll!(&mut q2compfut) {
-            Poll::Pending => {
-                unreachable!("q2 should be ready, it wasn't first but q1 is serviced already")
-            }
-            Poll::Ready(x) => assert_eq!(x, 42),
-        }
-
-        assert_eq!(futures::poll!(&mut q1compfut), Poll::Ready(23));
-    }
-
-    #[tokio::test(flavor = "multi_thread")]
-    async fn stress() {
-        let ntasks = 8;
-        let queue_size = 8;
-        let queue = Arc::new(super::Queue::new(queue_size));
-
-        let stop = Arc::new(AtomicBool::new(false));
-
-        let mut tasks = vec![];
-        for i in 0..ntasks {
-            let jh = tokio::spawn({
-                let queue = Arc::clone(&queue);
-                let stop = Arc::clone(&stop);
-                async move {
-                    while !stop.load(Ordering::Relaxed) {
-                        let q = queue.begin().unwrap();
-                        for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
-                            std::hint::spin_loop();
-                        }
-                        q.complete_and_wait(i).await;
-                        tokio::task::yield_now().await;
-                    }
-                }
-            });
-            tasks.push(jh);
-        }
-
-        tokio::time::sleep(Duration::from_secs(10)).await;
-
-        stop.store(true, Ordering::Relaxed);
-
-        for t in tasks {
-            t.await.unwrap();
-        }
-    }
-
-    #[test]
-    fn stress_two_runtimes_shared_queue() {
-        std::thread::scope(|s| {
-            let ntasks = 8;
-            let queue_size = 8;
-            let queue = Arc::new(super::Queue::new(queue_size));
-
-            let stop = Arc::new(AtomicBool::new(false));
-
-            for i in 0..ntasks {
-                s.spawn({
-                    let queue = Arc::clone(&queue);
-                    let stop = Arc::clone(&stop);
-                    move || {
-                        let rt = tokio::runtime::Builder::new_current_thread()
-                            .enable_all()
-                            .build()
-                            .unwrap();
-                        rt.block_on(async move {
-                            while !stop.load(Ordering::Relaxed) {
-                                let q = queue.begin().unwrap();
-                                for _ in 0..(rand::thread_rng().next_u32() % 10_000) {
-                                    std::hint::spin_loop();
-                                }
-                                q.complete_and_wait(i).await;
-                                tokio::task::yield_now().await;
-                            }
-                        });
-                    }
-                });
-            }
-
-            std::thread::sleep(Duration::from_secs(10));
-
-            stop.store(true, Ordering::Relaxed);
-        });
-    }
-}
--- a/libs/pageserver_api/Cargo.toml
+++ b/libs/pageserver_api/Cargo.toml
@@ -18,6 +18,7 @@ enum-map.workspace = true
 strum.workspace = true
 strum_macros.workspace = true
 hex.workspace = true
+thiserror.workspace = true

 workspace_hack.workspace = true

--- a/libs/pageserver_api/src/control_api.rs
+++ b/libs/pageserver_api/src/control_api.rs
@@ -4,7 +4,9 @@
 //! See docs/rfcs/025-generation-numbers.md

 use serde::{Deserialize, Serialize};
-use utils::id::{NodeId, TenantId};
+use utils::id::NodeId;
+
+use crate::shard::TenantShardId;

 #[derive(Serialize, Deserialize)]
 pub struct ReAttachRequest {
@@ -13,7 +15,7 @@ pub struct ReAttachRequest {

 #[derive(Serialize, Deserialize)]
 pub struct ReAttachResponseTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub gen: u32,
 }

@@ -24,7 +26,7 @@ pub struct ReAttachResponse {

 #[derive(Serialize, Deserialize)]
 pub struct ValidateRequestTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub gen: u32,
 }

@@ -40,6 +42,6 @@ pub struct ValidateResponse {

 #[derive(Serialize, Deserialize)]
 pub struct ValidateResponseTenant {
-    pub id: TenantId,
+    pub id: TenantShardId,
    pub valid: bool,
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -140,3 +140,7 @@ impl Key {
        })
    }
 }
+
+pub fn is_rel_block_key(key: &Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -10,7 +10,6 @@ use serde_with::serde_as;
 use strum_macros;
 use utils::{
    completion,
-    generation::Generation,
    history_buffer::HistoryBufferWithDropCounter,
    id::{NodeId, TenantId, TimelineId},
    lsn::Lsn,
@@ -180,6 +179,8 @@ pub struct TimelineCreateRequest {
    #[serde(default)]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
+    pub existing_initdb_timeline_id: Option<TimelineId>,
+    #[serde(default)]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }
@@ -262,10 +263,19 @@ pub struct LocationConfig {
    pub mode: LocationConfigMode,
    /// If attaching, in what generation?
    #[serde(default)]
-    pub generation: Option<Generation>,
+    pub generation: Option<u32>,
    #[serde(default)]
    pub secondary_conf: Option<LocationConfigSecondary>,

+    // Shard parameters: if shard_count is nonzero, then other shard_* fields
+    // must be set accurately.
+    #[serde(default)]
+    pub shard_number: u8,
+    #[serde(default)]
+    pub shard_count: u8,
+    #[serde(default)]
+    pub shard_stripe_size: u32,
+
    // If requesting mode `Secondary`, configuration for that.
    // Custom storage configuration for the tenant, if any
    pub tenant_conf: TenantConfig,
@@ -306,25 +316,7 @@ impl std::ops::Deref for TenantConfigRequest {

 impl TenantConfigRequest {
    pub fn new(tenant_id: TenantId) -> TenantConfigRequest {
-        let config = TenantConfig {
-            checkpoint_distance: None,
-            checkpoint_timeout: None,
-            compaction_target_size: None,
-            compaction_period: None,
-            compaction_threshold: None,
-            gc_horizon: None,
-            gc_period: None,
-            image_creation_threshold: None,
-            pitr_interval: None,
-            walreceiver_connect_timeout: None,
-            lagging_wal_timeout: None,
-            max_lsn_wal_lag: None,
-            trace_read_requests: None,
-            eviction_policy: None,
-            min_resident_size_override: None,
-            evictions_low_residence_duration_metric_threshold: None,
-            gc_feedback: None,
-        };
+        let config = TenantConfig::default();
        TenantConfigRequest { tenant_id, config }
    }
 }
@@ -392,7 +384,9 @@ pub struct TimelineInfo {
    /// The LSN that we are advertizing to safekeepers
    pub remote_consistent_lsn_visible: Lsn,

-    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
+    pub current_logical_size: u64,
+    pub current_logical_size_is_accurate: bool,
+
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,13 +1,15 @@
 use std::{ops::RangeInclusive, str::FromStr};

+use crate::key::{is_rel_block_key, Key};
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
+use thiserror;
 use utils::id::TenantId;

-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardNumber(pub u8);

-#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug)]
+#[derive(Ord, PartialOrd, Eq, PartialEq, Clone, Copy, Serialize, Deserialize, Debug, Hash)]
 pub struct ShardCount(pub u8);

 impl ShardCount {
@@ -38,7 +40,7 @@ impl ShardNumber {
 /// Note that the binary encoding is _not_ backward compatible, because
 /// at the time sharding is introduced, there are no existing binary structures
 /// containing TenantId that we need to handle.
-#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy, Hash)]
 pub struct TenantShardId {
    pub tenant_id: TenantId,
    pub shard_number: ShardNumber,
@@ -139,6 +141,89 @@ impl From<[u8; 18]> for TenantShardId {
    }
 }

+/// For use within the context of a particular tenant, when we need to know which
+/// shard we're dealing with, but do not need to know the full ShardIdentity (because
+/// we won't be doing any page->shard mapping), and do not need to know the fully qualified
+/// TenantShardId.
+#[derive(Eq, PartialEq, PartialOrd, Ord, Clone, Copy)]
+pub struct ShardIndex {
+    pub shard_number: ShardNumber,
+    pub shard_count: ShardCount,
+}
+
+impl ShardIndex {
+    pub fn new(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            shard_number: number,
+            shard_count: count,
+        }
+    }
+    pub fn unsharded() -> Self {
+        Self {
+            shard_number: ShardNumber(0),
+            shard_count: ShardCount(0),
+        }
+    }
+
+    pub fn is_unsharded(&self) -> bool {
+        self.shard_number == ShardNumber(0) && self.shard_count == ShardCount(0)
+    }
+
+    /// For use in constructing remote storage paths: concatenate this with a TenantId
+    /// to get a fully qualified TenantShardId.
+    ///
+    /// Backward compat: this function returns an empty string if Self::is_unsharded, such
+    /// that the legacy pre-sharding remote key format is preserved.
+    pub fn get_suffix(&self) -> String {
+        if self.is_unsharded() {
+            "".to_string()
+        } else {
+            format!("-{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+        }
+    }
+}
+
+impl std::fmt::Display for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
+    }
+}
+
+impl std::fmt::Debug for ShardIndex {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        // Debug is the same as Display: the compact hex representation
+        write!(f, "{}", self)
+    }
+}
+
+impl std::str::FromStr for ShardIndex {
+    type Err = hex::FromHexError;
+
+    fn from_str(s: &str) -> Result<Self, Self::Err> {
+        // Expect format: 1 byte shard number, 1 byte shard count
+        if s.len() == 4 {
+            let bytes = s.as_bytes();
+            let mut shard_parts: [u8; 2] = [0u8; 2];
+            hex::decode_to_slice(bytes, &mut shard_parts)?;
+            Ok(Self {
+                shard_number: ShardNumber(shard_parts[0]),
+                shard_count: ShardCount(shard_parts[1]),
+            })
+        } else {
+            Err(hex::FromHexError::InvalidStringLength)
+        }
+    }
+}
+
+impl From<[u8; 2]> for ShardIndex {
+    fn from(b: [u8; 2]) -> Self {
+        Self {
+            shard_number: ShardNumber(b[0]),
+            shard_count: ShardCount(b[1]),
+        }
+    }
+}
+
 impl Serialize for TenantShardId {
    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
    where
@@ -209,6 +294,255 @@ impl<'de> Deserialize<'de> for TenantShardId {
    }
 }

+/// Stripe size in number of pages
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardStripeSize(pub u32);
+
+/// Layout version: for future upgrades where we might change how the key->shard mapping works
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardLayout(u8);
+
+const LAYOUT_V1: ShardLayout = ShardLayout(1);
+/// ShardIdentity uses a magic layout value to indicate if it is unusable
+const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);
+
+/// Default stripe size in pages: 256MiB divided by 8kiB page size.
+const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
+
+/// The ShardIdentity contains the information needed for one member of map
+/// to resolve a key to a shard, and then check whether that shard is ==self.
+#[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
+pub struct ShardIdentity {
+    pub number: ShardNumber,
+    pub count: ShardCount,
+    stripe_size: ShardStripeSize,
+    layout: ShardLayout,
+}
+
+#[derive(thiserror::Error, Debug, PartialEq, Eq)]
+pub enum ShardConfigError {
+    #[error("Invalid shard count")]
+    InvalidCount,
+    #[error("Invalid shard number")]
+    InvalidNumber,
+    #[error("Invalid stripe size")]
+    InvalidStripeSize,
+}
+
+impl ShardIdentity {
+    /// An identity with number=0 count=0 is a "none" identity, which represents legacy
+    /// tenants.  Modern single-shard tenants should not use this: they should
+    /// have number=0 count=1.
+    pub fn unsharded() -> Self {
+        Self {
+            number: ShardNumber(0),
+            count: ShardCount(0),
+            layout: LAYOUT_V1,
+            stripe_size: DEFAULT_STRIPE_SIZE,
+        }
+    }
+
+    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
+    /// which are constructed in code paths that don't have access to proper configuration.
+    ///
+    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
+    /// Enforcement is via assertions, to avoid making our interface fallible for this
+    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
+    /// state, and by extension to avoid trying to do any page->shard resolution.
+    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
+        Self {
+            number,
+            count,
+            layout: LAYOUT_BROKEN,
+            stripe_size: DEFAULT_STRIPE_SIZE,
+        }
+    }
+
+    pub fn is_unsharded(&self) -> bool {
+        self.number == ShardNumber(0) && self.count == ShardCount(0)
+    }
+
+    /// Count must be nonzero, and number must be < count. To construct
+    /// the legacy case (count==0), use Self::unsharded instead.
+    pub fn new(
+        number: ShardNumber,
+        count: ShardCount,
+        stripe_size: ShardStripeSize,
+    ) -> Result<Self, ShardConfigError> {
+        if count.0 == 0 {
+            Err(ShardConfigError::InvalidCount)
+        } else if number.0 > count.0 - 1 {
+            Err(ShardConfigError::InvalidNumber)
+        } else if stripe_size.0 == 0 {
+            Err(ShardConfigError::InvalidStripeSize)
+        } else {
+            Ok(Self {
+                number,
+                count,
+                layout: LAYOUT_V1,
+                stripe_size,
+            })
+        }
+    }
+
+    fn is_broken(&self) -> bool {
+        self.layout == LAYOUT_BROKEN
+    }
+
+    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
+        assert!(!self.is_broken());
+        key_to_shard_number(self.count, self.stripe_size, key)
+    }
+
+    /// Return true if the key should be ingested by this shard
+    pub fn is_key_local(&self, key: &Key) -> bool {
+        assert!(!self.is_broken());
+        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
+            true
+        } else {
+            key_to_shard_number(self.count, self.stripe_size, key) == self.number
+        }
+    }
+
+    pub fn shard_slug(&self) -> String {
+        if self.count > ShardCount(0) {
+            format!("-{:02x}{:02x}", self.number.0, self.count.0)
+        } else {
+            String::new()
+        }
+    }
+}
+
+impl Serialize for ShardIndex {
+    fn serialize<S>(&self, serializer: S) -> Result<S::Ok, S::Error>
+    where
+        S: serde::Serializer,
+    {
+        if serializer.is_human_readable() {
+            serializer.collect_str(self)
+        } else {
+            // Binary encoding is not used in index_part.json, but is included in anticipation of
+            // switching various structures (e.g. inter-process communication, remote metadata) to more
+            // compact binary encodings in future.
+            let mut packed: [u8; 2] = [0; 2];
+            packed[0] = self.shard_number.0;
+            packed[1] = self.shard_count.0;
+            packed.serialize(serializer)
+        }
+    }
+}
+
+impl<'de> Deserialize<'de> for ShardIndex {
+    fn deserialize<D>(deserializer: D) -> Result<Self, D::Error>
+    where
+        D: serde::Deserializer<'de>,
+    {
+        struct IdVisitor {
+            is_human_readable_deserializer: bool,
+        }
+
+        impl<'de> serde::de::Visitor<'de> for IdVisitor {
+            type Value = ShardIndex;
+
+            fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result {
+                if self.is_human_readable_deserializer {
+                    formatter.write_str("value in form of hex string")
+                } else {
+                    formatter.write_str("value in form of integer array([u8; 2])")
+                }
+            }
+
+            fn visit_seq<A>(self, seq: A) -> Result<Self::Value, A::Error>
+            where
+                A: serde::de::SeqAccess<'de>,
+            {
+                let s = serde::de::value::SeqAccessDeserializer::new(seq);
+                let id: [u8; 2] = Deserialize::deserialize(s)?;
+                Ok(ShardIndex::from(id))
+            }
+
+            fn visit_str<E>(self, v: &str) -> Result<Self::Value, E>
+            where
+                E: serde::de::Error,
+            {
+                ShardIndex::from_str(v).map_err(E::custom)
+            }
+        }
+
+        if deserializer.is_human_readable() {
+            deserializer.deserialize_str(IdVisitor {
+                is_human_readable_deserializer: true,
+            })
+        } else {
+            deserializer.deserialize_tuple(
+                2,
+                IdVisitor {
+                    is_human_readable_deserializer: false,
+                },
+            )
+        }
+    }
+}
+
+/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
+/// in order to be able to serve basebackup requests without peer communication).
+fn key_is_shard0(key: &Key) -> bool {
+    // To decide what to shard out to shards >0, we apply a simple rule that only
+    // relation pages are distributed to shards other than shard zero. Everything else gets
+    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
+    // requests, and any request other than those for particular blocks in relations.
+    //
+    // In this condition:
+    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
+    // all metadata.
+    // - field6 is set to -1 for relation size pages.
+    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+fn murmurhash32(mut h: u32) -> u32 {
+    h ^= h >> 16;
+    h = h.wrapping_mul(0x85ebca6b);
+    h ^= h >> 13;
+    h = h.wrapping_mul(0xc2b2ae35);
+    h ^= h >> 16;
+    h
+}
+
+/// Provide the same result as the function in postgres `hashfn.h` with the same name
+fn hash_combine(mut a: u32, mut b: u32) -> u32 {
+    b = b.wrapping_add(0x9e3779b9);
+    b = b.wrapping_add(a << 6);
+    b = b.wrapping_add(a >> 2);
+
+    a ^= b;
+    a
+}
+
+/// Where a Key is to be distributed across shards, select the shard.  This function
+/// does not account for keys that should be broadcast across shards.
+///
+/// The hashing in this function must exactly match what we do in postgres smgr
+/// code.  The resulting distribution of pages is intended to preserve locality within
+/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
+/// distributing data pseudo-randomly.
+///
+/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
+/// and will be handled at higher levels when shards are split.
+fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
+    // Fast path for un-sharded tenants or broadcast keys
+    if count < ShardCount(2) || key_is_shard0(key) {
+        return ShardNumber(0);
+    }
+
+    // relNode
+    let mut hash = murmurhash32(key.field4);
+    // blockNum/stripe size
+    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
+
+    ShardNumber((hash % count.0 as u32) as u8)
+}
+
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
@@ -318,4 +652,91 @@ mod tests {

        Ok(())
    }
+
+    #[test]
+    fn shard_identity_validation() -> Result<(), ShardConfigError> {
+        // Happy cases
+        ShardIdentity::new(ShardNumber(0), ShardCount(1), DEFAULT_STRIPE_SIZE)?;
+        ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(1))?;
+        ShardIdentity::new(ShardNumber(254), ShardCount(255), ShardStripeSize(1))?;
+
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(0), ShardCount(0), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidCount)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(10), ShardCount(10), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidNumber)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(11), ShardCount(10), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidNumber)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(255), ShardCount(255), DEFAULT_STRIPE_SIZE),
+            Err(ShardConfigError::InvalidNumber)
+        );
+        assert_eq!(
+            ShardIdentity::new(ShardNumber(0), ShardCount(1), ShardStripeSize(0)),
+            Err(ShardConfigError::InvalidStripeSize)
+        );
+
+        Ok(())
+    }
+
+    #[test]
+    fn shard_index_human_encoding() -> Result<(), hex::FromHexError> {
+        let example = ShardIndex {
+            shard_number: ShardNumber(13),
+            shard_count: ShardCount(17),
+        };
+        let expected: String = "0d11".to_string();
+        let encoded = format!("{example}");
+        assert_eq!(&encoded, &expected);
+
+        let decoded = ShardIndex::from_str(&encoded)?;
+        assert_eq!(example, decoded);
+        Ok(())
+    }
+
+    #[test]
+    fn shard_index_binary_encoding() -> Result<(), hex::FromHexError> {
+        let example = ShardIndex {
+            shard_number: ShardNumber(13),
+            shard_count: ShardCount(17),
+        };
+        let expected: [u8; 2] = [0x0d, 0x11];
+
+        let encoded = bincode::serialize(&example).unwrap();
+        assert_eq!(Hex(&encoded), Hex(&expected));
+        let decoded = bincode::deserialize(&encoded).unwrap();
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
+
+    // These are only smoke tests to spot check that our implementation doesn't
+    // deviate from a few examples values: not aiming to validate the overall
+    // hashing algorithm.
+    #[test]
+    fn murmur_hash() {
+        assert_eq!(murmurhash32(0), 0);
+
+        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
+    }
+
+    #[test]
+    fn shard_mapping() {
+        let key = Key {
+            field1: 0x00,
+            field2: 0x67f,
+            field3: 0x5,
+            field4: 0x400c,
+            field5: 0x00,
+            field6: 0x7d06,
+        };
+
+        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
+        assert_eq!(shard, ShardNumber(8));
+    }
 }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -289,10 +289,10 @@ impl FeStartupPacket {
        // We shouldn't advance `buf` as probably full message is not there yet,
        // so can't directly use Bytes::get_u32 etc.
        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-        // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+        // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
        // which is less readable
        #[allow(clippy::manual_range_contains)]
-        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
+        if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
            return Err(ProtocolError::Protocol(format!(
                "invalid startup packet message length {}",
                len
@@ -975,4 +975,10 @@ mod tests {
        let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
+
+    #[test]
+    fn parse_fe_startup_packet_regression() {
+        let data = [0, 0, 0, 7, 0, 0, 0, 0];
+        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
+    }
 }
--- a/libs/remote_storage/Cargo.toml
+++ b/libs/remote_storage/Cargo.toml
@@ -9,8 +9,7 @@ anyhow.workspace = true
 async-trait.workspace = true
 once_cell.workspace = true
 aws-smithy-async.workspace = true
-aws-smithy-http.workspace = true
-aws-types.workspace = true
+aws-smithy-types.workspace = true
 aws-config.workspace = true
 aws-sdk-s3.workspace = true
 aws-credential-types.workspace = true
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -14,18 +14,20 @@ use aws_config::{
    provider_config::ProviderConfig,
    retry::{RetryConfigBuilder, RetryMode},
    web_identity_token::WebIdentityTokenCredentialsProvider,
+    BehaviorVersion,
 };
-use aws_credential_types::cache::CredentialsCache;
+use aws_credential_types::provider::SharedCredentialsProvider;
 use aws_sdk_s3::{
-    config::{AsyncSleep, Config, Region, SharedAsyncSleep},
+    config::{AsyncSleep, Builder, IdentityCache, Region, SharedAsyncSleep},
    error::SdkError,
    operation::get_object::GetObjectError,
-    primitives::ByteStream,
    types::{Delete, ObjectIdentifier},
    Client,
 };
 use aws_smithy_async::rt::sleep::TokioSleep;
-use aws_smithy_http::body::SdkBody;
+
+use aws_smithy_types::body::SdkBody;
+use aws_smithy_types::byte_stream::ByteStream;
 use hyper::Body;
 use scopeguard::ScopeGuard;
 use tokio::io::{self, AsyncRead};
@@ -78,7 +80,6 @@ impl S3Bucket {
            // needed to access remote extensions bucket
            .or_else("token", {
                let provider_conf = ProviderConfig::without_region().with_region(region.clone());
-
                WebIdentityTokenCredentialsProvider::builder()
                    .configure(&provider_conf)
                    .build()
@@ -98,18 +99,20 @@ impl S3Bucket {
            .set_max_attempts(Some(1))
            .set_mode(Some(RetryMode::Adaptive));

-        let mut config_builder = Config::builder()
+        let mut config_builder = Builder::default()
+            .behavior_version(BehaviorVersion::v2023_11_09())
            .region(region)
-            .credentials_cache(CredentialsCache::lazy())
-            .credentials_provider(credentials_provider)
-            .sleep_impl(SharedAsyncSleep::from(sleep_impl))
-            .retry_config(retry_config.build());
+            .identity_cache(IdentityCache::lazy().build())
+            .credentials_provider(SharedCredentialsProvider::new(credentials_provider))
+            .retry_config(retry_config.build())
+            .sleep_impl(SharedAsyncSleep::from(sleep_impl));

        if let Some(custom_endpoint) = aws_config.endpoint.clone() {
            config_builder = config_builder
                .endpoint_url(custom_endpoint)
                .force_path_style(true);
        }
+
        let client = Client::from_conf(config_builder.build());

        let prefix_in_bucket = aws_config.prefix_in_bucket.as_deref().map(|prefix| {
@@ -371,11 +374,11 @@ impl RemoteStorage for S3Bucket {

            let response = response?;

-            let keys = response.contents().unwrap_or_default();
+            let keys = response.contents();
            let empty = Vec::new();
            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);

-            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());

            for object in keys {
                let object_path = object.key().expect("response does not contain a key");
@@ -411,7 +414,7 @@ impl RemoteStorage for S3Bucket {
        let started_at = start_measuring_requests(kind);

        let body = Body::wrap_stream(ReaderStream::new(from));
-        let bytes_stream = ByteStream::new(SdkBody::from(body));
+        let bytes_stream = ByteStream::new(SdkBody::from_body_0_4(body));

        let res = self
            .client
@@ -474,7 +477,7 @@ impl RemoteStorage for S3Bucket {
        for path in paths {
            let obj_id = ObjectIdentifier::builder()
                .set_key(Some(self.relative_path_to_s3_object(path)))
-                .build();
+                .build()?;
            delete_objects.push(obj_id);
        }

@@ -485,7 +488,11 @@ impl RemoteStorage for S3Bucket {
                .client
                .delete_objects()
                .bucket(self.bucket_name.clone())
-                .delete(Delete::builder().set_objects(Some(chunk.to_vec())).build())
+                .delete(
+                    Delete::builder()
+                        .set_objects(Some(chunk.to_vec()))
+                        .build()?,
+                )
                .send()
                .await;

--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -152,3 +152,16 @@ impl Debug for Generation {
        }
    }
 }
+
+#[cfg(test)]
+mod test {
+    use super::*;
+
+    #[test]
+    fn generation_gt() {
+        // Important that a None generation compares less than a valid one, during upgrades from
+        // pre-generation systems.
+        assert!(Generation::none() < Generation::new(0));
+        assert!(Generation::none() < Generation::new(1));
+    }
+}
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -37,7 +37,6 @@ humantime-serde.workspace = true
 hyper.workspace = true
 itertools.workspace = true
 nix.workspace = true
-nostarve_queue.workspace = true
 # hack to get the number of worker threads tokio uses
 num_cpus = { version = "1.15" }
 num-traits.workspace = true
@@ -52,6 +51,7 @@ regex.workspace = true
 scopeguard.workspace = true
 serde.workspace = true
 serde_json = { workspace = true, features = ["raw_value"] }
+serde_path_to_error.workspace = true
 serde_with.workspace = true
 signal-hook.workspace = true
 smallvec = { workspace = true, features = ["write"] }
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -3,6 +3,7 @@ use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
+use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
@@ -211,7 +212,7 @@ fn bench_sequential(c: &mut Criterion) {
        let i32 = (i as u32) % 100;
        let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
        let layer = PersistentLayerDesc::new_img(
-            TenantId::generate(),
+            TenantShardId::unsharded(TenantId::generate()),
            TimelineId::generate(),
            zero.add(10 * i32)..zero.add(10 * i32 + 1),
            Lsn(i),
--- a/pageserver/ctl/src/layers.rs
+++ b/pageserver/ctl/src/layers.rs
@@ -1,13 +1,15 @@
 use std::path::{Path, PathBuf};

 use anyhow::Result;
-use camino::Utf8Path;
+use camino::{Utf8Path, Utf8PathBuf};
 use clap::Subcommand;
 use pageserver::context::{DownloadBehavior, RequestContext};
 use pageserver::task_mgr::TaskKind;
 use pageserver::tenant::block_io::BlockCursor;
 use pageserver::tenant::disk_btree::DiskBtreeReader;
 use pageserver::tenant::storage_layer::delta_layer::{BlobRef, Summary};
+use pageserver::tenant::storage_layer::{delta_layer, image_layer};
+use pageserver::tenant::storage_layer::{DeltaLayer, ImageLayer};
 use pageserver::tenant::{TENANTS_SEGMENT_NAME, TIMELINES_SEGMENT_NAME};
 use pageserver::{page_cache, virtual_file};
 use pageserver::{
@@ -20,6 +22,7 @@ use pageserver::{
 };
 use std::fs;
 use utils::bin_ser::BeSer;
+use utils::id::{TenantId, TimelineId};

 use crate::layer_map_analyzer::parse_filename;

@@ -45,6 +48,13 @@ pub(crate) enum LayerCmd {
        /// The id from list-layer command
        id: usize,
    },
+    RewriteSummary {
+        layer_file_path: Utf8PathBuf,
+        #[clap(long)]
+        new_tenant_id: Option<TenantId>,
+        #[clap(long)]
+        new_timeline_id: Option<TimelineId>,
+    },
 }

 async fn read_delta_file(path: impl AsRef<Path>, ctx: &RequestContext) -> Result<()> {
@@ -100,6 +110,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                    println!("- timeline {}", timeline.file_name().to_string_lossy());
                }
            }
+            Ok(())
        }
        LayerCmd::ListLayer {
            path,
@@ -128,6 +139,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                    idx += 1;
                }
            }
+            Ok(())
        }
        LayerCmd::DumpLayer {
            path,
@@ -168,7 +180,63 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> {
                    idx += 1;
                }
            }
+            Ok(())
+        }
+        LayerCmd::RewriteSummary {
+            layer_file_path,
+            new_tenant_id,
+            new_timeline_id,
+        } => {
+            pageserver::virtual_file::init(10);
+            pageserver::page_cache::init(100);
+
+            let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error);
+
+            macro_rules! rewrite_closure {
+                ($($summary_ty:tt)*) => {{
+                    |summary| $($summary_ty)* {
+                        tenant_id: new_tenant_id.unwrap_or(summary.tenant_id),
+                        timeline_id: new_timeline_id.unwrap_or(summary.timeline_id),
+                        ..summary
+                    }
+                }};
+            }
+
+            let res = ImageLayer::rewrite_summary(
+                layer_file_path,
+                rewrite_closure!(image_layer::Summary),
+                &ctx,
+            )
+            .await;
+            match res {
+                Ok(()) => {
+                    println!("Successfully rewrote summary of image layer {layer_file_path}");
+                    return Ok(());
+                }
+                Err(image_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
+                Err(image_layer::RewriteSummaryError::Other(e)) => {
+                    return Err(e);
+                }
+            }
+
+            let res = DeltaLayer::rewrite_summary(
+                layer_file_path,
+                rewrite_closure!(delta_layer::Summary),
+                &ctx,
+            )
+            .await;
+            match res {
+                Ok(()) => {
+                    println!("Successfully rewrote summary of delta layer {layer_file_path}");
+                    return Ok(());
+                }
+                Err(delta_layer::RewriteSummaryError::MagicMismatch) => (), // fallthrough
+                Err(delta_layer::RewriteSummaryError::Other(e)) => {
+                    return Err(e);
+                }
+            }
+
+            anyhow::bail!("not an image or delta layer: {layer_file_path}");
        }
    }
-    Ok(())
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -402,15 +402,11 @@ fn start_pageserver(
    let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
    let (init_done_tx, init_done_rx) = utils::completion::channel();

-    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
-
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

    let order = pageserver::InitializationOrder {
        initial_tenant_load_remote: Some(init_done_tx),
        initial_tenant_load: Some(init_remote_done_tx),
-        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

@@ -464,7 +460,7 @@ fn start_pageserver(
            });

            let WaitForPhaseResult {
-                timeout_remaining: timeout,
+                timeout_remaining: _timeout,
                skipped: init_load_skipped,
            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

@@ -472,26 +468,6 @@ fn start_pageserver(

            scopeguard::ScopeGuard::into_inner(guard);

-            let guard = scopeguard::guard_on_success((), |_| {
-                tracing::info!("Cancelled before initial logical sizes completed")
-            });
-
-            let logical_sizes_done = std::pin::pin!(async {
-                init_logical_size_done_rx.wait().await;
-                startup_checkpoint(
-                    started_startup_at,
-                    "initial_logical_sizes",
-                    "Initial logical sizes completed",
-                );
-            });
-
-            let WaitForPhaseResult {
-                timeout_remaining: _,
-                skipped: logical_sizes_skipped,
-            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
-
-            scopeguard::ScopeGuard::into_inner(guard);
-
            // allow background jobs to start: we either completed prior stages, or they reached timeout
            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
            // because things like consumption metrics for billing are blocked by this barrier.
@@ -514,9 +490,6 @@ fn start_pageserver(
            if let Some(f) = init_load_skipped {
                f.await;
            }
-            if let Some(f) = logical_sizes_skipped {
-                f.await;
-            }
            scopeguard::ScopeGuard::into_inner(guard);

            startup_checkpoint(started_startup_at, "complete", "Startup complete");
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -5,6 +5,7 @@
 //! See also `settings.md` for better description on every parameter.

 use anyhow::{anyhow, bail, ensure, Context, Result};
+use pageserver_api::shard::TenantShardId;
 use remote_storage::{RemotePath, RemoteStorageConfig};
 use serde::de::IntoDeserializer;
 use std::env;
@@ -25,7 +26,7 @@ use toml_edit::{Document, Item};
 use camino::{Utf8Path, Utf8PathBuf};
 use postgres_backend::AuthType;
 use utils::{
-    id::{NodeId, TenantId, TimelineId},
+    id::{NodeId, TimelineId},
    logging::LogFormat,
 };

@@ -628,12 +629,13 @@ impl PageServerConf {
        self.deletion_prefix().join(format!("header-{VERSION:02x}"))
    }

-    pub fn tenant_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenants_path().join(tenant_id.to_string())
+    pub fn tenant_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenants_path().join(tenant_shard_id.to_string())
    }

-    pub fn tenant_ignore_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id).join(IGNORED_TENANT_FILE_NAME)
+    pub fn tenant_ignore_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
+            .join(IGNORED_TENANT_FILE_NAME)
    }

    /// Points to a place in pageserver's local directory,
@@ -641,47 +643,53 @@ impl PageServerConf {
    ///
    /// Legacy: superseded by tenant_location_config_path.  Eventually
    /// remove this function.
-    pub fn tenant_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id).join(TENANT_CONFIG_NAME)
+    pub fn tenant_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id).join(TENANT_CONFIG_NAME)
    }

-    pub fn tenant_location_config_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id)
+    pub fn tenant_location_config_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
            .join(TENANT_LOCATION_CONFIG_NAME)
    }

-    pub fn timelines_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id).join(TIMELINES_SEGMENT_NAME)
+    pub fn timelines_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
+            .join(TIMELINES_SEGMENT_NAME)
    }

-    pub fn timeline_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
-        self.timelines_path(tenant_id).join(timeline_id.to_string())
+    pub fn timeline_path(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Utf8PathBuf {
+        self.timelines_path(tenant_shard_id)
+            .join(timeline_id.to_string())
    }

    pub fn timeline_uninit_mark_file_path(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Utf8PathBuf {
        path_with_suffix_extension(
-            self.timeline_path(&tenant_id, &timeline_id),
+            self.timeline_path(&tenant_shard_id, &timeline_id),
            TIMELINE_UNINIT_MARK_SUFFIX,
        )
    }

    pub fn timeline_delete_mark_file_path(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Utf8PathBuf {
        path_with_suffix_extension(
-            self.timeline_path(&tenant_id, &timeline_id),
+            self.timeline_path(&tenant_shard_id, &timeline_id),
            TIMELINE_DELETE_MARK_SUFFIX,
        )
    }

-    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> Utf8PathBuf {
-        self.tenant_path(tenant_id)
+    pub fn tenant_deleted_mark_file_path(&self, tenant_shard_id: &TenantShardId) -> Utf8PathBuf {
+        self.tenant_path(tenant_shard_id)
            .join(TENANT_DELETED_MARKER_FILE_NAME)
    }

@@ -691,20 +699,24 @@ impl PageServerConf {

    pub fn trace_path(
        &self,
-        tenant_id: &TenantId,
+        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
        connection_id: &ConnectionId,
    ) -> Utf8PathBuf {
        self.traces_path()
-            .join(tenant_id.to_string())
+            .join(tenant_shard_id.to_string())
            .join(timeline_id.to_string())
            .join(connection_id.to_string())
    }

    /// Points to a place in pageserver's local directory,
    /// where certain timeline's metadata file should be located.
-    pub fn metadata_path(&self, tenant_id: &TenantId, timeline_id: &TimelineId) -> Utf8PathBuf {
-        self.timeline_path(tenant_id, timeline_id)
+    pub fn metadata_path(
+        &self,
+        tenant_shard_id: &TenantShardId,
+        timeline_id: &TimelineId,
+    ) -> Utf8PathBuf {
+        self.timeline_path(tenant_shard_id, timeline_id)
            .join(METADATA_FILE_NAME)
    }

@@ -767,7 +779,7 @@ impl PageServerConf {
                    builder.remote_storage_config(RemoteStorageConfig::from_toml(item)?)
                }
                "tenant_config" => {
-                    t_conf = Self::parse_toml_tenant_conf(item)?;
+                    t_conf = TenantConfOpt::try_from(item.to_owned()).context(format!("failed to parse: '{key}'"))?;
                }
                "id" => builder.id(NodeId(parse_toml_u64(key, item)?)),
                "broker_endpoint" => builder.broker_endpoint(parse_toml_string(key, item)?.parse().context("failed to parse broker endpoint")?),
@@ -841,114 +853,10 @@ impl PageServerConf {
        Ok(conf)
    }

-    // subroutine of parse_and_validate to parse `[tenant_conf]` section
-
-    pub fn parse_toml_tenant_conf(item: &toml_edit::Item) -> Result<TenantConfOpt> {
-        let mut t_conf: TenantConfOpt = Default::default();
-        if let Some(checkpoint_distance) = item.get("checkpoint_distance") {
-            t_conf.checkpoint_distance =
-                Some(parse_toml_u64("checkpoint_distance", checkpoint_distance)?);
-        }
-
-        if let Some(checkpoint_timeout) = item.get("checkpoint_timeout") {
-            t_conf.checkpoint_timeout = Some(parse_toml_duration(
-                "checkpoint_timeout",
-                checkpoint_timeout,
-            )?);
-        }
-
-        if let Some(compaction_target_size) = item.get("compaction_target_size") {
-            t_conf.compaction_target_size = Some(parse_toml_u64(
-                "compaction_target_size",
-                compaction_target_size,
-            )?);
-        }
-
-        if let Some(compaction_period) = item.get("compaction_period") {
-            t_conf.compaction_period =
-                Some(parse_toml_duration("compaction_period", compaction_period)?);
-        }
-
-        if let Some(compaction_threshold) = item.get("compaction_threshold") {
-            t_conf.compaction_threshold =
-                Some(parse_toml_u64("compaction_threshold", compaction_threshold)?.try_into()?);
-        }
-
-        if let Some(image_creation_threshold) = item.get("image_creation_threshold") {
-            t_conf.image_creation_threshold = Some(
-                parse_toml_u64("image_creation_threshold", image_creation_threshold)?.try_into()?,
-            );
-        }
-
-        if let Some(gc_horizon) = item.get("gc_horizon") {
-            t_conf.gc_horizon = Some(parse_toml_u64("gc_horizon", gc_horizon)?);
-        }
-
-        if let Some(gc_period) = item.get("gc_period") {
-            t_conf.gc_period = Some(parse_toml_duration("gc_period", gc_period)?);
-        }
-
-        if let Some(pitr_interval) = item.get("pitr_interval") {
-            t_conf.pitr_interval = Some(parse_toml_duration("pitr_interval", pitr_interval)?);
-        }
-        if let Some(walreceiver_connect_timeout) = item.get("walreceiver_connect_timeout") {
-            t_conf.walreceiver_connect_timeout = Some(parse_toml_duration(
-                "walreceiver_connect_timeout",
-                walreceiver_connect_timeout,
-            )?);
-        }
-        if let Some(lagging_wal_timeout) = item.get("lagging_wal_timeout") {
-            t_conf.lagging_wal_timeout = Some(parse_toml_duration(
-                "lagging_wal_timeout",
-                lagging_wal_timeout,
-            )?);
-        }
-        if let Some(max_lsn_wal_lag) = item.get("max_lsn_wal_lag") {
-            t_conf.max_lsn_wal_lag =
-                Some(deserialize_from_item("max_lsn_wal_lag", max_lsn_wal_lag)?);
-        }
-        if let Some(trace_read_requests) = item.get("trace_read_requests") {
-            t_conf.trace_read_requests =
-                Some(trace_read_requests.as_bool().with_context(|| {
-                    "configure option trace_read_requests is not a bool".to_string()
-                })?);
-        }
-
-        if let Some(eviction_policy) = item.get("eviction_policy") {
-            t_conf.eviction_policy = Some(
-                deserialize_from_item("eviction_policy", eviction_policy)
-                    .context("parse eviction_policy")?,
-            );
-        }
-
-        if let Some(item) = item.get("min_resident_size_override") {
-            t_conf.min_resident_size_override = Some(
-                deserialize_from_item("min_resident_size_override", item)
-                    .context("parse min_resident_size_override")?,
-            );
-        }
-
-        if let Some(item) = item.get("evictions_low_residence_duration_metric_threshold") {
-            t_conf.evictions_low_residence_duration_metric_threshold = Some(parse_toml_duration(
-                "evictions_low_residence_duration_metric_threshold",
-                item,
-            )?);
-        }
-
-        if let Some(gc_feedback) = item.get("gc_feedback") {
-            t_conf.gc_feedback = Some(
-                gc_feedback
-                    .as_bool()
-                    .with_context(|| "configure option gc_feedback is not a bool".to_string())?,
-            );
-        }
-
-        Ok(t_conf)
-    }
-
    #[cfg(test)]
    pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
-        Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
+        let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
+        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
    }

    pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
@@ -1417,6 +1325,37 @@ trace_read_requests = {trace_read_requests}"#,
        Ok(())
    }

+    #[test]
+    fn parse_incorrect_tenant_config() -> anyhow::Result<()> {
+        let config_string = r#"
+            [tenant_config]
+            checkpoint_distance = -1 # supposed to be an u64
+        "#
+        .to_string();
+
+        let toml: Document = config_string.parse()?;
+        let item = toml.get("tenant_config").unwrap();
+        let error = TenantConfOpt::try_from(item.to_owned()).unwrap_err();
+
+        let expected_error_str = "checkpoint_distance: invalid value: integer `-1`, expected u64";
+        assert_eq!(error.to_string(), expected_error_str);
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_override_tenant_config() -> anyhow::Result<()> {
+        let config_string = r#"tenant_config={ min_resident_size_override =  400 }"#.to_string();
+
+        let toml: Document = config_string.parse()?;
+        let item = toml.get("tenant_config").unwrap();
+        let conf = TenantConfOpt::try_from(item.to_owned()).unwrap();
+
+        assert_eq!(conf.min_resident_size_override, Some(400));
+
+        Ok(())
+    }
+
    #[test]
    fn eviction_pageserver_config_parse() -> anyhow::Result<()> {
        let tempdir = tempdir()?;
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -1,8 +1,8 @@
-use crate::context::RequestContext;
-use anyhow::Context;
+use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
+use pageserver_api::shard::ShardNumber;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
    id::{TenantId, TimelineId},
@@ -229,6 +229,11 @@ where
    while let Some((tenant_id, tenant)) = tenants.next().await {
        let mut tenant_resident_size = 0;

+        // Sharded tenants report all consumption metrics from shard zero
+        if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
+            continue;
+        }
+
        for timeline in tenant.list_timelines() {
            let timeline_id = timeline.timeline_id;

@@ -351,14 +356,17 @@ impl TimelineSnapshot {
            let last_record_lsn = t.get_last_record_lsn();

            let current_exact_logical_size = {
-                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_id, timeline_id = %t.timeline_id);
-                let res = span
-                    .in_scope(|| t.get_current_logical_size(ctx))
-                    .context("get_current_logical_size");
-                match res? {
+                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
+                let size = span.in_scope(|| {
+                    t.get_current_logical_size(
+                        crate::tenant::timeline::GetLogicalSizePriority::Background,
+                        ctx,
+                    )
+                });
+                match size {
                    // Only send timeline logical size when it is fully calculated.
-                    (size, is_exact) if is_exact => Some(size),
-                    (_, _) => None,
+                    CurrentLogicalSize::Exact(ref size) => Some(size.into()),
+                    CurrentLogicalSize::Approximate(_) => None,
                }
            };

--- a/pageserver/src/control_plane_client.rs
+++ b/pageserver/src/control_plane_client.rs
@@ -1,16 +1,15 @@
 use std::collections::HashMap;

-use pageserver_api::control_api::{
-    ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+use pageserver_api::{
+    control_api::{
+        ReAttachRequest, ReAttachResponse, ValidateRequest, ValidateRequestTenant, ValidateResponse,
+    },
+    shard::TenantShardId,
 };
 use serde::{de::DeserializeOwned, Serialize};
 use tokio_util::sync::CancellationToken;
 use url::Url;
-use utils::{
-    backoff,
-    generation::Generation,
-    id::{NodeId, TenantId},
-};
+use utils::{backoff, generation::Generation, id::NodeId};

 use crate::config::PageServerConf;

@@ -31,11 +30,11 @@ pub enum RetryForeverError {

 #[async_trait::async_trait]
 pub trait ControlPlaneGenerationsApi {
-    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError>;
+    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError>;
    async fn validate(
        &self,
-        tenants: Vec<(TenantId, Generation)>,
-    ) -> Result<HashMap<TenantId, bool>, RetryForeverError>;
+        tenants: Vec<(TenantShardId, Generation)>,
+    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError>;
 }

 impl ControlPlaneClient {
@@ -127,7 +126,7 @@ impl ControlPlaneClient {
 #[async_trait::async_trait]
 impl ControlPlaneGenerationsApi for ControlPlaneClient {
    /// Block until we get a successful response, or error out if we are shut down
-    async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+    async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("re-attach")
@@ -154,8 +153,8 @@ impl ControlPlaneGenerationsApi for ControlPlaneClient {
    /// Block until we get a successful response, or error out if we are shut down
    async fn validate(
        &self,
-        tenants: Vec<(TenantId, Generation)>,
-    ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+        tenants: Vec<(TenantShardId, Generation)>,
+    ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
        let re_attach_path = self
            .base_url
            .join("validate")
--- a/pageserver/src/deletion_queue.rs
+++ b/pageserver/src/deletion_queue.rs
@@ -10,11 +10,12 @@ use crate::control_plane_client::ControlPlaneGenerationsApi;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
 use crate::tenant::remote_timeline_client::remote_timeline_path;
+use crate::tenant::remote_timeline_client::LayerFileMetadata;
 use crate::virtual_file::MaybeFatalIo;
 use crate::virtual_file::VirtualFile;
 use anyhow::Context;
 use camino::Utf8PathBuf;
-use hex::FromHex;
+use pageserver_api::shard::TenantShardId;
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use serde::Deserialize;
 use serde::Serialize;
@@ -25,7 +26,7 @@ use tracing::Instrument;
 use tracing::{self, debug, error};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::generation::Generation;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;
 use utils::lsn::AtomicLsn;
 use utils::lsn::Lsn;

@@ -159,11 +160,10 @@ pub struct DeletionQueueClient {
    lsn_table: Arc<std::sync::RwLock<VisibleLsnUpdates>>,
 }

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
 struct TenantDeletionList {
    /// For each Timeline, a list of key fragments to append to the timeline remote path
    /// when reconstructing a full key
-    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
    timelines: HashMap<TimelineId, Vec<String>>,

    /// The generation in which this deletion was emitted: note that this may not be the
@@ -178,43 +178,11 @@ impl TenantDeletionList {
    }
 }

-/// For HashMaps using a `hex` compatible key, where we would like to encode the key as a string
-fn to_hex_map<S, V, I>(input: &HashMap<I, V>, serializer: S) -> Result<S::Ok, S::Error>
-where
-    S: serde::Serializer,
-    V: Serialize,
-    I: AsRef<[u8]>,
-{
-    let transformed = input.iter().map(|(k, v)| (hex::encode(k), v));
-
-    transformed
-        .collect::<HashMap<String, &V>>()
-        .serialize(serializer)
-}
-
-/// For HashMaps using a FromHex key, where we would like to decode the key
-fn from_hex_map<'de, D, V, I>(deserializer: D) -> Result<HashMap<I, V>, D::Error>
-where
-    D: serde::de::Deserializer<'de>,
-    V: Deserialize<'de>,
-    I: FromHex + std::hash::Hash + Eq,
-{
-    let hex_map = HashMap::<String, V>::deserialize(deserializer)?;
-    hex_map
-        .into_iter()
-        .map(|(k, v)| {
-            I::from_hex(k)
-                .map(|k| (k, v))
-                .map_err(|_| serde::de::Error::custom("Invalid hex ID"))
-        })
-        .collect()
-}
-
 /// Files ending with this suffix will be ignored and erased
 /// during recovery as startup.
 const TEMP_SUFFIX: &str = "tmp";

-#[derive(Debug, Serialize, Deserialize)]
+#[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
 struct DeletionList {
    /// Serialization version, for future use
    version: u8,
@@ -226,8 +194,7 @@ struct DeletionList {
    /// nested HashMaps by TenantTimelineID.  Each Tenant only appears once
    /// with one unique generation ID: if someone tries to push a second generation
    /// ID for the same tenant, we will start a new DeletionList.
-    #[serde(serialize_with = "to_hex_map", deserialize_with = "from_hex_map")]
-    tenants: HashMap<TenantId, TenantDeletionList>,
+    tenants: HashMap<TenantShardId, TenantDeletionList>,

    /// Avoid having to walk `tenants` to calculate the number of keys in
    /// the nested deletion lists
@@ -299,7 +266,7 @@ impl DeletionList {
    /// deletion list.
    fn push(
        &mut self,
-        tenant: &TenantId,
+        tenant: &TenantShardId,
        timeline: &TimelineId,
        generation: Generation,
        objects: &mut Vec<RemotePath>,
@@ -391,7 +358,7 @@ struct TenantLsnState {

 #[derive(Default)]
 struct VisibleLsnUpdates {
-    tenants: HashMap<TenantId, TenantLsnState>,
+    tenants: HashMap<TenantShardId, TenantLsnState>,
 }

 impl VisibleLsnUpdates {
@@ -448,7 +415,7 @@ impl DeletionQueueClient {

    pub(crate) fn recover(
        &self,
-        attached_tenants: HashMap<TenantId, Generation>,
+        attached_tenants: HashMap<TenantShardId, Generation>,
    ) -> Result<(), DeletionQueueError> {
        self.do_push(
            &self.tx,
@@ -465,7 +432,7 @@ impl DeletionQueueClient {
    /// backend will later wake up and notice that the tenant's generation requires validation.
    pub(crate) async fn update_remote_consistent_lsn(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
        lsn: Lsn,
@@ -476,10 +443,13 @@ impl DeletionQueueClient {
            .write()
            .expect("Lock should never be poisoned");

-        let tenant_entry = locked.tenants.entry(tenant_id).or_insert(TenantLsnState {
-            timelines: HashMap::new(),
-            generation: current_generation,
-        });
+        let tenant_entry = locked
+            .tenants
+            .entry(tenant_shard_id)
+            .or_insert(TenantLsnState {
+                timelines: HashMap::new(),
+                generation: current_generation,
+            });

        if tenant_entry.generation != current_generation {
            // Generation might have changed if we were detached and then re-attached: in this case,
@@ -506,28 +476,29 @@ impl DeletionQueueClient {
    /// generations in `layers` are the generations in which those layers were written.
    pub(crate) async fn push_layers(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
-        layers: Vec<(LayerFileName, Generation)>,
+        layers: Vec<(LayerFileName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
        if current_generation.is_none() {
            debug!("Enqueuing deletions in legacy mode, skipping queue");

            let mut layer_paths = Vec::new();
-            for (layer, generation) in layers {
+            for (layer, meta) in layers {
                layer_paths.push(remote_layer_path(
-                    &tenant_id,
+                    &tenant_shard_id.tenant_id,
                    &timeline_id,
+                    meta.shard,
                    &layer,
-                    generation,
+                    meta.generation,
                ));
            }
            self.push_immediate(layer_paths).await?;
            return self.flush_immediate().await;
        }

-        self.push_layers_sync(tenant_id, timeline_id, current_generation, layers)
+        self.push_layers_sync(tenant_shard_id, timeline_id, current_generation, layers)
    }

    /// When a Tenant has a generation, push_layers is always synchronous because
@@ -537,10 +508,10 @@ impl DeletionQueueClient {
    /// support (`<https://github.com/neondatabase/neon/issues/5395>`)
    pub(crate) fn push_layers_sync(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        current_generation: Generation,
-        layers: Vec<(LayerFileName, Generation)>,
+        layers: Vec<(LayerFileName, LayerFileMetadata)>,
    ) -> Result<(), DeletionQueueError> {
        metrics::DELETION_QUEUE
            .keys_submitted
@@ -548,7 +519,7 @@ impl DeletionQueueClient {
        self.do_push(
            &self.tx,
            ListWriterQueueMessage::Delete(DeletionOp {
-                tenant_id,
+                tenant_shard_id,
                timeline_id,
                layers,
                generation: current_generation,
@@ -751,6 +722,7 @@ impl DeletionQueue {
 mod test {
    use camino::Utf8Path;
    use hex_literal::hex;
+    use pageserver_api::shard::ShardIndex;
    use std::{io::ErrorKind, time::Duration};
    use tracing::info;

@@ -815,12 +787,12 @@ mod test {
        }

        fn set_latest_generation(&self, gen: Generation) {
-            let tenant_id = self.harness.tenant_id;
+            let tenant_shard_id = self.harness.tenant_shard_id;
            self.mock_control_plane
                .latest_generation
                .lock()
                .unwrap()
-                .insert(tenant_id, gen);
+                .insert(tenant_shard_id, gen);
        }

        /// Returns remote layer file name, suitable for use in assert_remote_files
@@ -829,8 +801,8 @@ mod test {
            file_name: LayerFileName,
            gen: Generation,
        ) -> anyhow::Result<String> {
-            let tenant_id = self.harness.tenant_id;
-            let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+            let tenant_shard_id = self.harness.tenant_shard_id;
+            let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
            let remote_timeline_path = self.remote_fs_dir.join(relative_remote_path.get_path());
            std::fs::create_dir_all(&remote_timeline_path)?;
            let remote_layer_file_name = format!("{}{}", file_name, gen.get_suffix());
@@ -848,7 +820,7 @@ mod test {

    #[derive(Debug, Clone)]
    struct MockControlPlane {
-        pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantId, Generation>>>,
+        pub latest_generation: std::sync::Arc<std::sync::Mutex<HashMap<TenantShardId, Generation>>>,
    }

    impl MockControlPlane {
@@ -862,20 +834,20 @@ mod test {
    #[async_trait::async_trait]
    impl ControlPlaneGenerationsApi for MockControlPlane {
        #[allow(clippy::diverging_sub_expression)] // False positive via async_trait
-        async fn re_attach(&self) -> Result<HashMap<TenantId, Generation>, RetryForeverError> {
+        async fn re_attach(&self) -> Result<HashMap<TenantShardId, Generation>, RetryForeverError> {
            unimplemented!()
        }
        async fn validate(
            &self,
-            tenants: Vec<(TenantId, Generation)>,
-        ) -> Result<HashMap<TenantId, bool>, RetryForeverError> {
+            tenants: Vec<(TenantShardId, Generation)>,
+        ) -> Result<HashMap<TenantShardId, bool>, RetryForeverError> {
            let mut result = HashMap::new();

            let latest_generation = self.latest_generation.lock().unwrap();

-            for (tenant_id, generation) in tenants {
-                if let Some(latest) = latest_generation.get(&tenant_id) {
-                    result.insert(tenant_id, *latest == generation);
+            for (tenant_shard_id, generation) in tenants {
+                if let Some(latest) = latest_generation.get(&tenant_shard_id) {
+                    result.insert(tenant_shard_id, *latest == generation);
                }
            }

@@ -979,10 +951,10 @@ mod test {
        client.recover(HashMap::new())?;

        let layer_file_name_1: LayerFileName = "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap();
-        let tenant_id = ctx.harness.tenant_id;
+        let tenant_shard_id = ctx.harness.tenant_shard_id;

        let content: Vec<u8> = "victim1 contents".into();
-        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
        let deletion_prefix = ctx.harness.conf.deletion_prefix();

@@ -990,6 +962,8 @@ mod test {
        // we delete, and the generation of the running Tenant.
        let layer_generation = Generation::new(0xdeadbeef);
        let now_generation = Generation::new(0xfeedbeef);
+        let layer_metadata =
+            LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

        let remote_layer_file_name_1 =
            format!("{}{}", layer_file_name_1, layer_generation.get_suffix());
@@ -1010,10 +984,10 @@ mod test {
        info!("Pushing");
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                now_generation,
-                [(layer_file_name_1.clone(), layer_generation)].to_vec(),
+                [(layer_file_name_1.clone(), layer_metadata)].to_vec(),
            )
            .await?;
        assert_remote_files(&[&remote_layer_file_name_1], &remote_timeline_path);
@@ -1052,11 +1026,13 @@ mod test {
        let stale_generation = latest_generation.previous();
        // Generation that our example layer file was written with
        let layer_generation = stale_generation.previous();
+        let layer_metadata =
+            LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

        ctx.set_latest_generation(latest_generation);

-        let tenant_id = ctx.harness.tenant_id;
-        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let tenant_shard_id = ctx.harness.tenant_shard_id;
+        let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());

        // Initial state: a remote layer exists
@@ -1066,10 +1042,10 @@ mod test {
        tracing::debug!("Pushing...");
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                stale_generation,
-                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
            )
            .await?;

@@ -1081,10 +1057,10 @@ mod test {
        tracing::debug!("Pushing...");
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                latest_generation,
-                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
            )
            .await?;

@@ -1103,14 +1079,16 @@ mod test {
        let client = ctx.deletion_queue.new_client();
        client.recover(HashMap::new())?;

-        let tenant_id = ctx.harness.tenant_id;
+        let tenant_shard_id = ctx.harness.tenant_shard_id;

-        let relative_remote_path = remote_timeline_path(&tenant_id, &TIMELINE_ID);
+        let relative_remote_path = remote_timeline_path(&tenant_shard_id, &TIMELINE_ID);
        let remote_timeline_path = ctx.remote_fs_dir.join(relative_remote_path.get_path());
        let deletion_prefix = ctx.harness.conf.deletion_prefix();

        let layer_generation = Generation::new(0xdeadbeef);
        let now_generation = Generation::new(0xfeedbeef);
+        let layer_metadata =
+            LayerFileMetadata::new(0xf00, layer_generation, ShardIndex::unsharded());

        // Inject a deletion in the generation before generation_now: after restart,
        // this deletion should _not_ get executed (only the immediately previous
@@ -1119,10 +1097,10 @@ mod test {
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME, layer_generation)?;
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                now_generation.previous(),
-                [(EXAMPLE_LAYER_NAME.clone(), layer_generation)].to_vec(),
+                [(EXAMPLE_LAYER_NAME.clone(), layer_metadata.clone())].to_vec(),
            )
            .await?;

@@ -1133,10 +1111,10 @@ mod test {
            ctx.write_remote_layer(EXAMPLE_LAYER_NAME_ALT, layer_generation)?;
        client
            .push_layers(
-                tenant_id,
+                tenant_shard_id,
                TIMELINE_ID,
                now_generation,
-                [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_generation)].to_vec(),
+                [(EXAMPLE_LAYER_NAME_ALT.clone(), layer_metadata.clone())].to_vec(),
            )
            .await?;

@@ -1164,7 +1142,7 @@ mod test {
        drop(client);
        ctx.restart().await;
        let client = ctx.deletion_queue.new_client();
-        client.recover(HashMap::from([(tenant_id, now_generation)]))?;
+        client.recover(HashMap::from([(tenant_shard_id, now_generation)]))?;

        info!("Flush-executing");
        client.flush_execute().await?;
@@ -1226,12 +1204,13 @@ pub(crate) mod mock {
                match msg {
                    ListWriterQueueMessage::Delete(op) => {
                        let mut objects = op.objects;
-                        for (layer, generation) in op.layers {
+                        for (layer, meta) in op.layers {
                            objects.push(remote_layer_path(
-                                &op.tenant_id,
+                                &op.tenant_shard_id.tenant_id,
                                &op.timeline_id,
+                                meta.shard,
                                &layer,
-                                generation,
+                                meta.generation,
                            ));
                        }

@@ -1311,4 +1290,34 @@ pub(crate) mod mock {
            }
        }
    }
+
+    /// Test round-trip serialization/deserialization, and test stability of the format
+    /// vs. a static expected string for the serialized version.
+    #[test]
+    fn deletion_list_serialization() -> anyhow::Result<()> {
+        let tenant_id = "ad6c1a56f5680419d3a16ff55d97ec3c"
+            .to_string()
+            .parse::<TenantShardId>()?;
+        let timeline_id = "be322c834ed9e709e63b5c9698691910"
+            .to_string()
+            .parse::<TimelineId>()?;
+        let generation = Generation::new(123);
+
+        let object =
+            RemotePath::from_string(&format!("tenants/{tenant_id}/timelines/{timeline_id}/foo"))?;
+        let mut objects = [object].to_vec();
+
+        let mut example = DeletionList::new(1);
+        example.push(&tenant_id, &timeline_id, generation, &mut objects);
+
+        let encoded = serde_json::to_string(&example)?;
+
+        let expected = "{\"version\":1,\"sequence\":1,\"tenants\":{\"ad6c1a56f5680419d3a16ff55d97ec3c\":{\"timelines\":{\"be322c834ed9e709e63b5c9698691910\":[\"foo\"]},\"generation\":123}},\"size\":1}".to_string();
+        assert_eq!(encoded, expected);
+
+        let decoded = serde_json::from_str::<DeletionList>(&encoded)?;
+        assert_eq!(example, decoded);
+
+        Ok(())
+    }
 }
--- a/pageserver/src/deletion_queue/list_writer.rs
+++ b/pageserver/src/deletion_queue/list_writer.rs
@@ -19,6 +19,7 @@ use std::collections::HashMap;
 use std::fs::create_dir_all;
 use std::time::Duration;

+use pageserver_api::shard::TenantShardId;
 use regex::Regex;
 use remote_storage::RemotePath;
 use tokio_util::sync::CancellationToken;
@@ -26,13 +27,13 @@ use tracing::debug;
 use tracing::info;
 use tracing::warn;
 use utils::generation::Generation;
-use utils::id::TenantId;
 use utils::id::TimelineId;

 use crate::config::PageServerConf;
 use crate::deletion_queue::TEMP_SUFFIX;
 use crate::metrics;
 use crate::tenant::remote_timeline_client::remote_layer_path;
+use crate::tenant::remote_timeline_client::LayerFileMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::virtual_file::on_fatal_io_error;
 use crate::virtual_file::MaybeFatalIo;
@@ -53,22 +54,22 @@ const FRONTEND_FLUSHING_TIMEOUT: Duration = Duration::from_millis(100);

 #[derive(Debug)]
 pub(super) struct DeletionOp {
-    pub(super) tenant_id: TenantId,
+    pub(super) tenant_shard_id: TenantShardId,
    pub(super) timeline_id: TimelineId,
    // `layers` and `objects` are both just lists of objects.  `layers` is used if you do not
    // have a config object handy to project it to a remote key, and need the consuming worker
    // to do it for you.
-    pub(super) layers: Vec<(LayerFileName, Generation)>,
+    pub(super) layers: Vec<(LayerFileName, LayerFileMetadata)>,
    pub(super) objects: Vec<RemotePath>,

-    /// The _current_ generation of the Tenant attachment in which we are enqueuing
+    /// The _current_ generation of the Tenant shard attachment in which we are enqueuing
    /// this deletion.
    pub(super) generation: Generation,
 }

 #[derive(Debug)]
 pub(super) struct RecoverOp {
-    pub(super) attached_tenants: HashMap<TenantId, Generation>,
+    pub(super) attached_tenants: HashMap<TenantShardId, Generation>,
 }

 #[derive(Debug)]
@@ -205,7 +206,7 @@ impl ListWriter {

    async fn recover(
        &mut self,
-        attached_tenants: HashMap<TenantId, Generation>,
+        attached_tenants: HashMap<TenantShardId, Generation>,
    ) -> Result<(), anyhow::Error> {
        debug!(
            "recovering with {} attached tenants",
@@ -308,8 +309,8 @@ impl ListWriter {
                // generation was issued to another node in the interval while we restarted,
                // then we may treat deletion lists from the previous generation as if they
                // belong to our currently attached generation, and proceed to validate & execute.
-                for (tenant_id, tenant_list) in &mut deletion_list.tenants {
-                    if let Some(attached_gen) = attached_tenants.get(tenant_id) {
+                for (tenant_shard_id, tenant_list) in &mut deletion_list.tenants {
+                    if let Some(attached_gen) = attached_tenants.get(tenant_shard_id) {
                        if attached_gen.previous() == tenant_list.generation {
                            tenant_list.generation = *attached_gen;
                        }
@@ -387,25 +388,26 @@ impl ListWriter {
                    );

                    let mut layer_paths = Vec::new();
-                    for (layer, generation) in op.layers {
+                    for (layer, meta) in op.layers {
                        layer_paths.push(remote_layer_path(
-                            &op.tenant_id,
+                            &op.tenant_shard_id.tenant_id,
                            &op.timeline_id,
+                            meta.shard,
                            &layer,
-                            generation,
+                            meta.generation,
                        ));
                    }
                    layer_paths.extend(op.objects);

                    if !self.pending.push(
-                        &op.tenant_id,
+                        &op.tenant_shard_id,
                        &op.timeline_id,
                        op.generation,
                        &mut layer_paths,
                    ) {
                        self.flush().await;
                        let retry_succeeded = self.pending.push(
-                            &op.tenant_id,
+                            &op.tenant_shard_id,
                            &op.timeline_id,
                            op.generation,
                            &mut layer_paths,
--- a/pageserver/src/deletion_queue/validator.rs
+++ b/pageserver/src/deletion_queue/validator.rs
@@ -178,7 +178,14 @@ where
                .unwrap_or(false);

            if valid && *validated_generation == tenant_lsn_state.generation {
-                for (_timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+                for (timeline_id, pending_lsn) in tenant_lsn_state.timelines {
+                    tracing::debug!(
+                        %tenant_id,
+                        %timeline_id,
+                        current = %pending_lsn.result_slot.load(),
+                        projected = %pending_lsn.projected,
+                        "advancing validated remote_consistent_lsn",
+                    );
                    pending_lsn.result_slot.store(pending_lsn.projected);
                }
            } else {
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -310,7 +310,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                .unwrap()
                .as_micros(),
            partition,
-            desc.tenant_id,
+            desc.tenant_shard_id,
            desc.timeline_id,
            candidate.layer,
        );
@@ -380,7 +380,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
    let limit = Arc::new(tokio::sync::Semaphore::new(1000.max(max_batch_size)));

    for (timeline, batch) in batched {
-        let tenant_id = timeline.tenant_id;
+        let tenant_shard_id = timeline.tenant_shard_id;
        let timeline_id = timeline.timeline_id;
        let batch_size =
            u32::try_from(batch.len()).expect("batch size limited to u32::MAX during partitioning");
@@ -431,7 +431,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                (evicted_bytes, evictions_failed)
            }
        }
-        .instrument(tracing::info_span!("evict_batch", %tenant_id, %timeline_id, batch_size));
+        .instrument(tracing::info_span!("evict_batch", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, batch_size));

        js.spawn(evict);

@@ -572,7 +572,7 @@ async fn collect_eviction_candidates(
                continue;
            }
            let info = tl.get_local_layers_for_disk_usage_eviction().await;
-            debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
+            debug!(tenant_id=%tl.tenant_shard_id.tenant_id, shard_id=%tl.tenant_shard_id.shard_slug(), timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len());
            tenant_candidates.extend(
                info.resident_layers
                    .into_iter()
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -624,6 +624,99 @@ paths:
                $ref: "#/components/schemas/ServiceUnavailableError"


+  /v1/tenant/{tenant_id}/location_config:
+    parameters:
+      - name: tenant_id
+        in: path
+        required: true
+        schema:
+          type: string
+          format: hex
+      - name: flush_ms
+        in: query
+        required: false
+        schema:
+          type: integer
+    put:
+      description: |
+        Configures a _tenant location_, that is how a particular pageserver handles
+        a particular tenant.  This includes _attached_ tenants, i.e. those ingesting WAL
+        and page service requests, and _secondary_ tenants, i.e. those which are just keeping
+        a warm cache in anticipation of transitioning to attached state in the future.
+
+        This is a declarative, idempotent API: there are not separate endpoints
+        for different tenant location configurations.  Rather, this single endpoint accepts
+        a description of the desired location configuration, and makes whatever changes
+        are required to reach that state.
+
+        In imperative terms, this API is used to attach and detach tenants, and
+        to transition tenants to and from secondary mode.
+
+        This is a synchronous API: there is no 202 response.  State transitions should always
+        be fast (milliseconds), with the exception of requests setting `flush_ms`, in which case
+        the caller controls the runtime of the request.
+
+        In some state transitions, it makes sense to flush dirty data to remote storage: this includes transitions
+        to AttachedStale and Detached.  Flushing is never necessary for correctness, but is an
+        important optimization when doing migrations.  The `flush_ms` parameter controls whether
+        flushing should be attempted, and how much time is allowed for flushing.  If the time limit expires,
+        the requested transition will continue without waiting for any outstanding data to flush.  Callers
+        should use a duration which is substantially less than their HTTP client's request
+        timeout.  It is safe to supply flush_ms irrespective of the request body: in state transitions
+        where flushing doesn't make sense, the server will ignore it.
+
+        It is safe to retry requests, but if one receives a 409 or 503 response, it is not
+        useful to retry aggressively: there is probably an existing request still ongoing.
+      requestBody:
+        required: false
+        content:
+          application/json:
+            schema:
+              $ref: "#/components/schemas/TenantLocationConfigRequest"
+      responses:
+        "200":
+          description: Tenant is now in requested state
+        "503":
+          description: Tenant's state cannot be changed right now.  Wait a few seconds and retry.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+        "401":
+          description: Unauthorized Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/UnauthorizedError"
+        "403":
+          description: Forbidden Error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ForbiddenError"
+        "409":
+          description: |
+            The tenant is already known to Pageserver in some way,
+            and hence this `/attach` call has been rejected.
+
+            Some examples of how this can happen:
+            - tenant was created on this pageserver
+            - tenant attachment was started by an earlier call to `/attach`.
+
+            Callers should poll the tenant status's `attachment_status` field,
+            like for status 202. See the longer description for `POST /attach`
+            for details.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/ConflictError"
+        "500":
+          description: Generic operation error
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/Error"
+
  /v1/tenant/{tenant_id}/detach:
    parameters:
      - name: tenant_id
@@ -935,6 +1028,9 @@ paths:
                  format: hex
                pg_version:
                  type: integer
+                existing_initdb_timeline_id:
+                  type: string
+                  format: hex
      responses:
        "201":
          description: TimelineInfo
@@ -1274,6 +1370,31 @@ components:
            tenant_id:
              type: string
              format: hex
+    TenantLocationConfigRequest:
+      type: object
+      required:
+        - tenant_id
+      properties:
+        tenant_id:
+          type: string
+          format: hex
+        mode:
+          type: string
+          enum: ["AttachedSingle", "AttachedMulti", "AttachedStale", "Secondary", "Detached"]
+          description: Mode of functionality that this pageserver will run in for this tenant.
+        generation:
+          type: integer
+          description: Attachment generation number, mandatory when `mode` is an attached state
+        secondary_conf:
+          $ref: '#/components/schemas/SecondaryConfig'
+        tenant_conf:
+          $ref: '#/components/schemas/TenantConfig'
+    SecondaryConfig:
+      type: object
+      properties:
+        warm:
+          type: boolean
+          description: Whether to poll remote storage for layers to download.  If false, secondary locations don't download anything.
    TenantConfig:
      type: object
      properties:
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -4,6 +4,7 @@
 use std::collections::HashMap;
 use std::str::FromStr;
 use std::sync::Arc;
+use std::time::Duration;

 use anyhow::{anyhow, Context, Result};
 use enumset::EnumSet;
@@ -337,13 +338,8 @@ async fn build_timeline_info_common(
        Lsn(0) => None,
        lsn @ Lsn(_) => Some(lsn),
    };
-    let current_logical_size = match timeline.get_current_logical_size(ctx) {
-        Ok((size, _)) => Some(size),
-        Err(err) => {
-            error!("Timeline info creation failed to get current logical size: {err:?}");
-            None
-        }
-    };
+    let current_logical_size =
+        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn_projected = timeline
@@ -356,7 +352,8 @@ async fn build_timeline_info_common(
    let walreceiver_status = timeline.walreceiver_status();

    let info = TimelineInfo {
-        tenant_id: timeline.tenant_id,
+        // TODO(sharding): add a shard_id field, or make tenant_id into a tenant_shard_id
+        tenant_id: timeline.tenant_shard_id.tenant_id,
        timeline_id: timeline.timeline_id,
        ancestor_timeline_id,
        ancestor_lsn,
@@ -366,7 +363,11 @@ async fn build_timeline_info_common(
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
-        current_logical_size,
+        current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
+        current_logical_size_is_accurate: match current_logical_size.accuracy() {
+            tenant::timeline::logical_size::Accuracy::Approximate => false,
+            tenant::timeline::logical_size::Accuracy::Exact => true,
+        },
        current_physical_size,
        current_logical_size_non_incremental: None,
        timeline_dir_layer_file_size_sum: None,
@@ -439,6 +440,7 @@ async fn timeline_create_handler(
            request_data.ancestor_timeline_id.map(TimelineId::from),
            request_data.ancestor_start_lsn,
            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+            request_data.existing_initdb_timeline_id,
            state.broker_client.clone(),
            &ctx,
        )
@@ -707,6 +709,26 @@ async fn tenant_detach_handler(
    json_response(StatusCode::OK, ())
 }

+async fn tenant_reset_handler(
+    request: Request<Body>,
+    _cancel: CancellationToken,
+) -> Result<Response<Body>, ApiError> {
+    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
+    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
+
+    let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
+
+    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
+    let state = get_state(&request);
+    state
+        .tenant_manager
+        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
+        .await
+        .map_err(ApiError::InternalServerError)?;
+
+    json_response(StatusCode::OK, ())
+}
+
 async fn tenant_load_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -1157,6 +1179,7 @@ async fn put_tenant_location_config_handler(
    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;

    let request_data: TenantLocationConfigRequest = json_request(&mut request).await?;
+    let flush = parse_query_param(&request, "flush_ms")?.map(Duration::from_millis);
    check_permission(&request, Some(tenant_shard_id.tenant_id))?;

    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
@@ -1189,7 +1212,7 @@ async fn put_tenant_location_config_handler(

    state
        .tenant_manager
-        .upsert_location(tenant_shard_id, location_conf, &ctx)
+        .upsert_location(tenant_shard_id, location_conf, flush, &ctx)
        .await
        // TODO: badrequest assumes the caller was asking for something unreasonable, but in
        // principle we might have hit something like concurrent API calls to the same tenant,
@@ -1825,6 +1848,9 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/detach", |r| {
            api_handler(r, tenant_detach_handler)
        })
+        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
+            api_handler(r, tenant_reset_handler)
+        })
        .post("/v1/tenant/:tenant_id/load", |r| {
            api_handler(r, tenant_load_handler)
        })
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -7,12 +7,13 @@ use std::pin::Pin;
 use std::task::{self, Poll};

 use anyhow::{bail, ensure, Context, Result};
+use async_compression::tokio::bufread::ZstdDecoder;
 use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
 use nix::NixPath;
-use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
+use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
 use tokio_tar::Archive;
 use tokio_tar::Builder;
 use tokio_tar::HeaderMode;
@@ -732,3 +733,13 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
    }
    Ok(compressed.buf)
 }
+
+pub async fn extract_tar_zst(
+    pgdata_path: &Utf8Path,
+    tar_zst: impl AsyncBufRead + Unpin,
+) -> Result<()> {
+    let tar = Box::pin(ZstdDecoder::new(tar_zst));
+    let mut archive = Archive::new(tar);
+    archive.unpack(pgdata_path).await?;
+    Ok(())
+}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -186,13 +186,6 @@ pub struct InitializationOrder {
    /// Each initial tenant load task carries this until completion.
    pub initial_tenant_load: Option<utils::completion::Completion>,

-    /// Barrier for when we can start initial logical size calculations.
-    pub initial_logical_size_can_start: utils::completion::Barrier,
-
-    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
-    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: Option<utils::completion::Completion>,
-
    /// Barrier for when we can start any background jobs.
    ///
    /// This can be broken up later on, but right now there is just one class of a background job.
@@ -212,7 +205,7 @@ async fn timed<Fut: std::future::Future>(
    match tokio::time::timeout(warn_at, &mut fut).await {
        Ok(ret) => {
            tracing::info!(
-                task = name,
+                stage = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed"
            );
@@ -220,7 +213,7 @@ async fn timed<Fut: std::future::Future>(
        }
        Err(_) => {
            tracing::info!(
-                task = name,
+                stage = name,
                elapsed_ms = started.elapsed().as_millis(),
                "still waiting, taking longer than expected..."
            );
@@ -229,7 +222,7 @@ async fn timed<Fut: std::future::Future>(

            // this has a global allowed_errors
            tracing::warn!(
-                task = name,
+                stage = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed, took longer than expected"
            );
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -7,6 +7,7 @@ use metrics::{
    HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
+use pageserver_api::shard::TenantShardId;
 use strum::{EnumCount, IntoEnumIterator, VariantNames};
 use strum_macros::{EnumVariantNames, IntoStaticStr};
 use utils::id::{TenantId, TimelineId};
@@ -314,6 +315,7 @@ static PAGE_CACHE_ERRORS: Lazy<IntCounterVec> = Lazy::new(|| {
 #[strum(serialize_all = "kebab_case")]
 pub(crate) enum PageCacheErrorKind {
    AcquirePinnedSlotTimeout,
+    EvictIterLimit,
 }

 pub(crate) fn page_cache_errors_inc(error_kind: PageCacheErrorKind) {
@@ -401,6 +403,129 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define current logical size metric")
 });

+pub(crate) mod initial_logical_size {
+    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
+    use once_cell::sync::Lazy;
+
+    pub(crate) struct StartCalculation(IntCounterVec);
+    pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
+        StartCalculation(
+            register_int_counter_vec!(
+                "pageserver_initial_logical_size_start_calculation",
+                "Incremented each time we start an initial logical size calculation attempt. \
+                 The `circumstances` label provides some additional details.",
+                &["attempt", "circumstances"]
+            )
+            .unwrap(),
+        )
+    });
+
+    struct DropCalculation {
+        first: IntCounter,
+        retry: IntCounter,
+    }
+
+    static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
+        let vec = register_int_counter_vec!(
+            "pageserver_initial_logical_size_drop_calculation",
+            "Incremented each time we abort a started size calculation attmpt.",
+            &["attempt"]
+        )
+        .unwrap();
+        DropCalculation {
+            first: vec.with_label_values(&["first"]),
+            retry: vec.with_label_values(&["retry"]),
+        }
+    });
+
+    pub(crate) struct Calculated {
+        pub(crate) births: IntCounter,
+        pub(crate) deaths: IntCounter,
+    }
+
+    pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
+        births: register_int_counter!(
+            "pageserver_initial_logical_size_finish_calculation",
+            "Incremented every time we finish calculation of initial logical size.\
+             If everything is working well, this should happen at most once per Timeline object."
+        )
+        .unwrap(),
+        deaths: register_int_counter!(
+            "pageserver_initial_logical_size_drop_finished_calculation",
+            "Incremented when we drop a finished initial logical size calculation result.\
+             Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
+        )
+        .unwrap(),
+    });
+
+    pub(crate) struct OngoingCalculationGuard {
+        inc_drop_calculation: Option<IntCounter>,
+    }
+
+    #[derive(strum_macros::IntoStaticStr)]
+    pub(crate) enum StartCircumstances {
+        EmptyInitial,
+        SkippedConcurrencyLimiter,
+        AfterBackgroundTasksRateLimit,
+    }
+
+    impl StartCalculation {
+        pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
+            let circumstances_label: &'static str = circumstances.into();
+            self.0.with_label_values(&["first", circumstances_label]);
+            OngoingCalculationGuard {
+                inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
+            }
+        }
+        pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
+            let circumstances_label: &'static str = circumstances.into();
+            self.0.with_label_values(&["retry", circumstances_label]);
+            OngoingCalculationGuard {
+                inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
+            }
+        }
+    }
+
+    impl Drop for OngoingCalculationGuard {
+        fn drop(&mut self) {
+            if let Some(counter) = self.inc_drop_calculation.take() {
+                counter.inc();
+            }
+        }
+    }
+
+    impl OngoingCalculationGuard {
+        pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
+            drop(self.inc_drop_calculation.take());
+            CALCULATED.births.inc();
+            FinishedCalculationGuard {
+                inc_on_drop: CALCULATED.deaths.clone(),
+            }
+        }
+    }
+
+    pub(crate) struct FinishedCalculationGuard {
+        inc_on_drop: IntCounter,
+    }
+
+    impl Drop for FinishedCalculationGuard {
+        fn drop(&mut self) {
+            self.inc_on_drop.inc();
+        }
+    }
+
+    // context: https://github.com/neondatabase/neon/issues/5963
+    pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
+        Lazy::new(|| {
+            register_int_counter!(
+                "pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
+                "Counter for the following event: walreceiver calls\
+                 Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
+            )
+            .unwrap()
+        });
+}
+
 pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_states_count",
@@ -1251,9 +1376,20 @@ pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy<IntCounter> = Lazy::new(|| {
    .unwrap()
 });

+pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "pageserver_wal_redo_process_launch_duration",
+        "Histogram of the duration of successful WalRedoProcess::launch calls",
+        redo_histogram_time_buckets!(),
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) struct WalRedoProcessCounters {
    pub(crate) started: IntCounter,
    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
+    pub(crate) active_stderr_logger_tasks_started: IntCounter,
+    pub(crate) active_stderr_logger_tasks_finished: IntCounter,
 }

 #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
@@ -1277,6 +1413,19 @@ impl Default for WalRedoProcessCounters {
            &["cause"],
        )
        .unwrap();
+
+        let active_stderr_logger_tasks_started = register_int_counter!(
+            "pageserver_walredo_stderr_logger_tasks_started_total",
+            "Number of active walredo stderr logger tasks that have started",
+        )
+        .unwrap();
+
+        let active_stderr_logger_tasks_finished = register_int_counter!(
+            "pageserver_walredo_stderr_logger_tasks_finished_total",
+            "Number of active walredo stderr logger tasks that have finished",
+        )
+        .unwrap();
+
        Self {
            started,
            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
@@ -1284,6 +1433,8 @@ impl Default for WalRedoProcessCounters {
                let cause_str: &'static str = cause.into();
                killed.with_label_values(&[cause_str])
            })),
+            active_stderr_logger_tasks_started,
+            active_stderr_logger_tasks_finished,
        }
    }
 }
@@ -1570,9 +1721,9 @@ pub struct RemoteTimelineClientMetrics {
 }

 impl RemoteTimelineClientMetrics {
-    pub fn new(tenant_id: &TenantId, timeline_id: &TimelineId) -> Self {
+    pub fn new(tenant_shard_id: &TenantShardId, timeline_id: &TimelineId) -> Self {
        RemoteTimelineClientMetrics {
-            tenant_id: tenant_id.to_string(),
+            tenant_id: tenant_shard_id.tenant_id.to_string(),
            timeline_id: timeline_id.to_string(),
            calls_unfinished_gauge: Mutex::new(HashMap::default()),
            bytes_started_counter: Mutex::new(HashMap::default()),
@@ -1960,6 +2111,7 @@ pub fn preinitialize_metrics() {
        &WAL_REDO_TIME,
        &WAL_REDO_RECORDS_HISTOGRAM,
        &WAL_REDO_BYTES_HISTOGRAM,
+        &WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM,
    ]
    .into_iter()
    .for_each(|h| {
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -83,7 +83,6 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
-use tracing::instrument;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
@@ -253,9 +252,6 @@ pub struct PageCache {
    next_evict_slot: AtomicUsize,

    size_metrics: &'static PageCacheSizeMetrics,
-
-    find_victim_waiters:
-        nostarve_queue::Queue<(usize, tokio::sync::RwLockWriteGuard<'static, SlotInner>)>,
 }

 struct PinnedSlotsPermit(tokio::sync::OwnedSemaphorePermit);
@@ -434,9 +430,8 @@ impl PageCache {
    ///
    /// Store an image of the given page in the cache.
    ///
-    #[cfg_attr(test, instrument(skip_all, level = "trace", fields(%key, %lsn)))]
    pub async fn memorize_materialized_page(
-        &'static self,
+        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
        key: Key,
@@ -527,9 +522,8 @@ impl PageCache {

    // Section 1.2: Public interface functions for working with immutable file pages.

-    #[cfg_attr(test, instrument(skip_all, level = "trace", fields(?file_id, ?blkno)))]
    pub async fn read_immutable_buf(
-        &'static self,
+        &self,
        file_id: FileId,
        blkno: u32,
        ctx: &RequestContext,
@@ -635,7 +629,7 @@ impl PageCache {
    /// ```
    ///
    async fn lock_for_read(
-        &'static self,
+        &self,
        cache_key: &mut CacheKey,
        ctx: &RequestContext,
    ) -> anyhow::Result<ReadBufResult> {
@@ -857,15 +851,10 @@ impl PageCache {
    ///
    /// On return, the slot is empty and write-locked.
    async fn find_victim(
-        &'static self,
+        &self,
        _permit_witness: &PinnedSlotsPermit,
    ) -> anyhow::Result<(usize, tokio::sync::RwLockWriteGuard<SlotInner>)> {
-        let nostarve_position = self.find_victim_waiters.begin()
-            .expect("we initialize the nostarve queue to the same size as the slots semaphore, and the caller is presenting a permit");
-
-        let span = tracing::info_span!("find_victim", ?nostarve_position);
-        let _enter = span.enter();
-
+        let iter_limit = self.slots.len() * 10;
        let mut iters = 0;
        loop {
            iters += 1;
@@ -877,8 +866,41 @@ impl PageCache {
                let mut inner = match slot.inner.try_write() {
                    Ok(inner) => inner,
                    Err(_err) => {
-                        if iters > self.slots.len() * (MAX_USAGE_COUNT as usize) {
-                            unreachable!("find_victim_waiters prevents starvation");
+                        if iters > iter_limit {
+                            // NB: Even with the permits, there's no hard guarantee that we will find a slot with
+                            // any particular number of iterations: other threads might race ahead and acquire and
+                            // release pins just as we're scanning the array.
+                            //
+                            // Imagine that nslots is 2, and as starting point, usage_count==1 on all
+                            // slots. There are two threads running concurrently, A and B. A has just
+                            // acquired the permit from the semaphore.
+                            //
+                            //   A: Look at slot 1. Its usage_count == 1, so decrement it to zero, and continue the search
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2, decrement its usage_count to zero and continue the search
+                            //   B: Look at slot 1. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //   B: Acquire permit.
+                            //   B: Look at slot 2. Its usage_count is zero, so pin it and bump up its usage_count to 1.
+                            //   B: Release pin and permit again
+                            //
+                            // Now we're back in the starting situation that both slots have
+                            // usage_count 1, but A has now been through one iteration of the
+                            // find_victim() loop. This can repeat indefinitely and on each
+                            // iteration, A's iteration count increases by one.
+                            //
+                            // So, even though the semaphore for the permits is fair, the victim search
+                            // itself happens in parallel and is not fair.
+                            // Hence even with a permit, a task can theoretically be starved.
+                            // To avoid this, we'd need tokio to give priority to tasks that are holding
+                            // permits for longer.
+                            // Note that just yielding to tokio during iteration without such
+                            // priority boosting is likely counter-productive. We'd just give more opportunities
+                            // for B to bump usage count, further starving A.
+                            crate::metrics::page_cache_errors_inc(
+                                crate::metrics::PageCacheErrorKind::EvictIterLimit,
+                            );
+                            anyhow::bail!("exceeded evict iter limit");
                        }
                        continue;
                    }
@@ -889,8 +911,7 @@ impl PageCache {
                    inner.key = None;
                }
                crate::metrics::PAGE_CACHE_FIND_VICTIMS_ITERS_TOTAL.inc_by(iters as u64);
-
-                return Ok(nostarve_position.complete_and_wait((slot_idx, inner)).await);
+                return Ok((slot_idx, inner));
            }
        }
    }
@@ -934,7 +955,6 @@ impl PageCache {
            next_evict_slot: AtomicUsize::new(0),
            size_metrics,
            pinned_slots: Arc::new(tokio::sync::Semaphore::new(num_pages)),
-            find_victim_waiters: ::nostarve_queue::Queue::new(num_pages),
        }
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -53,12 +53,14 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
+use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
+use crate::tenant::mgr::ShardSelector;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

@@ -399,18 +401,25 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Make request tracer if needed
+        // Note that since one connection may contain getpage requests that target different
+        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
+        // that we look up here may not be the one that serves all the actual requests: we will double
+        // check the mapping of key->shard later before calling into Timeline for getpage requests.
        let tenant = mgr::get_active_tenant_with_timeout(
            tenant_id,
+            ShardSelector::First,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
        .await?;
+
+        // Make request tracer if needed
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
-            let path = tenant
-                .conf
-                .trace_path(&tenant_id, &timeline_id, &connection_id);
+            let path =
+                tenant
+                    .conf
+                    .trace_path(&tenant.tenant_shard_id(), &timeline_id, &connection_id);
            Some(Tracer::new(path))
        } else {
            None
@@ -562,6 +571,7 @@ impl PageServerHandler {
        info!("creating new timeline");
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
+            ShardSelector::Zero,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -624,7 +634,7 @@ impl PageServerHandler {
        debug_assert_current_span_has_tenant_and_timeline_id();

        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
@@ -803,9 +813,49 @@ impl PageServerHandler {
        }
        */

-        let page = timeline
-            .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-            .await?;
+        let key = rel_block_to_key(req.rel, req.blkno);
+        let page = if timeline.get_shard_identity().is_key_local(&key) {
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        } else {
+            // The Tenant shard we looked up at connection start does not hold this particular
+            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
+            // has multiple shards for the same tenant.
+            //
+            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
+            let timeline = match self
+                .get_active_tenant_timeline(
+                    timeline.tenant_shard_id.tenant_id,
+                    timeline.timeline_id,
+                    ShardSelector::Page(key),
+                )
+                .await
+            {
+                Ok(t) => t,
+                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
+                    // We already know this tenant exists in general, because we resolved it at
+                    // start of connection.  Getting a NotFound here indicates that the shard containing
+                    // the requested page is not present on this node.
+
+                    // TODO: this should be some kind of structured error that the client will understand,
+                    // so that it can block until its config is updated: this error is expected in the case
+                    // that the Tenant's shards' placements are being updated and the client hasn't been
+                    // informed yet.
+                    //
+                    // https://github.com/neondatabase/neon/issues/6038
+                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
+                }
+                Err(e) => return Err(e.into()),
+            };
+
+            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
+            // the GateGuard was already held over the whole connection.
+            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
+            timeline
+                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+                .await?
+        };

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
@@ -834,7 +884,7 @@ impl PageServerHandler {

        // check that the timeline exists
        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id)
+            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
            .await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
@@ -940,9 +990,11 @@ impl PageServerHandler {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
+        selector: ShardSelector,
    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
+            selector,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -1116,7 +1168,7 @@ where

            self.check_permission(Some(tenant_id))?;
            let timeline = self
-                .get_active_tenant_timeline(tenant_id, timeline_id)
+                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
                .await?;

            let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1303,6 +1355,7 @@ where

            let tenant = get_active_tenant_with_timeout(
                tenant_id,
+                ShardSelector::Zero,
                ACTIVE_TENANT_TIMEOUT,
                &task_mgr::shutdown_token(),
            )
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,6 +13,7 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Buf, Bytes};
+use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -282,6 +283,10 @@ impl Timeline {
    }

    /// Get a list of all existing relations in given tablespace and database.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
    pub async fn list_rels(
        &self,
        spcnode: Oid,
@@ -630,6 +635,10 @@ impl Timeline {
    ///
    /// Only relation blocks are counted currently. That excludes metadata,
    /// SLRUs, twophase files etc.
+    ///
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
@@ -1314,7 +1323,7 @@ impl<'a> DatadirModification<'a> {
        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::new();
        for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(key) || is_slru_block_key(key) {
+            if is_rel_block_key(&key) || is_slru_block_key(key) {
                // This bails out on first error without modifying pending_updates.
                // That's Ok, cf this function's doc comment.
                writer.put(key, self.lsn, &value, ctx).await?;
@@ -1570,7 +1579,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    }
 }

-fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
@@ -1769,10 +1778,6 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    })
 }

-fn is_rel_block_key(key: Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
-}
-
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }
--- a/pageserver/src/repository.rs
+++ b/pageserver/src/repository.rs
@@ -138,6 +138,14 @@ pub struct GcResult {

    #[serde(serialize_with = "serialize_duration_as_millis")]
    pub elapsed: Duration,
+
+    /// The layers which were garbage collected.
+    ///
+    /// Used in `/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc` to wait for the layers to be
+    /// dropped in tests.
+    #[cfg(feature = "testing")]
+    #[serde(skip)]
+    pub(crate) doomed_layers: Vec<crate::tenant::storage_layer::Layer>,
 }

 // helper function for `GcResult`, serializing a `Duration` as an integer number of milliseconds
@@ -158,5 +166,11 @@ impl AddAssign for GcResult {
        self.layers_removed += other.layers_removed;

        self.elapsed += other.elapsed;
+
+        #[cfg(feature = "testing")]
+        {
+            let mut other = other;
+            self.doomed_layers.append(&mut other.doomed_layers);
+        }
    }
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -8,9 +8,12 @@
 //! We cannot use global or default config instead, because wrong settings
 //! may lead to a data loss.
 //!
-use anyhow::Context;
+use anyhow::bail;
 use pageserver_api::models;
+use pageserver_api::shard::{ShardCount, ShardIdentity, ShardNumber, ShardStripeSize};
+use serde::de::IntoDeserializer;
 use serde::{Deserialize, Serialize};
+use serde_json::Value;
 use std::num::NonZeroU64;
 use std::time::Duration;
 use utils::generation::Generation;
@@ -88,6 +91,14 @@ pub(crate) struct LocationConf {
    /// The location-specific part of the configuration, describes the operating
    /// mode of this pageserver for this tenant.
    pub(crate) mode: LocationMode,
+
+    /// The detailed shard identity.  This structure is already scoped within
+    /// a TenantShardId, but we need the full ShardIdentity to enable calculating
+    /// key->shard mappings.
+    #[serde(default = "ShardIdentity::unsharded")]
+    #[serde(skip_serializing_if = "ShardIdentity::is_unsharded")]
+    pub(crate) shard: ShardIdentity,
+
    /// The pan-cluster tenant configuration, the same on all locations
    pub(crate) tenant_conf: TenantConfOpt,
 }
@@ -160,6 +171,8 @@ impl LocationConf {
                generation,
                attach_mode: AttachmentMode::Single,
            }),
+            // Legacy configuration loads are always from tenants created before sharding existed.
+            shard: ShardIdentity::unsharded(),
            tenant_conf,
        }
    }
@@ -187,6 +200,7 @@ impl LocationConf {

        fn get_generation(conf: &'_ models::LocationConfig) -> Result<Generation, anyhow::Error> {
            conf.generation
+                .map(Generation::new)
                .ok_or_else(|| anyhow::anyhow!("Generation must be set when attaching"))
        }

@@ -226,7 +240,21 @@ impl LocationConf {
            }
        };

-        Ok(Self { mode, tenant_conf })
+        let shard = if conf.shard_count == 0 {
+            ShardIdentity::unsharded()
+        } else {
+            ShardIdentity::new(
+                ShardNumber(conf.shard_number),
+                ShardCount(conf.shard_count),
+                ShardStripeSize(conf.shard_stripe_size),
+            )?
+        };
+
+        Ok(Self {
+            shard,
+            mode,
+            tenant_conf,
+        })
    }
 }

@@ -241,6 +269,7 @@ impl Default for LocationConf {
                attach_mode: AttachmentMode::Single,
            }),
            tenant_conf: TenantConfOpt::default(),
+            shard: ShardIdentity::unsharded(),
        }
    }
 }
@@ -494,105 +523,49 @@ impl Default for TenantConf {
    }
 }

-// Helper function to standardize the error messages we produce on bad durations
-//
-// Intended to be used with anyhow's `with_context`, e.g.:
-//
-//   let value = result.with_context(bad_duration("name", &value))?;
-//
-fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn() -> String {
-    move || format!("Cannot parse `{field_name}` duration {value:?}")
-}
-
 impl TryFrom<&'_ models::TenantConfig> for TenantConfOpt {
    type Error = anyhow::Error;

    fn try_from(request_data: &'_ models::TenantConfig) -> Result<Self, Self::Error> {
-        let mut tenant_conf = TenantConfOpt::default();
+        // Convert the request_data to a JSON Value
+        let json_value: Value = serde_json::to_value(request_data)?;

-        if let Some(gc_period) = &request_data.gc_period {
-            tenant_conf.gc_period = Some(
-                humantime::parse_duration(gc_period)
-                    .with_context(bad_duration("gc_period", gc_period))?,
-            );
-        }
-        tenant_conf.gc_horizon = request_data.gc_horizon;
-        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
+        // Create a Deserializer from the JSON Value
+        let deserializer = json_value.into_deserializer();

-        if let Some(pitr_interval) = &request_data.pitr_interval {
-            tenant_conf.pitr_interval = Some(
-                humantime::parse_duration(pitr_interval)
-                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
-            );
-        }
-
-        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
-            tenant_conf.walreceiver_connect_timeout = Some(
-                humantime::parse_duration(walreceiver_connect_timeout).with_context(
-                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
-                )?,
-            );
-        }
-        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
-            tenant_conf.lagging_wal_timeout = Some(
-                humantime::parse_duration(lagging_wal_timeout)
-                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
-            );
-        }
-        if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
-            tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
-        }
-        if let Some(trace_read_requests) = request_data.trace_read_requests {
-            tenant_conf.trace_read_requests = Some(trace_read_requests);
-        }
-
-        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
-            tenant_conf.checkpoint_timeout = Some(
-                humantime::parse_duration(checkpoint_timeout)
-                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
-            );
-        }
-
-        tenant_conf.compaction_target_size = request_data.compaction_target_size;
-        tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-        if let Some(compaction_period) = &request_data.compaction_period {
-            tenant_conf.compaction_period = Some(
-                humantime::parse_duration(compaction_period)
-                    .with_context(bad_duration("compaction_period", compaction_period))?,
-            );
-        }
-
-        if let Some(eviction_policy) = &request_data.eviction_policy {
-            tenant_conf.eviction_policy = Some(
-                serde::Deserialize::deserialize(eviction_policy)
-                    .context("parse field `eviction_policy`")?,
-            );
-        }
-
-        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
-
-        if let Some(evictions_low_residence_duration_metric_threshold) =
-            &request_data.evictions_low_residence_duration_metric_threshold
-        {
-            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
-                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
-                    .with_context(bad_duration(
-                        "evictions_low_residence_duration_metric_threshold",
-                        evictions_low_residence_duration_metric_threshold,
-                    ))?,
-            );
-        }
-        tenant_conf.gc_feedback = request_data.gc_feedback;
+        // Use serde_path_to_error to deserialize the JSON Value into TenantConfOpt
+        let tenant_conf: TenantConfOpt = serde_path_to_error::deserialize(deserializer)?;

        Ok(tenant_conf)
    }
 }

+impl TryFrom<toml_edit::Item> for TenantConfOpt {
+    type Error = anyhow::Error;
+
+    fn try_from(item: toml_edit::Item) -> Result<Self, Self::Error> {
+        match item {
+            toml_edit::Item::Value(value) => {
+                let d = value.into_deserializer();
+                return serde_path_to_error::deserialize(d)
+                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
+            }
+            toml_edit::Item::Table(table) => {
+                let deserializer = toml_edit::de::Deserializer::new(table.into());
+                return serde_path_to_error::deserialize(deserializer)
+                    .map_err(|e| anyhow::anyhow!("{}: {}", e.path(), e.inner().message()));
+            }
+            _ => {
+                bail!("expected non-inline table but found {item}")
+            }
+        }
+    }
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
+    use models::TenantConfig;

    #[test]
    fn de_serializing_pageserver_config_omits_empty_values() {
@@ -609,4 +582,38 @@ mod tests {
        assert_eq!(json_form, "{\"gc_horizon\":42}");
        assert_eq!(small_conf, serde_json::from_str(&json_form).unwrap());
    }
+
+    #[test]
+    fn test_try_from_models_tenant_config_err() {
+        let tenant_config = models::TenantConfig {
+            lagging_wal_timeout: Some("5a".to_string()),
+            ..TenantConfig::default()
+        };
+
+        let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config);
+
+        assert!(
+            tenant_conf_opt.is_err(),
+            "Suceeded to convert TenantConfig to TenantConfOpt"
+        );
+
+        let expected_error_str =
+            "lagging_wal_timeout: invalid value: string \"5a\", expected a duration";
+        assert_eq!(tenant_conf_opt.unwrap_err().to_string(), expected_error_str);
+    }
+
+    #[test]
+    fn test_try_from_models_tenant_config_success() {
+        let tenant_config = models::TenantConfig {
+            lagging_wal_timeout: Some("5s".to_string()),
+            ..TenantConfig::default()
+        };
+
+        let tenant_conf_opt = TenantConfOpt::try_from(&tenant_config).unwrap();
+
+        assert_eq!(
+            tenant_conf_opt.lagging_wal_timeout,
+            Some(Duration::from_secs(5))
+        );
+    }
 }
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -2,22 +2,19 @@ use std::sync::Arc;

 use anyhow::Context;
 use camino::{Utf8Path, Utf8PathBuf};
-use pageserver_api::models::TenantState;
+use pageserver_api::{models::TenantState, shard::TenantShardId};
 use remote_storage::{GenericRemoteStorage, RemotePath};
 use tokio::sync::OwnedMutexGuard;
 use tokio_util::sync::CancellationToken;
-use tracing::{error, instrument, warn, Instrument, Span};
+use tracing::{error, instrument, Instrument, Span};

-use utils::{
-    backoff, completion, crashsafe, fs_ext,
-    id::{TenantId, TimelineId},
-};
+use utils::{backoff, completion, crashsafe, fs_ext, id::TimelineId};

 use crate::{
    config::PageServerConf,
    context::RequestContext,
    task_mgr::{self, TaskKind},
-    InitializationOrder,
+    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
 };

 use super::{
@@ -59,10 +56,10 @@ type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;

 fn remote_tenant_delete_mark_path(
    conf: &PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> anyhow::Result<RemotePath> {
    let tenant_remote_path = conf
-        .tenant_path(tenant_id)
+        .tenant_path(tenant_shard_id)
        .strip_prefix(&conf.workdir)
        .context("Failed to strip workdir prefix")
        .and_then(RemotePath::new)
@@ -73,9 +70,9 @@ fn remote_tenant_delete_mark_path(
 async fn create_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: &GenericRemoteStorage,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
-    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;

    let data: &[u8] = &[];
    backoff::retry(
@@ -99,9 +96,9 @@ async fn create_remote_delete_mark(

 async fn create_local_delete_mark(
    conf: &PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
-    let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
+    let marker_path = conf.tenant_deleted_mark_file_path(tenant_shard_id);

    // Note: we're ok to replace existing file.
    let _ = std::fs::OpenOptions::new()
@@ -170,10 +167,10 @@ async fn ensure_timelines_dir_empty(timelines_path: &Utf8Path) -> Result<(), Del
 async fn remove_tenant_remote_delete_mark(
    conf: &PageServerConf,
    remote_storage: Option<&GenericRemoteStorage>,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
    if let Some(remote_storage) = remote_storage {
-        let path = remote_tenant_delete_mark_path(conf, tenant_id)?;
+        let path = remote_tenant_delete_mark_path(conf, tenant_shard_id)?;
        backoff::retry(
            || async { remote_storage.delete(&path).await },
            |_e| false,
@@ -192,7 +189,7 @@ async fn remove_tenant_remote_delete_mark(
 // Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
 async fn cleanup_remaining_fs_traces(
    conf: &PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
 ) -> Result<(), DeleteTenantError> {
    let rm = |p: Utf8PathBuf, is_dir: bool| async move {
        if is_dir {
@@ -204,8 +201,8 @@ async fn cleanup_remaining_fs_traces(
        .with_context(|| format!("failed to delete {p}"))
    };

-    rm(conf.tenant_config_path(tenant_id), false).await?;
-    rm(conf.tenant_location_config_path(tenant_id), false).await?;
+    rm(conf.tenant_config_path(tenant_shard_id), false).await?;
+    rm(conf.tenant_location_config_path(tenant_shard_id), false).await?;

    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
        Err(anyhow::anyhow!(
@@ -213,7 +210,7 @@ async fn cleanup_remaining_fs_traces(
        ))?
    });

-    rm(conf.timelines_path(tenant_id), true).await?;
+    rm(conf.timelines_path(tenant_shard_id), true).await?;

    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
        Err(anyhow::anyhow!(
@@ -227,14 +224,14 @@ async fn cleanup_remaining_fs_traces(
    // to be reordered later and thus missed if a crash occurs.
    // Note that we dont need to sync after mark file is removed
    // because we can tolerate the case when mark file reappears on startup.
-    let tenant_path = &conf.tenant_path(tenant_id);
+    let tenant_path = &conf.tenant_path(tenant_shard_id);
    if tenant_path.exists() {
-        crashsafe::fsync_async(&conf.tenant_path(tenant_id))
+        crashsafe::fsync_async(&conf.tenant_path(tenant_shard_id))
            .await
            .context("fsync_pre_mark_remove")?;
    }

-    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
+    rm(conf.tenant_deleted_mark_file_path(tenant_shard_id), false).await?;

    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
        Err(anyhow::anyhow!(
@@ -242,7 +239,7 @@ async fn cleanup_remaining_fs_traces(
        ))?
    });

-    rm(conf.tenant_path(tenant_id), true).await?;
+    rm(conf.tenant_path(tenant_shard_id), true).await?;

    Ok(())
 }
@@ -287,6 +284,8 @@ impl DeleteTenantFlow {
    ) -> Result<(), DeleteTenantError> {
        span::debug_assert_current_span_has_tenant_id();

+        pausable_failpoint!("tenant-delete-before-run");
+
        let mut guard = Self::prepare(&tenant).await?;

        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
@@ -321,7 +320,7 @@ impl DeleteTenantFlow {
        // Though sounds scary, different mark name?
        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
        if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
+            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_shard_id)
                .await
                .context("remote_mark")?
        }
@@ -332,7 +331,7 @@ impl DeleteTenantFlow {
            ))?
        });

-        create_local_delete_mark(conf, &tenant.tenant_id)
+        create_local_delete_mark(conf, &tenant.tenant_shard_id)
            .await
            .context("local delete mark")?;

@@ -374,9 +373,11 @@ impl DeleteTenantFlow {
            return Ok(acquire(tenant));
        }

-        let tenant_id = tenant.tenant_id;
        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
+        if conf
+            .tenant_deleted_mark_file_path(&tenant.tenant_shard_id)
+            .exists()
+        {
            Ok(acquire(tenant))
        } else {
            Ok(None)
@@ -388,7 +389,6 @@ impl DeleteTenantFlow {
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
-        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -398,10 +398,7 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

-        tenant
-            .attach(init_order, preload, ctx)
-            .await
-            .context("attach")?;
+        tenant.attach(preload, ctx).await.context("attach")?;

        Self::background(
            guard,
@@ -459,12 +456,12 @@ impl DeleteTenantFlow {
        tenants: &'static std::sync::RwLock<TenantsMap>,
        tenant: Arc<Tenant>,
    ) {
-        let tenant_id = tenant.tenant_id;
+        let tenant_shard_id = tenant.tenant_shard_id;

        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
+            Some(tenant_shard_id.tenant_id),
            None,
            "tenant_delete",
            false,
@@ -478,7 +475,7 @@ impl DeleteTenantFlow {
                Ok(())
            }
            .instrument({
-                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
+                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug());
                span.follows_from(Span::current());
                span
            }),
@@ -516,7 +513,7 @@ impl DeleteTenantFlow {
            }
        }

-        let timelines_path = conf.timelines_path(&tenant.tenant_id);
+        let timelines_path = conf.timelines_path(&tenant.tenant_shard_id);
        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
        if timelines_path.exists() {
            // sanity check to guard against layout changes
@@ -525,7 +522,8 @@ impl DeleteTenantFlow {
                .context("timelines dir not empty")?;
        }

-        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
+        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_shard_id)
+            .await?;

        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
            Err(anyhow::anyhow!(
@@ -533,21 +531,73 @@ impl DeleteTenantFlow {
            ))?
        });

-        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
+        cleanup_remaining_fs_traces(conf, &tenant.tenant_shard_id)
            .await
            .context("cleanup_remaining_fs_traces")?;

        {
-            let mut locked = tenants.write().unwrap();
-            if locked.remove(&tenant.tenant_id).is_none() {
-                warn!("Tenant got removed from tenants map during deletion");
-            };
+            pausable_failpoint!("tenant-delete-before-map-remove");

-            // FIXME: we should not be modifying this from outside of mgr.rs.
-            // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
-            crate::metrics::TENANT_MANAGER
-                .tenant_slots
-                .set(locked.len() as u64);
+            // This block is simply removing the TenantSlot for this tenant.  It requires a loop because
+            // we might conflict with a TenantSlot::InProgress marker and need to wait for it.
+            //
+            // This complexity will go away when we simplify how deletion works:
+            // https://github.com/neondatabase/neon/issues/5080
+            loop {
+                // Under the TenantMap lock, try to remove the tenant.  We usually succeed, but if
+                // we encounter an InProgress marker, yield the barrier it contains and wait on it.
+                let barrier = {
+                    let mut locked = tenants.write().unwrap();
+                    let removed = locked.remove(&tenant.tenant_shard_id.tenant_id);
+
+                    // FIXME: we should not be modifying this from outside of mgr.rs.
+                    // This will go away when we simplify deletion (https://github.com/neondatabase/neon/issues/5080)
+                    crate::metrics::TENANT_MANAGER
+                        .tenant_slots
+                        .set(locked.len() as u64);
+
+                    match removed {
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Attached(tenant)) => {
+                            match tenant.current_state() {
+                                TenantState::Stopping { .. } | TenantState::Broken { .. } => {
+                                    // Expected: we put the tenant into stopping state before we start deleting it
+                                }
+                                state => {
+                                    // Unexpected state
+                                    tracing::warn!(
+                                        "Tenant in unexpected state {state} after deletion"
+                                    );
+                                }
+                            }
+                            break;
+                        }
+                        TenantsMapRemoveResult::Occupied(TenantSlot::Secondary) => {
+                            // This is unexpected: this secondary tenants should not have been created, and we
+                            // are not in a position to shut it down from here.
+                            tracing::warn!("Tenant transitioned to secondary mode while deleting!");
+                            break;
+                        }
+                        TenantsMapRemoveResult::Occupied(TenantSlot::InProgress(_)) => {
+                            unreachable!("TenantsMap::remove handles InProgress separately, should never return it here");
+                        }
+                        TenantsMapRemoveResult::Vacant => {
+                            tracing::warn!(
+                                "Tenant removed from TenantsMap before deletion completed"
+                            );
+                            break;
+                        }
+                        TenantsMapRemoveResult::InProgress(barrier) => {
+                            // An InProgress entry was found, we must wait on its barrier
+                            barrier
+                        }
+                    }
+                };
+
+                tracing::info!(
+                    "Waiting for competing operation to complete before deleting state for tenant"
+                );
+                barrier.wait().await;
+            }
        }

        *guard = Self::Finished;
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -7,18 +7,19 @@ use crate::page_cache::{self, PAGE_SZ};
 use crate::tenant::block_io::{BlockCursor, BlockLease, BlockReader};
 use crate::virtual_file::VirtualFile;
 use camino::Utf8PathBuf;
+use pageserver_api::shard::TenantShardId;
 use std::cmp::min;
 use std::fs::OpenOptions;
 use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
 use std::sync::atomic::AtomicU64;
 use tracing::*;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;

 pub struct EphemeralFile {
    page_cache_file_id: page_cache::FileId,

-    _tenant_id: TenantId,
+    _tenant_shard_id: TenantShardId,
    _timeline_id: TimelineId,
    file: VirtualFile,
    len: u64,
@@ -31,7 +32,7 @@ pub struct EphemeralFile {
 impl EphemeralFile {
    pub async fn create(
        conf: &PageServerConf,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
    ) -> Result<EphemeralFile, io::Error> {
        static NEXT_FILENAME: AtomicU64 = AtomicU64::new(1);
@@ -39,7 +40,7 @@ impl EphemeralFile {
            NEXT_FILENAME.fetch_add(1, std::sync::atomic::Ordering::Relaxed);

        let filename = conf
-            .timeline_path(&tenant_id, &timeline_id)
+            .timeline_path(&tenant_shard_id, &timeline_id)
            .join(Utf8PathBuf::from(format!(
                "ephemeral-{filename_disambiguator}"
            )));
@@ -52,7 +53,7 @@ impl EphemeralFile {

        Ok(EphemeralFile {
            page_cache_file_id: page_cache::next_file_id(),
-            _tenant_id: tenant_id,
+            _tenant_shard_id: tenant_shard_id,
            _timeline_id: timeline_id,
            file,
            len: 0,
@@ -282,7 +283,7 @@ mod tests {
    ) -> Result<
        (
            &'static PageServerConf,
-            TenantId,
+            TenantShardId,
            TimelineId,
            RequestContext,
        ),
@@ -295,13 +296,13 @@ mod tests {
        // OK in a test.
        let conf: &'static PageServerConf = Box::leak(Box::new(conf));

-        let tenant_id = TenantId::from_str("11000000000000000000000000000000").unwrap();
+        let tenant_shard_id = TenantShardId::from_str("11000000000000000000000000000000").unwrap();
        let timeline_id = TimelineId::from_str("22000000000000000000000000000000").unwrap();
-        fs::create_dir_all(conf.timeline_path(&tenant_id, &timeline_id))?;
+        fs::create_dir_all(conf.timeline_path(&tenant_shard_id, &timeline_id))?;

        let ctx = RequestContext::new(TaskKind::UnitTest, DownloadBehavior::Error);

-        Ok((conf, tenant_id, timeline_id, ctx))
+        Ok((conf, tenant_shard_id, timeline_id, ctx))
    }

    #[tokio::test]
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -11,15 +11,12 @@
 use std::io::{self};

 use anyhow::{ensure, Context};
+use pageserver_api::shard::TenantShardId;
 use serde::{de::Error, Deserialize, Serialize, Serializer};
 use thiserror::Error;
 use utils::bin_ser::SerializeError;
 use utils::crashsafe::path_with_suffix_extension;
-use utils::{
-    bin_ser::BeSer,
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn};

 use crate::config::PageServerConf;
 use crate::virtual_file::VirtualFile;
@@ -272,14 +269,14 @@ impl Serialize for TimelineMetadata {
 }

 /// Save timeline metadata to file
-#[tracing::instrument(skip_all, fields(%tenant_id, %timeline_id))]
+#[tracing::instrument(skip_all, fields(%tenant_id=tenant_shard_id.tenant_id, %shard_id=tenant_shard_id.shard_slug(), %timeline_id))]
 pub async fn save_metadata(
    conf: &'static PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    data: &TimelineMetadata,
 ) -> anyhow::Result<()> {
-    let path = conf.metadata_path(tenant_id, timeline_id);
+    let path = conf.metadata_path(tenant_shard_id, timeline_id);
    let temp_path = path_with_suffix_extension(&path, TEMP_FILE_SUFFIX);
    let metadata_bytes = data.to_bytes().context("serialize metadata")?;
    VirtualFile::crashsafe_overwrite(&path, &temp_path, &metadata_bytes)
@@ -299,10 +296,10 @@ pub enum LoadMetadataError {

 pub fn load_metadata(
    conf: &'static PageServerConf,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
 ) -> Result<TimelineMetadata, LoadMetadataError> {
-    let metadata_path = conf.metadata_path(tenant_id, timeline_id);
+    let metadata_path = conf.metadata_path(tenant_shard_id, timeline_id);
    let metadata_bytes = std::fs::read(metadata_path)?;

    Ok(TimelineMetadata::from_bytes(&metadata_bytes)?)
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,7 +2,8 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use pageserver_api::shard::TenantShardId;
+use pageserver_api::key::Key;
+use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
@@ -29,7 +30,9 @@ use crate::control_plane_client::{
 use crate::deletion_queue::DeletionQueueClient;
 use crate::metrics::TENANT_MANAGER as METRICS;
 use crate::task_mgr::{self, TaskKind};
-use crate::tenant::config::{AttachmentMode, LocationConf, LocationMode, TenantConfOpt};
+use crate::tenant::config::{
+    AttachedLocationConfig, AttachmentMode, LocationConf, LocationMode, TenantConfOpt,
+};
 use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::span::debug_assert_current_span_has_tenant_id;
 use crate::tenant::{create_tenant_files, AttachedTenantConf, SpawnMode, Tenant, TenantState};
@@ -122,6 +125,24 @@ fn exactly_one_or_none<'a>(
    }
 }

+pub(crate) enum TenantsMapRemoveResult {
+    Occupied(TenantSlot),
+    Vacant,
+    InProgress(utils::completion::Barrier),
+}
+
+/// When resolving a TenantId to a shard, we may be looking for the 0th
+/// shard, or we might be looking for whichever shard holds a particular page.
+pub(crate) enum ShardSelector {
+    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
+    /// ignore it.
+    Zero,
+    /// Pick the first shard we find for the TenantId
+    First,
+    /// Pick the shard that holds this key
+    Page(Key),
+}
+
 impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -136,12 +157,71 @@ impl TenantsMap {
        }
    }

-    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<TenantSlot> {
+    /// A page service client sends a TenantId, and to look up the correct Tenant we must
+    /// resolve this to a fully qualified TenantShardId.
+    fn resolve_shard(
+        &self,
+        tenant_id: &TenantId,
+        selector: ShardSelector,
+    ) -> Option<TenantShardId> {
+        let mut want_shard = None;
        match self {
            TenantsMap::Initializing => None,
+            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
+                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
+                    match selector {
+                        ShardSelector::First => return Some(*slot.0),
+                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
+                            return Some(*slot.0)
+                        }
+                        ShardSelector::Page(key) => {
+                            if let Some(tenant) = slot.1.get_attached() {
+                                // First slot we see for this tenant, calculate the expected shard number
+                                // for the key: we will use this for checking if this and subsequent
+                                // slots contain the key, rather than recalculating the hash each time.
+                                if want_shard.is_none() {
+                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
+                                }
+
+                                if Some(tenant.shard_identity.number) == want_shard {
+                                    return Some(*slot.0);
+                                }
+                            } else {
+                                continue;
+                            }
+                        }
+                        _ => continue,
+                    }
+                }
+
+                // Fall through: we didn't find an acceptable shard
+                None
+            }
+        }
+    }
+
+    /// Only for use from DeleteTenantFlow.  This method directly removes a TenantSlot from the map.
+    ///
+    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
+    /// slot if the enclosed tenant is shutdown.
+    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> TenantsMapRemoveResult {
+        use std::collections::btree_map::Entry;
+        match self {
+            TenantsMap::Initializing => TenantsMapRemoveResult::Vacant,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
                let key = exactly_one_or_none(m, tenant_id).map(|(k, _)| *k);
-                key.and_then(|key| m.remove(&key))
+                match key {
+                    Some(key) => match m.entry(key) {
+                        Entry::Occupied(entry) => match entry.get() {
+                            TenantSlot::InProgress(barrier) => {
+                                TenantsMapRemoveResult::InProgress(barrier.clone())
+                            }
+                            _ => TenantsMapRemoveResult::Occupied(entry.remove()),
+                        },
+                        Entry::Vacant(_entry) => TenantsMapRemoveResult::Vacant,
+                    },
+                    None => TenantsMapRemoveResult::Vacant,
+                }
            }
        }
    }
@@ -190,49 +270,6 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

-/// Create a directory, including parents.  This does no fsyncs and makes
-/// no guarantees about the persistence of the resulting metadata: for
-/// use when creating dirs for use as cache.
-async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
-    let mut dirs_to_create = Vec::new();
-    let mut path: &Utf8Path = path.as_ref();
-
-    // Figure out which directories we need to create.
-    loop {
-        let meta = tokio::fs::metadata(path).await;
-        match meta {
-            Ok(metadata) if metadata.is_dir() => break,
-            Ok(_) => {
-                return Err(std::io::Error::new(
-                    std::io::ErrorKind::AlreadyExists,
-                    format!("non-directory found in path: {path}"),
-                ));
-            }
-            Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
-            Err(e) => return Err(e),
-        }
-
-        dirs_to_create.push(path);
-
-        match path.parent() {
-            Some(parent) => path = parent,
-            None => {
-                return Err(std::io::Error::new(
-                    std::io::ErrorKind::InvalidInput,
-                    format!("can't find parent of path '{path}'"),
-                ));
-            }
-        }
-    }
-
-    // Create directories from parent to child.
-    for &path in dirs_to_create.iter().rev() {
-        tokio::fs::create_dir(path).await?;
-    }
-
-    Ok(())
-}
-
 /// The TenantManager is responsible for storing and mutating the collection of all tenants
 /// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
 /// lives inside the TenantManager.
@@ -250,8 +287,8 @@ pub struct TenantManager {
 }

 fn emergency_generations(
-    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
-) -> HashMap<TenantId, Generation> {
+    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
+) -> HashMap<TenantShardId, Generation> {
    tenant_confs
        .iter()
        .filter_map(|(tid, lc)| {
@@ -271,10 +308,10 @@ fn emergency_generations(

 async fn init_load_generations(
    conf: &'static PageServerConf,
-    tenant_confs: &HashMap<TenantId, anyhow::Result<LocationConf>>,
+    tenant_confs: &HashMap<TenantShardId, anyhow::Result<LocationConf>>,
    resources: &TenantSharedResources,
    cancel: &CancellationToken,
-) -> anyhow::Result<Option<HashMap<TenantId, Generation>>> {
+) -> anyhow::Result<Option<HashMap<TenantShardId, Generation>>> {
    let generations = if conf.control_plane_emergency_mode {
        error!(
            "Emergency mode!  Tenants will be attached unsafely using their last known generation"
@@ -317,7 +354,7 @@ async fn init_load_generations(
 fn load_tenant_config(
    conf: &'static PageServerConf,
    dentry: Utf8DirEntry,
-) -> anyhow::Result<Option<(TenantId, anyhow::Result<LocationConf>)>> {
+) -> anyhow::Result<Option<(TenantShardId, anyhow::Result<LocationConf>)>> {
    let tenant_dir_path = dentry.path().to_path_buf();
    if crate::is_temporary(&tenant_dir_path) {
        info!("Found temporary tenant directory, removing: {tenant_dir_path}");
@@ -353,10 +390,10 @@ fn load_tenant_config(
        return Ok(None);
    }

-    let tenant_id = match tenant_dir_path
+    let tenant_shard_id = match tenant_dir_path
        .file_name()
        .unwrap_or_default()
-        .parse::<TenantId>()
+        .parse::<TenantShardId>()
    {
        Ok(id) => id,
        Err(_) => {
@@ -366,8 +403,8 @@ fn load_tenant_config(
    };

    Ok(Some((
-        tenant_id,
-        Tenant::load_tenant_config(conf, &tenant_id),
+        tenant_shard_id,
+        Tenant::load_tenant_config(conf, &tenant_shard_id),
    )))
 }

@@ -378,7 +415,7 @@ fn load_tenant_config(
 /// seconds even on reasonably fast drives.
 async fn init_load_tenant_configs(
    conf: &'static PageServerConf,
-) -> anyhow::Result<HashMap<TenantId, anyhow::Result<LocationConf>>> {
+) -> anyhow::Result<HashMap<TenantShardId, anyhow::Result<LocationConf>>> {
    let tenants_dir = conf.tenants_path();

    let dentries = tokio::task::spawn_blocking(move || -> anyhow::Result<Vec<Utf8DirEntry>> {
@@ -428,19 +465,19 @@ pub async fn init_tenant_mgr(
        init_load_generations(conf, &tenant_configs, &resources, &cancel).await?;

    // Construct `Tenant` objects and start them running
-    for (tenant_id, location_conf) in tenant_configs {
-        let tenant_dir_path = conf.tenant_path(&tenant_id);
+    for (tenant_shard_id, location_conf) in tenant_configs {
+        let tenant_dir_path = conf.tenant_path(&tenant_shard_id);

        let mut location_conf = match location_conf {
            Ok(l) => l,
            Err(e) => {
-                warn!(%tenant_id, "Marking tenant broken, failed to {e:#}");
+                warn!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Marking tenant broken, failed to {e:#}");

                tenants.insert(
-                    TenantShardId::unsharded(tenant_id),
+                    tenant_shard_id,
                    TenantSlot::Attached(Tenant::create_broken_tenant(
                        conf,
-                        tenant_id,
+                        tenant_shard_id,
                        format!("{}", e),
                    )),
                );
@@ -451,7 +488,7 @@ pub async fn init_tenant_mgr(
        let generation = if let Some(generations) = &tenant_generations {
            // We have a generation map: treat it as the authority for whether
            // this tenant is really attached.
-            if let Some(gen) = generations.get(&tenant_id) {
+            if let Some(gen) = generations.get(&tenant_shard_id) {
                *gen
            } else {
                match &location_conf.mode {
@@ -459,8 +496,8 @@ pub async fn init_tenant_mgr(
                        // We do not require the control plane's permission for secondary mode
                        // tenants, because they do no remote writes and hence require no
                        // generation number
-                        info!(%tenant_id, "Loaded tenant in secondary mode");
-                        tenants.insert(TenantShardId::unsharded(tenant_id), TenantSlot::Secondary);
+                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Loaded tenant in secondary mode");
+                        tenants.insert(tenant_shard_id, TenantSlot::Secondary);
                    }
                    LocationMode::Attached(_) => {
                        // TODO: augment re-attach API to enable the control plane to
@@ -468,9 +505,9 @@ pub async fn init_tenant_mgr(
                        // away local state, we can gracefully fall back to secondary here, if the control
                        // plane tells us so.
                        // (https://github.com/neondatabase/neon/issues/5377)
-                        info!(%tenant_id, "Detaching tenant, control plane omitted it in re-attach response");
+                        info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Detaching tenant, control plane omitted it in re-attach response");
                        if let Err(e) = safe_remove_tenant_dir_all(&tenant_dir_path).await {
-                            error!(%tenant_id,
+                            error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),
                                "Failed to remove detached tenant directory '{tenant_dir_path}': {e:?}",
                            );
                        }
@@ -482,21 +519,23 @@ pub async fn init_tenant_mgr(
        } else {
            // Legacy mode: no generation information, any tenant present
            // on local disk may activate
-            info!(%tenant_id, "Starting tenant in legacy mode, no generation",);
+            info!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Starting tenant in legacy mode, no generation",);
            Generation::none()
        };

        // Presence of a generation number implies attachment: attach the tenant
        // if it wasn't already, and apply the generation number.
        location_conf.attach_in_generation(generation);
-        Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
+        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

+        let shard_identity = location_conf.shard;
        match tenant_spawn(
            conf,
-            tenant_id,
+            tenant_shard_id,
            &tenant_dir_path,
            resources.clone(),
            AttachedTenantConf::try_from(location_conf)?,
+            shard_identity,
            Some(init_order.clone()),
            &TENANTS,
            SpawnMode::Normal,
@@ -509,7 +548,7 @@ pub async fn init_tenant_mgr(
                );
            }
            Err(e) => {
-                error!(%tenant_id, "Failed to start tenant: {e:#}");
+                error!(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), "Failed to start tenant: {e:#}");
            }
        }
    }
@@ -533,10 +572,11 @@ pub async fn init_tenant_mgr(
 #[allow(clippy::too_many_arguments)]
 pub(crate) fn tenant_spawn(
    conf: &'static PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    tenant_path: &Utf8Path,
    resources: TenantSharedResources,
    location_conf: AttachedTenantConf,
+    shard_identity: ShardIdentity,
    init_order: Option<InitializationOrder>,
    tenants: &'static std::sync::RwLock<TenantsMap>,
    mode: SpawnMode,
@@ -557,18 +597,25 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant from empty directory {tenant_path:?}"
    );

-    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
+    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
    anyhow::ensure!(
-        !conf.tenant_ignore_mark_file_path(&tenant_id).exists(),
+        !conf.tenant_ignore_mark_file_path(&tenant_shard_id).exists(),
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    info!("Attaching tenant {tenant_id}");
+    info!(
+        tenant_id = %tenant_shard_id.tenant_id,
+        shard_id = %tenant_shard_id.shard_slug(),
+        generation = ?location_conf.location.generation,
+        attach_mode = ?location_conf.location.attach_mode,
+        "Attaching tenant"
+    );
    let tenant = match Tenant::spawn(
        conf,
-        tenant_id,
+        tenant_shard_id,
        resources,
        location_conf,
+        shard_identity,
        init_order,
        tenants,
        mode,
@@ -576,8 +623,8 @@ pub(crate) fn tenant_spawn(
    ) {
        Ok(tenant) => tenant,
        Err(e) => {
-            error!("Failed to spawn tenant {tenant_id}, reason: {e:#}");
-            Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+            error!("Failed to spawn tenant {tenant_shard_id}, reason: {e:#}");
+            Tenant::create_broken_tenant(conf, tenant_shard_id, format!("{e:#}"))
        }
    };

@@ -732,19 +779,20 @@ pub(crate) async fn create_tenant(
    ctx: &RequestContext,
 ) -> Result<Arc<Tenant>, TenantMapInsertError> {
    let location_conf = LocationConf::attached_single(tenant_conf, generation);
+    info!("Creating tenant at location {location_conf:?}");

    let slot_guard =
        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
-    // TODO(sharding): make local paths shard-aware
-    let tenant_path =
-        super::create_tenant_files(conf, &location_conf, &tenant_shard_id.tenant_id).await?;
+    let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;

+    let shard_identity = location_conf.shard;
    let created_tenant = tenant_spawn(
        conf,
-        tenant_shard_id.tenant_id,
+        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Create,
@@ -781,8 +829,9 @@ pub(crate) async fn set_new_tenant_config(
    // API to use is the location_config/ endpoint, which lets the caller provide
    // the full LocationConf.
    let location_conf = LocationConf::attached_single(new_tenant_conf, tenant.generation);
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

-    Tenant::persist_tenant_config(conf, &tenant_id, &location_conf)
+    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf)
        .await
        .map_err(SetNewTenantConfigError::Persist)?;
    tenant.set_new_tenant_config(new_tenant_conf);
@@ -792,8 +841,6 @@ pub(crate) async fn set_new_tenant_config(
 impl TenantManager {
    /// Gets the attached tenant from the in-memory data, erroring if it's absent, in secondary mode, or is not fitting to the query.
    /// `active_only = true` allows to query only tenants that are ready for operations, erroring on other kinds of tenants.
-    ///
-    /// This method is cancel-safe.
    pub(crate) fn get_attached_tenant_shard(
        &self,
        tenant_shard_id: TenantShardId,
@@ -838,10 +885,12 @@ impl TenantManager {
        Ok(())
    }

+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
        tenant_shard_id: TenantShardId,
        new_location_config: LocationConf,
+        flush: Option<Duration>,
        ctx: &RequestContext,
    ) -> Result<(), anyhow::Error> {
        debug_assert_current_span_has_tenant_id();
@@ -850,7 +899,7 @@ impl TenantManager {
        // Special case fast-path for updates to Tenant: if our upsert is only updating configuration,
        // then we do not need to set the slot to InProgress, we can just call into the
        // existng tenant.
-        {
+        let modify_tenant = {
            let locked = self.tenants.read().unwrap();
            let peek_slot =
                tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Write)?;
@@ -861,22 +910,50 @@ impl TenantManager {
                        // take our fast path and just provide the updated configuration
                        // to the tenant.
                        tenant.set_new_location_config(AttachedTenantConf::try_from(
-                            new_location_config,
+                            new_location_config.clone(),
                        )?);

-                        // Persist the new config in the background, to avoid holding up any
-                        // locks while we do so.
-                        // TODO
-
-                        return Ok(());
+                        Some(tenant.clone())
                    } else {
                        // Different generations, fall through to general case
+                        None
                    }
                }
                _ => {
                    // Not an Attached->Attached transition, fall through to general case
+                    None
                }
            }
+        };
+
+        // Fast-path continued: having dropped out of the self.tenants lock, do the async
+        // phase of waiting for flush, before returning.
+        if let Some(tenant) = modify_tenant {
+            // Transition to AttachedStale means we may well hold a valid generation
+            // still, and have been requested to go stale as part of a migration.  If
+            // the caller set `flush`, then flush to remote storage.
+            if let LocationMode::Attached(AttachedLocationConfig {
+                generation: _,
+                attach_mode: AttachmentMode::Stale,
+            }) = &new_location_config.mode
+            {
+                if let Some(flush_timeout) = flush {
+                    match tokio::time::timeout(flush_timeout, tenant.flush_remote()).await {
+                        Ok(Err(e)) => {
+                            return Err(e);
+                        }
+                        Ok(Ok(_)) => return Ok(()),
+                        Err(_) => {
+                            tracing::warn!(
+                                timeout_ms = flush_timeout.as_millis(),
+                                "Timed out waiting for flush to remote storage, proceeding anyway."
+                            )
+                        }
+                    }
+                }
+            }
+
+            return Ok(());
        }

        // General case for upserts to TenantsMap, excluding the case above: we will substitute an
@@ -915,55 +992,44 @@ impl TenantManager {
            slot_guard.drop_old_value().expect("We just shut it down");
        }

-        // TODO(sharding): make local paths sharding-aware
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id.tenant_id);
+        let tenant_path = self.conf.tenant_path(&tenant_shard_id);

        let new_slot = match &new_location_config.mode {
            LocationMode::Secondary(_) => {
                // Directory doesn't need to be fsync'd because if we crash it can
                // safely be recreated next time this tenant location is configured.
-                unsafe_create_dir_all(&tenant_path)
+                tokio::fs::create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {tenant_path}"))?;

-                // TODO(sharding): make local paths sharding-aware
-                Tenant::persist_tenant_config(
-                    self.conf,
-                    &tenant_shard_id.tenant_id,
-                    &new_location_config,
-                )
-                .await
-                .map_err(SetNewTenantConfigError::Persist)?;
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;

                TenantSlot::Secondary
            }
            LocationMode::Attached(_attach_config) => {
-                // TODO(sharding): make local paths sharding-aware
-                let timelines_path = self.conf.timelines_path(&tenant_shard_id.tenant_id);
+                let timelines_path = self.conf.timelines_path(&tenant_shard_id);

                // Directory doesn't need to be fsync'd because we do not depend on
                // it to exist after crashes: it may be recreated when tenant is
                // re-attached, see https://github.com/neondatabase/neon/issues/5550
-                unsafe_create_dir_all(&timelines_path)
+                tokio::fs::create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {timelines_path}"))?;

-                // TODO(sharding): make local paths sharding-aware
-                Tenant::persist_tenant_config(
-                    self.conf,
-                    &tenant_shard_id.tenant_id,
-                    &new_location_config,
-                )
-                .await
-                .map_err(SetNewTenantConfigError::Persist)?;
+                Tenant::persist_tenant_config(self.conf, &tenant_shard_id, &new_location_config)
+                    .await
+                    .map_err(SetNewTenantConfigError::Persist)?;

-                // TODO(sharding): make spawn sharding-aware
+                let shard_identity = new_location_config.shard;
                let tenant = tenant_spawn(
                    self.conf,
-                    tenant_shard_id.tenant_id,
+                    tenant_shard_id,
                    &tenant_path,
                    self.resources.clone(),
                    AttachedTenantConf::try_from(new_location_config)?,
+                    shard_identity,
                    None,
                    self.tenants,
                    SpawnMode::Normal,
@@ -978,6 +1044,81 @@ impl TenantManager {

        Ok(())
    }
+
+    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
+    /// LocationConf that was last used to attach it.  Optionally, the local file cache may be
+    /// dropped before re-attaching.
+    ///
+    /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
+    /// where an issue is identified that would go away with a restart of the tenant.
+    ///
+    /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
+    /// to respect the cancellation tokens used in normal shutdown().
+    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
+    pub(crate) async fn reset_tenant(
+        &self,
+        tenant_shard_id: TenantShardId,
+        drop_cache: bool,
+        ctx: RequestContext,
+    ) -> anyhow::Result<()> {
+        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
+        let Some(old_slot) = slot_guard.get_old_value() else {
+            anyhow::bail!("Tenant not found when trying to reset");
+        };
+
+        let Some(tenant) = old_slot.get_attached() else {
+            slot_guard.revert();
+            anyhow::bail!("Tenant is not in attached state");
+        };
+
+        let (_guard, progress) = utils::completion::channel();
+        match tenant.shutdown(progress, false).await {
+            Ok(()) => {
+                slot_guard.drop_old_value()?;
+            }
+            Err(_barrier) => {
+                slot_guard.revert();
+                anyhow::bail!("Cannot reset Tenant, already shutting down");
+            }
+        }
+
+        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
+        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
+        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
+
+        if drop_cache {
+            tracing::info!("Dropping local file cache");
+
+            match tokio::fs::read_dir(&timelines_path).await {
+                Err(e) => {
+                    tracing::warn!("Failed to list timelines while dropping cache: {}", e);
+                }
+                Ok(mut entries) => {
+                    while let Some(entry) = entries.next_entry().await? {
+                        tokio::fs::remove_dir_all(entry.path()).await?;
+                    }
+                }
+            }
+        }
+
+        let shard_identity = config.shard;
+        let tenant = tenant_spawn(
+            self.conf,
+            tenant_shard_id,
+            &tenant_path,
+            self.resources.clone(),
+            AttachedTenantConf::try_from(config)?,
+            shard_identity,
+            None,
+            self.tenants,
+            SpawnMode::Normal,
+            &ctx,
+        )?;
+
+        slot_guard.upsert(TenantSlot::Attached(tenant))?;
+
+        Ok(())
+    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1062,6 +1203,7 @@ pub(crate) enum GetActiveTenantError {
 /// then wait for up to `timeout` (minus however long we waited for the slot).
 pub(crate) async fn get_active_tenant_with_timeout(
    tenant_id: TenantId,
+    shard_selector: ShardSelector,
    timeout: Duration,
    cancel: &CancellationToken,
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
@@ -1070,15 +1212,17 @@ pub(crate) async fn get_active_tenant_with_timeout(
        Tenant(Arc<Tenant>),
    }

-    // TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
-    // to decide which shard services the request)
-    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
-
    let wait_start = Instant::now();
    let deadline = wait_start + timeout;

-    let wait_for = {
+    let (wait_for, tenant_shard_id) = {
        let locked = TENANTS.read().unwrap();
+
+        // Resolve TenantId to TenantShardId
+        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
+            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
+        )?;
+
        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
        match peek_slot {
@@ -1088,7 +1232,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                        // Fast path: we don't need to do any async waiting.
                        return Ok(tenant.clone());
                    }
-                    _ => WaitFor::Tenant(tenant.clone()),
+                    _ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
                }
            }
            Some(TenantSlot::Secondary) => {
@@ -1096,7 +1240,9 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    tenant_id,
                )))
            }
-            Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
+            Some(TenantSlot::InProgress(barrier)) => {
+                (WaitFor::Barrier(barrier.clone()), tenant_shard_id)
+            }
            None => {
                return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
                    tenant_id,
@@ -1181,8 +1327,7 @@ pub(crate) async fn delete_tenant(
    // See https://github.com/neondatabase/neon/issues/5080

    // TODO(sharding): make delete API sharding-aware
-    let mut slot_guard =
-        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

    // unwrap is safe because we used MustExist mode when acquiring
    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1262,8 +1407,7 @@ async fn detach_tenant0(
    deletion_queue_client: &DeletionQueueClient,
 ) -> Result<Utf8PathBuf, TenantStateError> {
    let tenant_dir_rename_operation = |tenant_id_to_clean: TenantShardId| async move {
-        // TODO(sharding): make local path helpers shard-aware
-        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean.tenant_id);
+        let local_tenant_directory = conf.tenant_path(&tenant_id_to_clean);
        safe_rename_tenant_dir(&local_tenant_directory)
            .await
            .with_context(|| format!("local tenant directory {local_tenant_directory:?} rename"))
@@ -1288,8 +1432,7 @@ async fn detach_tenant0(
            Err(TenantStateError::SlotError(TenantSlotError::NotFound(_)))
        )
    {
-        // TODO(sharding): make local paths sharding-aware
-        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id.tenant_id);
+        let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
        if tenant_ignore_mark.exists() {
            info!("Detaching an ignored tenant");
            let tmp_path = tenant_dir_rename_operation(tenant_shard_id)
@@ -1318,9 +1461,9 @@ pub(crate) async fn load_tenant(

    let slot_guard =
        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
-    let tenant_path = conf.tenant_path(&tenant_id);
+    let tenant_path = conf.tenant_path(&tenant_shard_id);

-    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_id);
+    let tenant_ignore_mark = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
    if tenant_ignore_mark.exists() {
        std::fs::remove_file(&tenant_ignore_mark).with_context(|| {
            format!(
@@ -1336,17 +1479,19 @@ pub(crate) async fn load_tenant(
    };

    let mut location_conf =
-        Tenant::load_tenant_config(conf, &tenant_id).map_err(TenantMapInsertError::Other)?;
+        Tenant::load_tenant_config(conf, &tenant_shard_id).map_err(TenantMapInsertError::Other)?;
    location_conf.attach_in_generation(generation);

-    Tenant::persist_tenant_config(conf, &tenant_id, &location_conf).await?;
+    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

+    let shard_identity = location_conf.shard;
    let new_tenant = tenant_spawn(
        conf,
-        tenant_id,
+        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1374,7 +1519,7 @@ async fn ignore_tenant0(
    let tenant_shard_id = TenantShardId::unsharded(tenant_id);

    remove_tenant_from_memory(tenants, tenant_shard_id, async {
-        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_id);
+        let ignore_mark_file = conf.tenant_ignore_mark_file_path(&tenant_shard_id);
        fs::File::create(&ignore_mark_file)
            .await
            .context("Failed to create ignore mark file")
@@ -1432,16 +1577,18 @@ pub(crate) async fn attach_tenant(
    let slot_guard =
        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
    let location_conf = LocationConf::attached_single(tenant_conf, generation);
-    let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_id).await?;
+    let tenant_dir = create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;
    // TODO: tenant directory remains on disk if we bail out from here on.
    //       See https://github.com/neondatabase/neon/issues/4233

+    let shard_identity = location_conf.shard;
    let attached_tenant = tenant_spawn(
        conf,
-        tenant_id,
+        tenant_shard_id,
        &tenant_dir,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
+        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1507,9 +1654,10 @@ pub enum TenantSlotUpsertError {
    MapState(#[from] TenantMapError),
 }

-#[derive(Debug)]
+#[derive(Debug, thiserror::Error)]
 enum TenantSlotDropError {
    /// It is only legal to drop a TenantSlot if its contents are fully shut down
+    #[error("Tenant was not shut down")]
    NotShutdown,
 }

@@ -1569,9 +1717,9 @@ impl SlotGuard {
        }
    }

-    /// Take any value that was present in the slot before we acquired ownership
+    /// Get any value that was present in the slot before we acquired ownership
    /// of it: in state transitions, this will be the old state.
-    fn get_old_value(&mut self) -> &Option<TenantSlot> {
+    fn get_old_value(&self) -> &Option<TenantSlot> {
        &self.old_value
    }

@@ -1954,6 +2102,9 @@ pub(crate) async fn immediate_gc(
        .with_context(|| format!("tenant {tenant_id}"))
        .map_err(|e| ApiError::NotFound(e.into()))?;

+    // TODO(sharding): make callers of this function shard-aware
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let gc_horizon = gc_req.gc_horizon.unwrap_or_else(|| tenant.get_gc_horizon());
    // Use tenant's pitr setting
    let pitr = tenant.get_pitr_interval();
@@ -1961,6 +2112,7 @@ pub(crate) async fn immediate_gc(
    // Run in task_mgr to avoid race with tenant_detach operation
    let ctx = ctx.detached_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
    let (task_done, wait_task_done) = tokio::sync::oneshot::channel();
+    // TODO: spawning is redundant now, need to hold the gate
    task_mgr::spawn(
        &tokio::runtime::Handle::current(),
        TaskKind::GarbageCollector,
@@ -1970,12 +2122,40 @@ pub(crate) async fn immediate_gc(
        false,
        async move {
            fail::fail_point!("immediate_gc_task_pre");
-            let result = tenant
+
+            #[allow(unused_mut)]
+            let mut result = tenant
                .gc_iteration(Some(timeline_id), gc_horizon, pitr, &cancel, &ctx)
-                .instrument(info_span!("manual_gc", %tenant_id, %timeline_id))
+                .instrument(info_span!("manual_gc", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id))
                .await;
                // FIXME: `gc_iteration` can return an error for multiple reasons; we should handle it
                // better once the types support it.
+
+            #[cfg(feature = "testing")]
+            {
+                if let Ok(result) = result.as_mut() {
+                    // why not futures unordered? it seems it needs very much the same task structure
+                    // but would only run on single task.
+                    let mut js = tokio::task::JoinSet::new();
+                    for layer in std::mem::take(&mut result.doomed_layers) {
+                        js.spawn(layer.wait_drop());
+                    }
+                    tracing::info!(total = js.len(), "starting to wait for the gc'd layers to be dropped");
+                    while let Some(res) = js.join_next().await {
+                        res.expect("wait_drop should not panic");
+                    }
+                }
+
+                let timeline = tenant.get_timeline(timeline_id, false).ok();
+                let rtc = timeline.as_ref().and_then(|x| x.remote_client.as_ref());
+
+                if let Some(rtc) = rtc {
+                    // layer drops schedule actions on remote timeline client to actually do the
+                    // deletions; don't care just exit fast about the shutdown error
+                    drop(rtc.wait_completion().await);
+                }
+            }
+
            match task_done.send(result) {
                Ok(_) => (),
                Err(result) => error!("failed to send gc result: {result:?}"),
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -188,6 +188,8 @@ use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};

+pub(crate) use download::download_initdb_tar_zst;
+use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
 pub(crate) use upload::upload_initdb_dir;
@@ -300,7 +302,7 @@ pub struct RemoteTimelineClient {

    runtime: tokio::runtime::Handle,

-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    generation: Generation,

@@ -324,7 +326,7 @@ impl RemoteTimelineClient {
        remote_storage: GenericRemoteStorage,
        deletion_queue_client: DeletionQueueClient,
        conf: &'static PageServerConf,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        generation: Generation,
    ) -> RemoteTimelineClient {
@@ -336,13 +338,16 @@ impl RemoteTimelineClient {
            } else {
                BACKGROUND_RUNTIME.handle().clone()
            },
-            tenant_id,
+            tenant_shard_id,
            timeline_id,
            generation,
            storage_impl: remote_storage,
            deletion_queue_client,
            upload_queue: Mutex::new(UploadQueue::Uninitialized),
-            metrics: Arc::new(RemoteTimelineClientMetrics::new(&tenant_id, &timeline_id)),
+            metrics: Arc::new(RemoteTimelineClientMetrics::new(
+                &tenant_shard_id,
+                &timeline_id,
+            )),
        }
    }

@@ -463,13 +468,13 @@ impl RemoteTimelineClient {

        let index_part = download::download_index_part(
            &self.storage_impl,
-            &self.tenant_id,
+            &self.tenant_shard_id,
            &self.timeline_id,
            self.generation,
            cancel,
        )
        .measure_remote_op(
-            self.tenant_id,
+            self.tenant_shard_id.tenant_id,
            self.timeline_id,
            RemoteOpFileKind::Index,
            RemoteOpKind::Download,
@@ -505,13 +510,13 @@ impl RemoteTimelineClient {
            download::download_layer_file(
                self.conf,
                &self.storage_impl,
-                self.tenant_id,
+                self.tenant_shard_id,
                self.timeline_id,
                layer_file_name,
                layer_metadata,
            )
            .measure_remote_op(
-                self.tenant_id,
+                self.tenant_shard_id.tenant_id,
                self.timeline_id,
                RemoteOpFileKind::Layer,
                RemoteOpKind::Download,
@@ -657,10 +662,10 @@ impl RemoteTimelineClient {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;

-        let with_generations =
+        let with_metadata =
            self.schedule_unlinking_of_layers_from_index_part0(upload_queue, names.iter().cloned());

-        self.schedule_deletion_of_unlinked0(upload_queue, with_generations);
+        self.schedule_deletion_of_unlinked0(upload_queue, with_metadata);

        // Launch the tasks immediately, if possible
        self.launch_queued_tasks(upload_queue);
@@ -695,7 +700,7 @@ impl RemoteTimelineClient {
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
        names: I,
-    ) -> Vec<(LayerFileName, Generation)>
+    ) -> Vec<(LayerFileName, LayerFileMetadata)>
    where
        I: IntoIterator<Item = LayerFileName>,
    {
@@ -703,16 +708,17 @@ impl RemoteTimelineClient {
        // so we don't need update it. Just serialize it.
        let metadata = upload_queue.latest_metadata.clone();

-        // Decorate our list of names with each name's generation, dropping
-        // names that are unexpectedly missing from our metadata.
-        let with_generations: Vec<_> = names
+        // Decorate our list of names with each name's metadata, dropping
+        // names that are unexpectedly missing from our metadata.  This metadata
+        // is later used when physically deleting layers, to construct key paths.
+        let with_metadata: Vec<_> = names
            .into_iter()
            .filter_map(|name| {
                let meta = upload_queue.latest_files.remove(&name);

                if let Some(meta) = meta {
                    upload_queue.latest_files_changes_since_metadata_upload_scheduled += 1;
-                    Some((name, meta.generation))
+                    Some((name, meta))
                } else {
                    // This can only happen if we forgot to to schedule the file upload
                    // before scheduling the delete. Log it because it is a rare/strange
@@ -725,9 +731,10 @@ impl RemoteTimelineClient {
            .collect();

        #[cfg(feature = "testing")]
-        for (name, gen) in &with_generations {
-            if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), *gen) {
-                if &unexpected == gen {
+        for (name, metadata) in &with_metadata {
+            let gen = metadata.generation;
+            if let Some(unexpected) = upload_queue.dangling_files.insert(name.to_owned(), gen) {
+                if unexpected == gen {
                    tracing::error!("{name} was unlinked twice with same generation");
                } else {
                    tracing::error!("{name} was unlinked twice with different generations {gen:?} and {unexpected:?}");
@@ -742,14 +749,14 @@ impl RemoteTimelineClient {
            self.schedule_index_upload(upload_queue, metadata);
        }

-        with_generations
+        with_metadata
    }

    /// Schedules deletion for layer files which have previously been unlinked from the
    /// `index_part.json` with [`Self::schedule_gc_update`] or [`Self::schedule_compaction_update`].
    pub(crate) fn schedule_deletion_of_unlinked(
        self: &Arc<Self>,
-        layers: Vec<(LayerFileName, Generation)>,
+        layers: Vec<(LayerFileName, LayerFileMetadata)>,
    ) -> anyhow::Result<()> {
        let mut guard = self.upload_queue.lock().unwrap();
        let upload_queue = guard.initialized_mut()?;
@@ -762,16 +769,22 @@ impl RemoteTimelineClient {
    fn schedule_deletion_of_unlinked0(
        self: &Arc<Self>,
        upload_queue: &mut UploadQueueInitialized,
-        with_generations: Vec<(LayerFileName, Generation)>,
+        with_metadata: Vec<(LayerFileName, LayerFileMetadata)>,
    ) {
-        for (name, gen) in &with_generations {
-            info!("scheduling deletion of layer {}{}", name, gen.get_suffix());
+        for (name, meta) in &with_metadata {
+            info!(
+                "scheduling deletion of layer {}{} (shard {})",
+                name,
+                meta.generation.get_suffix(),
+                meta.shard
+            );
        }

        #[cfg(feature = "testing")]
-        for (name, gen) in &with_generations {
+        for (name, meta) in &with_metadata {
+            let gen = meta.generation;
            match upload_queue.dangling_files.remove(name) {
-                Some(same) if &same == gen => { /* expected */ }
+                Some(same) if same == gen => { /* expected */ }
                Some(other) => {
                    tracing::error!("{name} was unlinked with {other:?} but deleted with {gen:?}");
                }
@@ -783,7 +796,7 @@ impl RemoteTimelineClient {

        // schedule the actual deletions
        let op = UploadOp::Delete(Delete {
-            layers: with_generations,
+            layers: with_metadata,
        });
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);
@@ -812,10 +825,8 @@ impl RemoteTimelineClient {
        Ok(())
    }

-    ///
    /// Wait for all previously scheduled uploads/deletions to complete
-    ///
-    pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
+    pub(crate) async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
        let mut receiver = {
            let mut guard = self.upload_queue.lock().unwrap();
            let upload_queue = guard.initialized_mut()?;
@@ -825,6 +836,7 @@ impl RemoteTimelineClient {
        if receiver.changed().await.is_err() {
            anyhow::bail!("wait_completion aborted because upload queue was stopped");
        }
+
        Ok(())
    }

@@ -851,6 +863,56 @@ impl RemoteTimelineClient {
        receiver
    }

+    /// Wait for all previously scheduled operations to complete, and then stop.
+    ///
+    /// Not cancellation safe
+    pub(crate) async fn shutdown(self: &Arc<Self>) -> Result<(), StopError> {
+        // On cancellation the queue is left in ackward state of refusing new operations but
+        // proper stop is yet to be called. On cancel the original or some later task must call
+        // `stop` or `shutdown`.
+        let sg = scopeguard::guard((), |_| {
+            tracing::error!("RemoteTimelineClient::shutdown was cancelled; this should not happen, do not make this into an allowed_error")
+        });
+
+        let fut = {
+            let mut guard = self.upload_queue.lock().unwrap();
+            let upload_queue = match &mut *guard {
+                UploadQueue::Stopped(_) => return Ok(()),
+                UploadQueue::Uninitialized => return Err(StopError::QueueUninitialized),
+                UploadQueue::Initialized(ref mut init) => init,
+            };
+
+            // if the queue is already stuck due to a shutdown operation which was cancelled, then
+            // just don't add more of these as they would never complete.
+            //
+            // TODO: if launch_queued_tasks were to be refactored to accept a &mut UploadQueue
+            // in every place we would not have to jump through this hoop, and this method could be
+            // made cancellable.
+            if !upload_queue.shutting_down {
+                upload_queue.shutting_down = true;
+                upload_queue.queued_operations.push_back(UploadOp::Shutdown);
+                // this operation is not counted similar to Barrier
+
+                self.launch_queued_tasks(upload_queue);
+            }
+
+            upload_queue.shutdown_ready.clone().acquire_owned()
+        };
+
+        let res = fut.await;
+
+        scopeguard::ScopeGuard::into_inner(sg);
+
+        match res {
+            Ok(_permit) => unreachable!("shutdown_ready should not have been added permits"),
+            Err(_closed) => {
+                // expected
+            }
+        }
+
+        self.stop()
+    }
+
    /// Set the deleted_at field in the remote index file.
    ///
    /// This fails if the upload queue has not been `stop()`ed.
@@ -902,7 +964,7 @@ impl RemoteTimelineClient {
            || {
                upload::upload_index_part(
                    &self.storage_impl,
-                    &self.tenant_id,
+                    &self.tenant_shard_id,
                    &self.timeline_id,
                    self.generation,
                    &index_part_with_deleted_at,
@@ -960,8 +1022,9 @@ impl RemoteTimelineClient {
                .drain()
                .map(|(file_name, meta)| {
                    remote_layer_path(
-                        &self.tenant_id,
+                        &self.tenant_shard_id.tenant_id,
                        &self.timeline_id,
+                        meta.shard,
                        &file_name,
                        meta.generation,
                    )
@@ -974,7 +1037,7 @@ impl RemoteTimelineClient {

        // Do not delete index part yet, it is needed for possible retry. If we remove it first
        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
-        let timeline_storage_path = remote_timeline_path(&self.tenant_id, &self.timeline_id);
+        let timeline_storage_path = remote_timeline_path(&self.tenant_shard_id, &self.timeline_id);

        // Execute all pending deletions, so that when we proceed to do a list_prefixes below, we aren't
        // taking the burden of listing all the layers that we already know we should delete.
@@ -1010,12 +1073,22 @@ impl RemoteTimelineClient {
            .unwrap_or(
                // No generation-suffixed indices, assume we are dealing with
                // a legacy index.
-                remote_index_path(&self.tenant_id, &self.timeline_id, Generation::none()),
+                remote_index_path(&self.tenant_shard_id, &self.timeline_id, Generation::none()),
            );

        let remaining_layers: Vec<RemotePath> = remaining
            .into_iter()
-            .filter(|p| p!= &latest_index)
+            .filter(|p| {
+                if p == &latest_index {
+                    return false;
+                }
+                if let Some(name) = p.object_name() {
+                    if name == INITDB_PATH {
+                        return false;
+                    }
+                }
+                true
+            })
            .inspect(|path| {
                if let Some(name) = path.object_name() {
                    info!(%name, "deleting a file not referenced from index_part.json");
@@ -1081,7 +1154,9 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
                }

-                UploadOp::Barrier(_) => upload_queue.inprogress_tasks.is_empty(),
+                UploadOp::Barrier(_) | UploadOp::Shutdown => {
+                    upload_queue.inprogress_tasks.is_empty()
+                }
            };

            // If we cannot launch this task, don't look any further.
@@ -1094,6 +1169,13 @@ impl RemoteTimelineClient {
                break;
            }

+            if let UploadOp::Shutdown = next_op {
+                // leave the op in the queue but do not start more tasks; it will be dropped when
+                // the stop is called.
+                upload_queue.shutdown_ready.close();
+                break;
+            }
+
            // We can launch this task. Remove it from the queue first.
            let next_op = upload_queue.queued_operations.pop_front().unwrap();

@@ -1114,6 +1196,7 @@ impl RemoteTimelineClient {
                    sender.send_replace(());
                    continue;
                }
+                UploadOp::Shutdown => unreachable!("shutdown is intentionally never popped off"),
            };

            // Assign unique ID to this task
@@ -1132,12 +1215,12 @@ impl RemoteTimelineClient {

            // Spawn task to perform the task
            let self_rc = Arc::clone(self);
-            let tenant_id = self.tenant_id;
+            let tenant_shard_id = self.tenant_shard_id;
            let timeline_id = self.timeline_id;
            task_mgr::spawn(
                &self.runtime,
                TaskKind::RemoteUploadTask,
-                Some(self.tenant_id),
+                Some(self.tenant_shard_id.tenant_id),
                Some(self.timeline_id),
                "remote upload",
                false,
@@ -1145,7 +1228,7 @@ impl RemoteTimelineClient {
                    self_rc.perform_upload_task(task).await;
                    Ok(())
                }
-                .instrument(info_span!(parent: None, "remote_upload", %tenant_id, %timeline_id, %upload_task_id)),
+                .instrument(info_span!(parent: None, "remote_upload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %timeline_id, %upload_task_id)),
            );

            // Loop back to process next task
@@ -1197,7 +1280,7 @@ impl RemoteTimelineClient {
                        self.generation,
                    )
                    .measure_remote_op(
-                        self.tenant_id,
+                        self.tenant_shard_id.tenant_id,
                        self.timeline_id,
                        RemoteOpFileKind::Layer,
                        RemoteOpKind::Upload,
@@ -1217,13 +1300,13 @@ impl RemoteTimelineClient {

                    let res = upload::upload_index_part(
                        &self.storage_impl,
-                        &self.tenant_id,
+                        &self.tenant_shard_id,
                        &self.timeline_id,
                        self.generation,
                        index_part,
                    )
                    .measure_remote_op(
-                        self.tenant_id,
+                        self.tenant_shard_id.tenant_id,
                        self.timeline_id,
                        RemoteOpFileKind::Index,
                        RemoteOpKind::Upload,
@@ -1243,7 +1326,7 @@ impl RemoteTimelineClient {
                    pausable_failpoint!("before-delete-layer-pausable");
                    self.deletion_queue_client
                        .push_layers(
-                            self.tenant_id,
+                            self.tenant_shard_id,
                            self.timeline_id,
                            self.generation,
                            delete.layers.clone(),
@@ -1251,10 +1334,10 @@ impl RemoteTimelineClient {
                        .await
                        .map_err(|e| anyhow::anyhow!(e))
                }
-                UploadOp::Barrier(_) => {
+                unexpected @ UploadOp::Barrier(_) | unexpected @ UploadOp::Shutdown => {
                    // unreachable. Barrier operations are handled synchronously in
                    // launch_queued_tasks
-                    warn!("unexpected Barrier operation in perform_upload_task");
+                    warn!("unexpected {unexpected:?} operation in perform_upload_task");
                    break;
                }
            };
@@ -1348,7 +1431,7 @@ impl RemoteTimelineClient {
                    upload_queue.num_inprogress_deletions -= 1;
                    None
                }
-                UploadOp::Barrier(_) => unreachable!(),
+                UploadOp::Barrier(..) | UploadOp::Shutdown => unreachable!(),
            };

            // Launch any queued tasks that were unblocked by this one.
@@ -1362,7 +1445,7 @@ impl RemoteTimelineClient {
            // data safety guarantees (see docs/rfcs/025-generation-numbers.md)
            self.deletion_queue_client
                .update_remote_consistent_lsn(
-                    self.tenant_id,
+                    self.tenant_shard_id,
                    self.timeline_id,
                    self.generation,
                    lsn,
@@ -1403,7 +1486,7 @@ impl RemoteTimelineClient {
                    reason: "should we track deletes? positive or negative sign?",
                },
            ),
-            UploadOp::Barrier(_) => {
+            UploadOp::Barrier(..) | UploadOp::Shutdown => {
                // we do not account these
                return None;
            }
@@ -1429,10 +1512,13 @@ impl RemoteTimelineClient {
    }

    /// Close the upload queue for new operations and cancel queued operations.
+    ///
+    /// Use [`RemoteTimelineClient::shutdown`] for graceful stop.
+    ///
    /// In-progress operations will still be running after this function returns.
    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
    /// to wait for them to complete, after calling this function.
-    pub fn stop(&self) -> Result<(), StopError> {
+    pub(crate) fn stop(&self) -> Result<(), StopError> {
        // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
        // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
        // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
@@ -1470,6 +1556,8 @@ impl RemoteTimelineClient {
                        queued_operations: VecDeque::default(),
                        #[cfg(feature = "testing")]
                        dangling_files: HashMap::default(),
+                        shutting_down: false,
+                        shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
                    };

                    let upload_queue = std::mem::replace(
@@ -1515,24 +1603,32 @@ impl RemoteTimelineClient {
    }
 }

-pub fn remote_timelines_path(tenant_id: &TenantId) -> RemotePath {
-    let path = format!("tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}");
+pub fn remote_timelines_path(tenant_shard_id: &TenantShardId) -> RemotePath {
+    let path = format!("tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}");
    RemotePath::from_string(&path).expect("Failed to construct path")
 }

-pub fn remote_timeline_path(tenant_id: &TenantId, timeline_id: &TimelineId) -> RemotePath {
-    remote_timelines_path(tenant_id).join(Utf8Path::new(&timeline_id.to_string()))
+pub fn remote_timeline_path(
+    tenant_shard_id: &TenantShardId,
+    timeline_id: &TimelineId,
+) -> RemotePath {
+    remote_timelines_path(tenant_shard_id).join(Utf8Path::new(&timeline_id.to_string()))
 }

+/// Note that the shard component of a remote layer path is _not_ always the same
+/// as in the TenantShardId of the caller: tenants may reference layers from a different
+/// ShardIndex.  Use the ShardIndex from the layer's metadata.
 pub fn remote_layer_path(
    tenant_id: &TenantId,
    timeline_id: &TimelineId,
+    shard: ShardIndex,
    layer_file_name: &LayerFileName,
    generation: Generation,
 ) -> RemotePath {
    // Generation-aware key format
    let path = format!(
-        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
+        "tenants/{tenant_id}{0}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{1}{2}",
+        shard.get_suffix(),
        layer_file_name.file_name(),
        generation.get_suffix()
    );
@@ -1548,12 +1644,12 @@ pub fn remote_initdb_archive_path(tenant_id: &TenantId, timeline_id: &TimelineId
 }

 pub fn remote_index_path(
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    generation: Generation,
 ) -> RemotePath {
    RemotePath::from_string(&format!(
-        "tenants/{tenant_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
+        "tenants/{tenant_shard_id}/{TIMELINES_SEGMENT_NAME}/{timeline_id}/{0}{1}",
        IndexPart::FILE_NAME,
        generation.get_suffix()
    ))
@@ -1695,14 +1791,14 @@ mod tests {
            Arc::new(RemoteTimelineClient {
                conf: self.harness.conf,
                runtime: tokio::runtime::Handle::current(),
-                tenant_id: self.harness.tenant_id,
+                tenant_shard_id: self.harness.tenant_shard_id,
                timeline_id: TIMELINE_ID,
                generation,
                storage_impl: self.harness.remote_storage.clone(),
                deletion_queue_client: self.harness.deletion_queue.new_client(),
                upload_queue: Mutex::new(UploadQueue::Uninitialized),
                metrics: Arc::new(RemoteTimelineClientMetrics::new(
-                    &self.harness.tenant_id,
+                    &self.harness.tenant_shard_id,
                    &TIMELINE_ID,
                )),
            })
@@ -1778,6 +1874,7 @@ mod tests {
        println!("remote_timeline_dir: {remote_timeline_dir}");

        let generation = harness.generation;
+        let shard = harness.shard;

        // Create a couple of dummy files,  schedule upload for them

@@ -1794,7 +1891,7 @@ mod tests {
                harness.conf,
                &timeline,
                name,
-                LayerFileMetadata::new(contents.len() as u64, generation),
+                LayerFileMetadata::new(contents.len() as u64, generation, shard),
            )
        }).collect::<Vec<_>>();

@@ -1943,7 +2040,7 @@ mod tests {
            harness.conf,
            &timeline,
            layer_file_name_1.clone(),
-            LayerFileMetadata::new(content_1.len() as u64, harness.generation),
+            LayerFileMetadata::new(content_1.len() as u64, harness.generation, harness.shard),
        );

        #[derive(Debug, PartialEq, Clone, Copy)]
@@ -2029,7 +2126,12 @@ mod tests {
        std::fs::create_dir_all(remote_timeline_dir).expect("creating test dir should work");

        let index_path = test_state.harness.remote_fs_dir.join(
-            remote_index_path(&test_state.harness.tenant_id, &TIMELINE_ID, generation).get_path(),
+            remote_index_path(
+                &test_state.harness.tenant_shard_id,
+                &TIMELINE_ID,
+                generation,
+            )
+            .get_path(),
        );
        eprintln!("Writing {index_path}");
        std::fs::write(&index_path, index_part_bytes).unwrap();
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -8,10 +8,12 @@ use std::future::Future;
 use std::time::Duration;

 use anyhow::{anyhow, Context};
-use camino::Utf8Path;
-use tokio::fs;
-use tokio::io::AsyncWriteExt;
+use camino::{Utf8Path, Utf8PathBuf};
+use pageserver_api::shard::TenantShardId;
+use tokio::fs::{self, File, OpenOptions};
+use tokio::io::{AsyncSeekExt, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
+use tracing::warn;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
@@ -19,14 +21,15 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
+use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
-use utils::id::{TenantId, TimelineId};
+use utils::id::TimelineId;

 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
-    parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
-    FAILED_REMOTE_OP_RETRIES,
+    parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
+    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
 };

 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
@@ -39,7 +42,7 @@ static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
 pub async fn download_layer_file<'a>(
    conf: &'static PageServerConf,
    storage: &'a GenericRemoteStorage,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
    layer_file_name: &'a LayerFileName,
    layer_metadata: &'a LayerFileMetadata,
@@ -47,12 +50,13 @@ pub async fn download_layer_file<'a>(
    debug_assert_current_span_has_tenant_and_timeline_id();

    let local_path = conf
-        .timeline_path(&tenant_id, &timeline_id)
+        .timeline_path(&tenant_shard_id, &timeline_id)
        .join(layer_file_name.file_name());

    let remote_path = remote_layer_path(
-        &tenant_id,
+        &tenant_shard_id.tenant_id,
        &timeline_id,
+        layer_metadata.shard,
        layer_file_name,
        layer_metadata.generation,
    );
@@ -169,10 +173,10 @@ pub fn is_temp_download_file(path: &Utf8Path) -> bool {
 /// List timelines of given tenant in remote storage
 pub async fn list_remote_timelines(
    storage: &GenericRemoteStorage,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    cancel: CancellationToken,
 ) -> anyhow::Result<(HashSet<TimelineId>, HashSet<String>)> {
-    let remote_path = remote_timelines_path(&tenant_id);
+    let remote_path = remote_timelines_path(&tenant_shard_id);

    fail::fail_point!("storage-sync-list-remote-timelines", |_| {
        anyhow::bail!("storage-sync-list-remote-timelines");
@@ -180,7 +184,7 @@ pub async fn list_remote_timelines(

    let listing = download_retry_forever(
        || storage.list(Some(&remote_path), ListingMode::WithDelimiter),
-        &format!("list timelines for {tenant_id}"),
+        &format!("list timelines for {tenant_shard_id}"),
        cancel,
    )
    .await?;
@@ -190,7 +194,7 @@ pub async fn list_remote_timelines(

    for timeline_remote_storage_key in listing.prefixes {
        let object_name = timeline_remote_storage_key.object_name().ok_or_else(|| {
-            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_id}")
+            anyhow::anyhow!("failed to get timeline id for remote tenant {tenant_shard_id}")
        })?;

        match object_name.parse::<TimelineId>() {
@@ -211,12 +215,12 @@ pub async fn list_remote_timelines(

 async fn do_download_index_part(
    storage: &GenericRemoteStorage,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    index_generation: Generation,
    cancel: CancellationToken,
 ) -> Result<IndexPart, DownloadError> {
-    let remote_path = remote_index_path(tenant_id, timeline_id, index_generation);
+    let remote_path = remote_index_path(tenant_shard_id, timeline_id, index_generation);

    let index_part_bytes = download_retry_forever(
        || async {
@@ -252,7 +256,7 @@ async fn do_download_index_part(
 #[tracing::instrument(skip_all, fields(generation=?my_generation))]
 pub(super) async fn download_index_part(
    storage: &GenericRemoteStorage,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    my_generation: Generation,
    cancel: CancellationToken,
@@ -261,8 +265,14 @@ pub(super) async fn download_index_part(

    if my_generation.is_none() {
        // Operating without generations: just fetch the generation-less path
-        return do_download_index_part(storage, tenant_id, timeline_id, my_generation, cancel)
-            .await;
+        return do_download_index_part(
+            storage,
+            tenant_shard_id,
+            timeline_id,
+            my_generation,
+            cancel,
+        )
+        .await;
    }

    // Stale case: If we were intentionally attached in a stale generation, there may already be a remote
@@ -271,7 +281,7 @@ pub(super) async fn download_index_part(
    // This is an optimization to avoid doing the listing for the general case below.
    let res = do_download_index_part(
        storage,
-        tenant_id,
+        tenant_shard_id,
        timeline_id,
        my_generation,
        cancel.clone(),
@@ -298,7 +308,7 @@ pub(super) async fn download_index_part(
    // This is an optimization to avoid doing the listing for the general case below.
    let res = do_download_index_part(
        storage,
-        tenant_id,
+        tenant_shard_id,
        timeline_id,
        my_generation.previous(),
        cancel.clone(),
@@ -320,8 +330,9 @@ pub(super) async fn download_index_part(
    }

    // General case/fallback: if there is no index at my_generation or prev_generation, then list all index_part.json
-    // objects, and select the highest one with a generation <= my_generation.
-    let index_prefix = remote_index_path(tenant_id, timeline_id, Generation::none());
+    // objects, and select the highest one with a generation <= my_generation.  Constructing the prefix is equivalent
+    // to constructing a full index path with no generation, because the generation is a suffix.
+    let index_prefix = remote_index_path(tenant_shard_id, timeline_id, Generation::none());
    let indices = backoff::retry(
        || async { storage.list_files(Some(&index_prefix)).await },
        |_| false,
@@ -347,18 +358,87 @@ pub(super) async fn download_index_part(
    match max_previous_generation {
        Some(g) => {
            tracing::debug!("Found index_part in generation {g:?}");
-            do_download_index_part(storage, tenant_id, timeline_id, g, cancel).await
+            do_download_index_part(storage, tenant_shard_id, timeline_id, g, cancel).await
        }
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
-            tracing::info!("No index_part.json* found");
-            do_download_index_part(storage, tenant_id, timeline_id, Generation::none(), cancel)
-                .await
+            tracing::debug!("No index_part.json* found");
+            do_download_index_part(
+                storage,
+                tenant_shard_id,
+                timeline_id,
+                Generation::none(),
+                cancel,
+            )
+            .await
        }
    }
 }

+pub(crate) async fn download_initdb_tar_zst(
+    conf: &'static PageServerConf,
+    storage: &GenericRemoteStorage,
+    tenant_shard_id: &TenantShardId,
+    timeline_id: &TimelineId,
+) -> Result<(Utf8PathBuf, File), DownloadError> {
+    debug_assert_current_span_has_tenant_and_timeline_id();
+
+    let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
+
+    let timeline_path = conf.timelines_path(tenant_shard_id);
+
+    if !timeline_path.exists() {
+        tokio::fs::create_dir_all(&timeline_path)
+            .await
+            .with_context(|| format!("timeline dir creation {timeline_path}"))
+            .map_err(DownloadError::Other)?;
+    }
+    let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
+
+    let file = download_retry(
+        || async {
+            let mut file = OpenOptions::new()
+                .create(true)
+                .truncate(true)
+                .read(true)
+                .write(true)
+                .open(&temp_path)
+                .await
+                .with_context(|| format!("tempfile creation {temp_path}"))
+                .map_err(DownloadError::Other)?;
+
+            let mut download = storage.download(&remote_path).await?;
+
+            tokio::io::copy(&mut download.download_stream, &mut file)
+                .await
+                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
+                .map_err(DownloadError::Other)?;
+
+            file.seek(std::io::SeekFrom::Start(0))
+                .await
+                .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
+                .map_err(DownloadError::Other)?;
+
+            Ok(file)
+        },
+        &format!("download {remote_path}"),
+    )
+    .await
+    .map_err(|e| {
+        if temp_path.exists() {
+            // Do a best-effort attempt at deleting the temporary file upon encountering an error.
+            // We don't have async here nor do we want to pile on any extra errors.
+            if let Err(e) = std::fs::remove_file(&temp_path) {
+                warn!("error deleting temporary file {temp_path}: {e}");
+            }
+        }
+        e
+    })?;
+
+    Ok((temp_path, file))
+}
+
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -12,6 +12,7 @@ use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::upload_queue::UploadQueueInitialized;
 use crate::tenant::Generation;
+use pageserver_api::shard::ShardIndex;

 use utils::lsn::Lsn;

@@ -25,6 +26,8 @@ pub struct LayerFileMetadata {
    file_size: u64,

    pub(crate) generation: Generation,
+
+    pub(crate) shard: ShardIndex,
 }

 impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
@@ -32,15 +35,17 @@ impl From<&'_ IndexLayerMetadata> for LayerFileMetadata {
        LayerFileMetadata {
            file_size: other.file_size,
            generation: other.generation,
+            shard: other.shard,
        }
    }
 }

 impl LayerFileMetadata {
-    pub fn new(file_size: u64, generation: Generation) -> Self {
+    pub fn new(file_size: u64, generation: Generation, shard: ShardIndex) -> Self {
        LayerFileMetadata {
            file_size,
            generation,
+            shard,
        }
    }

@@ -161,6 +166,10 @@ pub struct IndexLayerMetadata {
    #[serde(default = "Generation::none")]
    #[serde(skip_serializing_if = "Generation::is_none")]
    pub generation: Generation,
+
+    #[serde(default = "ShardIndex::unsharded")]
+    #[serde(skip_serializing_if = "ShardIndex::is_unsharded")]
+    pub shard: ShardIndex,
 }

 impl From<LayerFileMetadata> for IndexLayerMetadata {
@@ -168,6 +177,7 @@ impl From<LayerFileMetadata> for IndexLayerMetadata {
        IndexLayerMetadata {
            file_size: other.file_size,
            generation: other.generation,
+            shard: other.shard,
        }
    }
 }
@@ -195,13 +205,15 @@ mod tests {
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
-                    generation: Generation::none()
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
-                    generation: Generation::none()
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
@@ -233,13 +245,15 @@ mod tests {
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
-                    generation: Generation::none()
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
-                    generation: Generation::none()
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
@@ -272,13 +286,15 @@ mod tests {
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
-                    generation: Generation::none()
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
-                    generation: Generation::none()
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
@@ -354,19 +370,21 @@ mod tests {
            layer_metadata: HashMap::from([
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata {
                    file_size: 25600000,
-                    generation: Generation::none()
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
                }),
                ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata {
                    // serde_json should always parse this but this might be a double with jq for
                    // example.
                    file_size: 9007199254741001,
-                    generation: Generation::none()
+                    generation: Generation::none(),
+                    shard: ShardIndex::unsharded()
                })
            ]),
            disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
            metadata: TimelineMetadata::from_bytes(&[113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]).unwrap(),
            deleted_at: Some(chrono::NaiveDateTime::parse_from_str(
-                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap())
+                "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()),
        };

        let part = IndexPart::from_s3_bytes(example.as_bytes()).unwrap();
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -4,6 +4,7 @@ use anyhow::{bail, Context};
 use bytes::Bytes;
 use camino::Utf8Path;
 use fail::fail_point;
+use pageserver_api::shard::TenantShardId;
 use std::io::ErrorKind;
 use tokio::fs;

@@ -24,7 +25,7 @@ use tracing::info;
 /// Serializes and uploads the given index part data to the remote storage.
 pub(super) async fn upload_index_part<'a>(
    storage: &'a GenericRemoteStorage,
-    tenant_id: &TenantId,
+    tenant_shard_id: &TenantShardId,
    timeline_id: &TimelineId,
    generation: Generation,
    index_part: &'a IndexPart,
@@ -42,11 +43,11 @@ pub(super) async fn upload_index_part<'a>(
    let index_part_size = index_part_bytes.len();
    let index_part_bytes = tokio::io::BufReader::new(std::io::Cursor::new(index_part_bytes));

-    let remote_path = remote_index_path(tenant_id, timeline_id, generation);
+    let remote_path = remote_index_path(tenant_shard_id, timeline_id, generation);
    storage
        .upload_storage_object(Box::new(index_part_bytes), index_part_size, &remote_path)
        .await
-        .with_context(|| format!("upload index part for '{tenant_id} / {timeline_id}'"))
+        .with_context(|| format!("upload index part for '{tenant_shard_id} / {timeline_id}'"))
 }

 /// Attempts to upload given layer files.
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -2,7 +2,7 @@

 pub mod delta_layer;
 mod filename;
-mod image_layer;
+pub mod image_layer;
 mod inmemory_layer;
 mod layer;
 mod layer_desc;
@@ -24,10 +24,7 @@ use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
 use utils::rate_limit::RateLimit;

-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::{id::TimelineId, lsn::Lsn};

 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
@@ -304,12 +301,14 @@ pub trait AsLayerDesc {
 }

 pub mod tests {
+    use pageserver_api::shard::TenantShardId;
+
    use super::*;

    impl From<DeltaFileName> for PersistentLayerDesc {
        fn from(value: DeltaFileName) -> Self {
            PersistentLayerDesc::new_delta(
-                TenantId::from_array([0; 16]),
+                TenantShardId::from([0; 18]),
                TimelineId::from_array([0; 16]),
                value.key_range,
                value.lsn_range,
@@ -321,7 +320,7 @@ pub mod tests {
    impl From<ImageFileName> for PersistentLayerDesc {
        fn from(value: ImageFileName) -> Self {
            PersistentLayerDesc::new_img(
-                TenantId::from_array([0; 16]),
+                TenantShardId::from([0; 18]),
                TimelineId::from_array([0; 16]),
                value.key_range,
                value.lsn,
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -42,6 +42,7 @@ use crate::{DELTA_FILE_MAGIC, STORAGE_FORMAT_VERSION};
 use anyhow::{bail, ensure, Context, Result};
 use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::LayerAccessKind;
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -69,13 +70,13 @@ use super::{AsLayerDesc, LayerAccessStats, PersistentLayerDesc, ResidentLayer};
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
 pub struct Summary {
    /// Magic value to identify this as a neon delta file. Always DELTA_FILE_MAGIC.
-    magic: u16,
-    format_version: u16,
+    pub magic: u16,
+    pub format_version: u16,

-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    key_range: Range<Key>,
-    lsn_range: Range<Lsn>,
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,

    /// Block number where the 'index' part of the file begins.
    pub index_start_blk: u32,
@@ -86,7 +87,7 @@ pub struct Summary {
 impl From<&DeltaLayer> for Summary {
    fn from(layer: &DeltaLayer) -> Self {
        Self::expected(
-            layer.desc.tenant_id,
+            layer.desc.tenant_shard_id.tenant_id,
            layer.desc.timeline_id,
            layer.desc.key_range.clone(),
            layer.desc.lsn_range.clone(),
@@ -248,7 +249,7 @@ impl DeltaLayer {

    fn temp_path_for(
        conf: &PageServerConf,
-        tenant_id: &TenantId,
+        tenant_shard_id: &TenantShardId,
        timeline_id: &TimelineId,
        key_start: Key,
        lsn_range: &Range<Lsn>,
@@ -259,14 +260,15 @@ impl DeltaLayer {
            .map(char::from)
            .collect();

-        conf.timeline_path(tenant_id, timeline_id).join(format!(
-            "{}-XXX__{:016X}-{:016X}.{}.{}",
-            key_start,
-            u64::from(lsn_range.start),
-            u64::from(lsn_range.end),
-            rand_string,
-            TEMP_FILE_SUFFIX,
-        ))
+        conf.timeline_path(tenant_shard_id, timeline_id)
+            .join(format!(
+                "{}-XXX__{:016X}-{:016X}.{}.{}",
+                key_start,
+                u64::from(lsn_range.start),
+                u64::from(lsn_range.end),
+                rand_string,
+                TEMP_FILE_SUFFIX,
+            ))
    }

    ///
@@ -318,10 +320,14 @@ impl DeltaLayer {
            .metadata()
            .context("get file metadata to determine size")?;

+        // TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
+        // we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
+        let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
+
        Ok(DeltaLayer {
            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_delta(
-                summary.tenant_id,
+                tenant_shard_id,
                summary.timeline_id,
                summary.key_range,
                summary.lsn_range,
@@ -353,7 +359,7 @@ struct DeltaLayerWriterInner {
    conf: &'static PageServerConf,
    pub path: Utf8PathBuf,
    timeline_id: TimelineId,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,

    key_start: Key,
    lsn_range: Range<Lsn>,
@@ -370,7 +376,7 @@ impl DeltaLayerWriterInner {
    async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        key_start: Key,
        lsn_range: Range<Lsn>,
    ) -> anyhow::Result<Self> {
@@ -380,7 +386,8 @@ impl DeltaLayerWriterInner {
        //
        // Note: This overwrites any existing file. There shouldn't be any.
        // FIXME: throw an error instead?
-        let path = DeltaLayer::temp_path_for(conf, &tenant_id, &timeline_id, key_start, &lsn_range);
+        let path =
+            DeltaLayer::temp_path_for(conf, &tenant_shard_id, &timeline_id, key_start, &lsn_range);

        let mut file = VirtualFile::create(&path).await?;
        // make room for the header block
@@ -395,7 +402,7 @@ impl DeltaLayerWriterInner {
            conf,
            path,
            timeline_id,
-            tenant_id,
+            tenant_shard_id,
            key_start,
            lsn_range,
            tree: tree_builder,
@@ -457,7 +464,7 @@ impl DeltaLayerWriterInner {
        let summary = Summary {
            magic: DELTA_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
-            tenant_id: self.tenant_id,
+            tenant_id: self.tenant_shard_id.tenant_id,
            timeline_id: self.timeline_id,
            key_range: self.key_start..key_end,
            lsn_range: self.lsn_range.clone(),
@@ -498,7 +505,7 @@ impl DeltaLayerWriterInner {
        // set inner.file here. The first read will have to re-open it.

        let desc = PersistentLayerDesc::new_delta(
-            self.tenant_id,
+            self.tenant_shard_id,
            self.timeline_id,
            self.key_start..key_end,
            self.lsn_range.clone(),
@@ -549,14 +556,20 @@ impl DeltaLayerWriter {
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        key_start: Key,
        lsn_range: Range<Lsn>,
    ) -> anyhow::Result<Self> {
        Ok(Self {
            inner: Some(
-                DeltaLayerWriterInner::new(conf, timeline_id, tenant_id, key_start, lsn_range)
-                    .await?,
+                DeltaLayerWriterInner::new(
+                    conf,
+                    timeline_id,
+                    tenant_shard_id,
+                    key_start,
+                    lsn_range,
+                )
+                .await?,
            ),
        })
    }
@@ -611,6 +624,61 @@ impl Drop for DeltaLayerWriter {
    }
 }

+#[derive(thiserror::Error, Debug)]
+pub enum RewriteSummaryError {
+    #[error("magic mismatch")]
+    MagicMismatch,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<std::io::Error> for RewriteSummaryError {
+    fn from(e: std::io::Error) -> Self {
+        Self::Other(anyhow::anyhow!(e))
+    }
+}
+
+impl DeltaLayer {
+    pub async fn rewrite_summary<F>(
+        path: &Utf8Path,
+        rewrite: F,
+        ctx: &RequestContext,
+    ) -> Result<(), RewriteSummaryError>
+    where
+        F: Fn(Summary) -> Summary,
+    {
+        let file = VirtualFile::open_with_options(
+            path,
+            &*std::fs::OpenOptions::new().read(true).write(true),
+        )
+        .await
+        .with_context(|| format!("Failed to open file '{}'", path))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0, ctx).await?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
+        let mut file = file.file;
+        if actual_summary.magic != DELTA_FILE_MAGIC {
+            return Err(RewriteSummaryError::MagicMismatch);
+        }
+
+        let new_summary = rewrite(actual_summary);
+
+        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
+        if buf.spilled() {
+            // The code in DeltaLayerWriterInner just warn!()s for this.
+            // It should probably error out as well.
+            return Err(RewriteSummaryError::Other(anyhow::anyhow!(
+                "Used more than one page size for summary buffer: {}",
+                buf.len()
+            )));
+        }
+        file.seek(SeekFrom::Start(0)).await?;
+        file.write_all(&buf).await?;
+        Ok(())
+    }
+}
+
 impl DeltaLayerInner {
    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
    /// - inner has the success or transient failure
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -41,6 +41,7 @@ use bytes::Bytes;
 use camino::{Utf8Path, Utf8PathBuf};
 use hex;
 use pageserver_api::models::LayerAccessKind;
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use serde::{Deserialize, Serialize};
 use std::fs::File;
@@ -67,27 +68,27 @@ use super::{AsLayerDesc, Layer, PersistentLayerDesc, ResidentLayer};
 /// the 'index' starts at the block indicated by 'index_start_blk'
 ///
 #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)]
-pub(super) struct Summary {
+pub struct Summary {
    /// Magic value to identify this as a neon image file. Always IMAGE_FILE_MAGIC.
-    magic: u16,
-    format_version: u16,
+    pub magic: u16,
+    pub format_version: u16,

-    tenant_id: TenantId,
-    timeline_id: TimelineId,
-    key_range: Range<Key>,
-    lsn: Lsn,
+    pub tenant_id: TenantId,
+    pub timeline_id: TimelineId,
+    pub key_range: Range<Key>,
+    pub lsn: Lsn,

    /// Block number where the 'index' part of the file begins.
-    index_start_blk: u32,
+    pub index_start_blk: u32,
    /// Block within the 'index', where the B-tree root page is stored
-    index_root_blk: u32,
+    pub index_root_blk: u32,
    // the 'values' part starts after the summary header, on block 1.
 }

 impl From<&ImageLayer> for Summary {
    fn from(layer: &ImageLayer) -> Self {
        Self::expected(
-            layer.desc.tenant_id,
+            layer.desc.tenant_shard_id.tenant_id,
            layer.desc.timeline_id,
            layer.desc.key_range.clone(),
            layer.lsn,
@@ -217,7 +218,7 @@ impl ImageLayer {
    fn temp_path_for(
        conf: &PageServerConf,
        timeline_id: TimelineId,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        fname: &ImageFileName,
    ) -> Utf8PathBuf {
        let rand_string: String = rand::thread_rng()
@@ -226,7 +227,7 @@ impl ImageLayer {
            .map(char::from)
            .collect();

-        conf.timeline_path(&tenant_id, &timeline_id)
+        conf.timeline_path(&tenant_shard_id, &timeline_id)
            .join(format!("{fname}.{rand_string}.{TEMP_FILE_SUFFIX}"))
    }

@@ -276,10 +277,15 @@ impl ImageLayer {
        let metadata = file
            .metadata()
            .context("get file metadata to determine size")?;
+
+        // TODO(sharding): we should get TenantShardId from path.
+        // OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
+        let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
+
        Ok(ImageLayer {
            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_img(
-                summary.tenant_id,
+                tenant_shard_id,
                summary.timeline_id,
                summary.key_range,
                summary.lsn,
@@ -296,6 +302,61 @@ impl ImageLayer {
    }
 }

+#[derive(thiserror::Error, Debug)]
+pub enum RewriteSummaryError {
+    #[error("magic mismatch")]
+    MagicMismatch,
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
+impl From<std::io::Error> for RewriteSummaryError {
+    fn from(e: std::io::Error) -> Self {
+        Self::Other(anyhow::anyhow!(e))
+    }
+}
+
+impl ImageLayer {
+    pub async fn rewrite_summary<F>(
+        path: &Utf8Path,
+        rewrite: F,
+        ctx: &RequestContext,
+    ) -> Result<(), RewriteSummaryError>
+    where
+        F: Fn(Summary) -> Summary,
+    {
+        let file = VirtualFile::open_with_options(
+            path,
+            &*std::fs::OpenOptions::new().read(true).write(true),
+        )
+        .await
+        .with_context(|| format!("Failed to open file '{}'", path))?;
+        let file = FileBlockReader::new(file);
+        let summary_blk = file.read_blk(0, ctx).await?;
+        let actual_summary = Summary::des_prefix(summary_blk.as_ref()).context("deserialize")?;
+        let mut file = file.file;
+        if actual_summary.magic != IMAGE_FILE_MAGIC {
+            return Err(RewriteSummaryError::MagicMismatch);
+        }
+
+        let new_summary = rewrite(actual_summary);
+
+        let mut buf = smallvec::SmallVec::<[u8; PAGE_SZ]>::new();
+        Summary::ser_into(&new_summary, &mut buf).context("serialize")?;
+        if buf.spilled() {
+            // The code in ImageLayerWriterInner just warn!()s for this.
+            // It should probably error out as well.
+            return Err(RewriteSummaryError::Other(anyhow::anyhow!(
+                "Used more than one page size for summary buffer: {}",
+                buf.len()
+            )));
+        }
+        file.seek(SeekFrom::Start(0)).await?;
+        file.write_all(&buf).await?;
+        Ok(())
+    }
+}
+
 impl ImageLayerInner {
    /// Returns nested result following Result<Result<_, OpErr>, Critical>:
    /// - inner has the success or transient failure
@@ -400,7 +461,7 @@ struct ImageLayerWriterInner {
    conf: &'static PageServerConf,
    path: Utf8PathBuf,
    timeline_id: TimelineId,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    key_range: Range<Key>,
    lsn: Lsn,

@@ -415,7 +476,7 @@ impl ImageLayerWriterInner {
    async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        key_range: &Range<Key>,
        lsn: Lsn,
    ) -> anyhow::Result<Self> {
@@ -424,7 +485,7 @@ impl ImageLayerWriterInner {
        let path = ImageLayer::temp_path_for(
            conf,
            timeline_id,
-            tenant_id,
+            tenant_shard_id,
            &ImageFileName {
                key_range: key_range.clone(),
                lsn,
@@ -448,7 +509,7 @@ impl ImageLayerWriterInner {
            conf,
            path,
            timeline_id,
-            tenant_id,
+            tenant_shard_id,
            key_range: key_range.clone(),
            lsn,
            tree: tree_builder,
@@ -495,7 +556,7 @@ impl ImageLayerWriterInner {
        let summary = Summary {
            magic: IMAGE_FILE_MAGIC,
            format_version: STORAGE_FORMAT_VERSION,
-            tenant_id: self.tenant_id,
+            tenant_id: self.tenant_shard_id.tenant_id,
            timeline_id: self.timeline_id,
            key_range: self.key_range.clone(),
            lsn: self.lsn,
@@ -521,7 +582,7 @@ impl ImageLayerWriterInner {
            .context("get metadata to determine file size")?;

        let desc = PersistentLayerDesc::new_img(
-            self.tenant_id,
+            self.tenant_shard_id,
            self.timeline_id,
            self.key_range.clone(),
            self.lsn,
@@ -577,13 +638,14 @@ impl ImageLayerWriter {
    pub async fn new(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        key_range: &Range<Key>,
        lsn: Lsn,
    ) -> anyhow::Result<ImageLayerWriter> {
        Ok(Self {
            inner: Some(
-                ImageLayerWriterInner::new(conf, timeline_id, tenant_id, key_range, lsn).await?,
+                ImageLayerWriterInner::new(conf, timeline_id, tenant_shard_id, key_range, lsn)
+                    .await?,
            ),
        })
    }
--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -14,15 +14,11 @@ use crate::tenant::Timeline;
 use crate::walrecord;
 use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
+use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::sync::{Arc, OnceLock};
 use tracing::*;
-use utils::{
-    bin_ser::BeSer,
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-    vec_map::VecMap,
-};
+use utils::{bin_ser::BeSer, id::TimelineId, lsn::Lsn, vec_map::VecMap};
 // avoid binding to Write (conflicts with std::io::Write)
 // while being able to use std::fmt::Write's methods
 use std::fmt::Write as _;
@@ -33,7 +29,7 @@ use super::{DeltaLayerWriter, ResidentLayer};

 pub struct InMemoryLayer {
    conf: &'static PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,

    /// This layer contains all the changes from 'start_lsn'. The
@@ -226,17 +222,17 @@ impl InMemoryLayer {
    pub async fn create(
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        start_lsn: Lsn,
    ) -> Result<InMemoryLayer> {
        trace!("initializing new empty InMemoryLayer for writing on timeline {timeline_id} at {start_lsn}");

-        let file = EphemeralFile::create(conf, tenant_id, timeline_id).await?;
+        let file = EphemeralFile::create(conf, tenant_shard_id, timeline_id).await?;

        Ok(InMemoryLayer {
            conf,
            timeline_id,
-            tenant_id,
+            tenant_shard_id,
            start_lsn,
            end_lsn: OnceLock::new(),
            inner: RwLock::new(InMemoryLayerInner {
@@ -335,7 +331,7 @@ impl InMemoryLayer {
        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
-            self.tenant_id,
+            self.tenant_shard_id,
            Key::MIN,
            self.start_lsn..end_lsn,
        )
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -3,6 +3,7 @@ use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::{
    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
+use pageserver_api::shard::ShardIndex;
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
@@ -81,7 +82,7 @@ impl Layer {
        metadata: LayerFileMetadata,
    ) -> Self {
        let desc = PersistentLayerDesc::from_filename(
-            timeline.tenant_id,
+            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
            metadata.file_size(),
@@ -96,6 +97,7 @@ impl Layer {
            desc,
            None,
            metadata.generation,
+            metadata.shard,
        )));

        debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());
@@ -111,7 +113,7 @@ impl Layer {
        metadata: LayerFileMetadata,
    ) -> ResidentLayer {
        let desc = PersistentLayerDesc::from_filename(
-            timeline.tenant_id,
+            timeline.tenant_shard_id,
            timeline.timeline_id,
            file_name,
            metadata.file_size(),
@@ -136,6 +138,7 @@ impl Layer {
                desc,
                Some(inner),
                metadata.generation,
+                metadata.shard,
            )
        }));

@@ -179,6 +182,7 @@ impl Layer {
                desc,
                Some(inner),
                timeline.generation,
+                timeline.get_shard_index(),
            )
        }));

@@ -226,6 +230,10 @@ impl Layer {
    ///
    /// It is up to the caller to collect more data from the previous layer and
    /// perform WAL redo, if necessary.
+    ///
+    /// # Cancellation-Safety
+    ///
+    /// This method is cancellation-safe.
    pub(crate) async fn get_value_reconstruct_data(
        &self,
        key: Key,
@@ -322,6 +330,24 @@ impl Layer {

        Ok(())
    }
+
+    /// Waits until this layer has been dropped (and if needed, local garbage collection and remote
+    /// deletion scheduling has completed).
+    ///
+    /// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that
+    /// separatedly.
+    #[cfg(feature = "testing")]
+    pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
+        let mut rx = self.0.status.subscribe();
+
+        async move {
+            loop {
+                if let Err(tokio::sync::broadcast::error::RecvError::Closed) = rx.recv().await {
+                    break;
+                }
+            }
+        }
+    }
 }

 /// The download-ness ([`DownloadedLayer`]) can be either resident or wanted evicted.
@@ -426,6 +452,15 @@ struct LayerInner {
    /// For loaded layers (resident or evicted) this comes from [`LayerFileMetadata::generation`],
    /// for created layers from [`Timeline::generation`].
    generation: Generation,
+
+    /// The shard of this Layer.
+    ///
+    /// For layers created in this process, this will always be the [`ShardIndex`] of the
+    /// current `ShardIdentity`` (TODO: add link once it's introduced).
+    ///
+    /// For loaded layers, this may be some other value if the tenant has undergone
+    /// a shard split since the layer was originally written.
+    shard: ShardIndex,
 }

 impl std::fmt::Display for LayerInner {
@@ -455,17 +490,21 @@ impl Drop for LayerInner {
            return;
        }

-        let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_id, timeline_id = %self.layer_desc().timeline_id);
+        let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);

        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().filename();
-        let gen = self.generation;
        let file_size = self.layer_desc().file_size;
        let timeline = self.timeline.clone();
+        let meta = self.metadata();
+        let status = self.status.clone();

        crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
            let _g = span.entered();

+            // carry this until we are finished for [`Layer::wait_drop`] support
+            let _status = status;
+
            let removed = match std::fs::remove_file(path) {
                Ok(()) => true,
                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
@@ -489,7 +528,7 @@ impl Drop for LayerInner {
                    timeline.metrics.resident_physical_size_sub(file_size);
                }
                if let Some(remote_client) = timeline.remote_client.as_ref() {
-                    let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, gen)]);
+                    let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);

                    if let Err(e) = res {
                        // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
@@ -523,9 +562,10 @@ impl LayerInner {
        desc: PersistentLayerDesc,
        downloaded: Option<Arc<DownloadedLayer>>,
        generation: Generation,
+        shard: ShardIndex,
    ) -> Self {
        let path = conf
-            .timeline_path(&timeline.tenant_id, &timeline.timeline_id)
+            .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
            .join(desc.filename().to_string());

        let (inner, version) = if let Some(inner) = downloaded {
@@ -550,6 +590,7 @@ impl LayerInner {
            status: tokio::sync::broadcast::channel(1).0,
            consecutive_failures: AtomicUsize::new(0),
            generation,
+            shard,
        }
    }

@@ -795,7 +836,7 @@ impl LayerInner {
        crate::task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            crate::task_mgr::TaskKind::RemoteDownloadTask,
-            Some(self.desc.tenant_id),
+            Some(self.desc.tenant_shard_id.tenant_id),
            Some(self.desc.timeline_id),
            &task_name,
            false,
@@ -960,7 +1001,7 @@ impl LayerInner {
        if gc {
            // do nothing now, only in LayerInner::drop
        } else if can_evict && evict {
-            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_id, timeline_id = %self.desc.timeline_id, layer=%self, %version);
+            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);

            // downgrade for queueing, in case there's a tear down already ongoing we should not
            // hold it alive.
@@ -1077,7 +1118,7 @@ impl LayerInner {
    }

    fn metadata(&self) -> LayerFileMetadata {
-        LayerFileMetadata::new(self.desc.file_size, self.generation)
+        LayerFileMetadata::new(self.desc.file_size, self.generation, self.shard)
    }
 }

@@ -1192,7 +1233,7 @@ impl DownloadedLayer {

            let res = if owner.desc.is_delta {
                let summary = Some(delta_layer::Summary::expected(
-                    owner.desc.tenant_id,
+                    owner.desc.tenant_shard_id.tenant_id,
                    owner.desc.timeline_id,
                    owner.desc.key_range.clone(),
                    owner.desc.lsn_range.clone(),
@@ -1203,7 +1244,7 @@ impl DownloadedLayer {
            } else {
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
-                    owner.desc.tenant_id,
+                    owner.desc.tenant_shard_id.tenant_id,
                    owner.desc.timeline_id,
                    owner.desc.key_range.clone(),
                    lsn,
@@ -1401,6 +1442,7 @@ impl Default for LayerImplMetrics {
        )
        .unwrap();

+        // reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix
        let gcs = metrics::register_int_counter_vec!(
            "pageserver_layer_gcs_count",
            "Garbage collections started and completed in the Layer implementation",
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,9 +1,7 @@
 use core::fmt::Display;
+use pageserver_api::shard::TenantShardId;
 use std::ops::Range;
-use utils::{
-    id::{TenantId, TimelineId},
-    lsn::Lsn,
-};
+use utils::{id::TimelineId, lsn::Lsn};

 use crate::repository::Key;

@@ -11,12 +9,15 @@ use super::{DeltaFileName, ImageFileName, LayerFileName};

 use serde::{Deserialize, Serialize};

+#[cfg(test)]
+use utils::id::TenantId;
+
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct PersistentLayerDesc {
-    pub tenant_id: TenantId,
+    pub tenant_shard_id: TenantShardId,
    pub timeline_id: TimelineId,
    /// Range of keys that this layer covers
    pub key_range: Range<Key>,
@@ -56,7 +57,7 @@ impl PersistentLayerDesc {
    #[cfg(test)]
    pub fn new_test(key_range: Range<Key>) -> Self {
        Self {
-            tenant_id: TenantId::generate(),
+            tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
            timeline_id: TimelineId::generate(),
            key_range,
            lsn_range: Lsn(0)..Lsn(1),
@@ -66,14 +67,14 @@ impl PersistentLayerDesc {
    }

    pub fn new_img(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        key_range: Range<Key>,
        lsn: Lsn,
        file_size: u64,
    ) -> Self {
        Self {
-            tenant_id,
+            tenant_shard_id,
            timeline_id,
            key_range,
            lsn_range: Self::image_layer_lsn_range(lsn),
@@ -83,14 +84,14 @@ impl PersistentLayerDesc {
    }

    pub fn new_delta(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        key_range: Range<Key>,
        lsn_range: Range<Lsn>,
        file_size: u64,
    ) -> Self {
        Self {
-            tenant_id,
+            tenant_shard_id,
            timeline_id,
            key_range,
            lsn_range,
@@ -100,18 +101,22 @@ impl PersistentLayerDesc {
    }

    pub fn from_filename(
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        timeline_id: TimelineId,
        filename: LayerFileName,
        file_size: u64,
    ) -> Self {
        match filename {
            LayerFileName::Image(i) => {
-                Self::new_img(tenant_id, timeline_id, i.key_range, i.lsn, file_size)
-            }
-            LayerFileName::Delta(d) => {
-                Self::new_delta(tenant_id, timeline_id, d.key_range, d.lsn_range, file_size)
+                Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
            }
+            LayerFileName::Delta(d) => Self::new_delta(
+                tenant_shard_id,
+                timeline_id,
+                d.key_range,
+                d.lsn_range,
+                file_size,
+            ),
        }
    }

@@ -172,10 +177,6 @@ impl PersistentLayerDesc {
        self.timeline_id
    }

-    pub fn get_tenant_id(&self) -> TenantId {
-        self.tenant_id
-    }
-
    /// Does this layer only contain some data for the key-range (incremental),
    /// or does it contain a version of every page? This is important to know
    /// for garbage collecting old layers: an incremental layer depends on
@@ -192,7 +193,7 @@ impl PersistentLayerDesc {
        if self.is_delta {
            println!(
                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
-                self.tenant_id,
+                self.tenant_shard_id,
                self.timeline_id,
                self.key_range.start,
                self.key_range.end,
@@ -204,7 +205,7 @@ impl PersistentLayerDesc {
        } else {
            println!(
                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
-                self.tenant_id,
+                self.tenant_shard_id,
                self.timeline_id,
                self.key_range.start,
                self.key_range.end,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -44,6 +44,7 @@ pub(crate) enum BackgroundLoopKind {
    Eviction,
    ConsumptionMetricsCollectMetrics,
    ConsumptionMetricsSyntheticSizeWorker,
+    InitialLogicalSizeCalculation,
 }

 impl BackgroundLoopKind {
@@ -86,7 +87,7 @@ pub fn start_background_loops(
    tenant: &Arc<Tenant>,
    background_jobs_can_start: Option<&completion::Barrier>,
 ) {
-    let tenant_id = tenant.tenant_id;
+    let tenant_id = tenant.tenant_shard_id.tenant_id;
    task_mgr::spawn(
        BACKGROUND_RUNTIME.handle(),
        TaskKind::Compaction,
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -4,13 +4,10 @@ use std::{
 };

 use anyhow::Context;
-use pageserver_api::models::TimelineState;
+use pageserver_api::{models::TimelineState, shard::TenantShardId};
 use tokio::sync::OwnedMutexGuard;
 use tracing::{debug, error, info, instrument, warn, Instrument, Span};
-use utils::{
-    crashsafe, fs_ext,
-    id::{TenantId, TimelineId},
-};
+use utils::{crashsafe, fs_ext, id::TimelineId};

 use crate::{
    config::PageServerConf,
@@ -24,7 +21,6 @@ use crate::{
        },
        CreateTimelineCause, DeleteTimelineError, Tenant,
    },
-    InitializationOrder,
 };

 use super::{Timeline, TimelineResources};
@@ -47,7 +43,7 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    // Shut down the layer flush task before the remote client, as one depends on the other
    task_mgr::shutdown_tasks(
        Some(TaskKind::LayerFlushTask),
-        Some(timeline.tenant_id),
+        Some(timeline.tenant_shard_id.tenant_id),
        Some(timeline.timeline_id),
    )
    .await;
@@ -73,7 +69,12 @@ async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> {
    // NB: This and other delete_timeline calls do not run as a task_mgr task,
    //     so, they are not affected by this shutdown_tasks() call.
    info!("waiting for timeline tasks to shutdown");
-    task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await;
+    task_mgr::shutdown_tasks(
+        None,
+        Some(timeline.tenant_shard_id.tenant_id),
+        Some(timeline.timeline_id),
+    )
+    .await;

    fail::fail_point!("timeline-delete-before-index-deleted-at", |_| {
        Err(anyhow::anyhow!(
@@ -110,11 +111,11 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
    Ok(())
 }

-/// Grab the layer_removal_cs lock, and actually perform the deletion.
+/// Grab the compaction and gc locks, and actually perform the deletion.
 ///
-/// This lock prevents prevents GC or compaction from running at the same time.
-/// The GC task doesn't register itself with the timeline it's operating on,
-/// so it might still be running even though we called `shutdown_tasks`.
+/// The locks prevent GC or compaction from running at the same time. The background tasks do not
+/// register themselves with the timeline it's operating on, so it might still be running even
+/// though we called `shutdown_tasks`.
 ///
 /// Note that there are still other race conditions between
 /// GC, compaction and timeline deletion. See
@@ -122,19 +123,24 @@ async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTi
 ///
 /// No timeout here, GC & Compaction should be responsive to the
 /// `TimelineState::Stopping` change.
-async fn delete_local_layer_files(
+// pub(super): documentation link
+pub(super) async fn delete_local_layer_files(
    conf: &PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    timeline: &Timeline,
 ) -> anyhow::Result<()> {
-    info!("waiting for layer_removal_cs.lock()");
-    let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-    info!("got layer_removal_cs.lock(), deleting layer files");
+    let guards = async { tokio::join!(timeline.gc_lock.lock(), timeline.compaction_lock.lock()) };
+    let guards = crate::timed(
+        guards,
+        "acquire gc and compaction locks",
+        std::time::Duration::from_secs(5),
+    )
+    .await;

    // NB: storage_sync upload tasks that reference these layers have been cancelled
    //     by the caller.

-    let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id);
+    let local_timeline_directory = conf.timeline_path(&tenant_shard_id, &timeline.timeline_id);

    fail::fail_point!("timeline-delete-before-rm", |_| {
        Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
@@ -150,8 +156,8 @@ async fn delete_local_layer_files(
    // because of a previous failure/cancellation at/after
    // failpoint timeline-delete-after-rm.
    //
-    // It can also happen if we race with tenant detach, because,
-    // it doesn't grab the layer_removal_cs lock.
+    // ErrorKind::NotFound can also happen if we race with tenant detach, because,
+    // no locks are shared.
    //
    // For now, log and continue.
    // warn! level is technically not appropriate for the
@@ -170,7 +176,7 @@ async fn delete_local_layer_files(
        return Ok(());
    }

-    let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id);
+    let metadata_path = conf.metadata_path(&tenant_shard_id, &timeline.timeline_id);

    for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) {
        #[cfg(feature = "testing")]
@@ -219,8 +225,8 @@ async fn delete_local_layer_files(
        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
    }

-    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
-    drop(layer_removal_guard);
+    info!("finished deleting layer files, releasing locks");
+    drop(guards);

    fail::fail_point!("timeline-delete-after-rm", |_| {
        Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
@@ -245,11 +251,11 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<(
 // (nothing can fail after its deletion)
 async fn cleanup_remaining_timeline_fs_traces(
    conf: &PageServerConf,
-    tenant_id: TenantId,
+    tenant_shard_id: TenantShardId,
    timeline_id: TimelineId,
 ) -> anyhow::Result<()> {
    // Remove local metadata
-    tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id))
+    tokio::fs::remove_file(conf.metadata_path(&tenant_shard_id, &timeline_id))
        .await
        .or_else(fs_ext::ignore_not_found)
        .context("remove metadata")?;
@@ -261,7 +267,7 @@ async fn cleanup_remaining_timeline_fs_traces(
    });

    // Remove timeline dir
-    tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id))
+    tokio::fs::remove_dir(conf.timeline_path(&tenant_shard_id, &timeline_id))
        .await
        .or_else(fs_ext::ignore_not_found)
        .context("timeline dir")?;
@@ -276,7 +282,7 @@ async fn cleanup_remaining_timeline_fs_traces(
    // to be reordered later and thus missed if a crash occurs.
    // Note that we dont need to sync after mark file is removed
    // because we can tolerate the case when mark file reappears on startup.
-    let timeline_path = conf.timelines_path(&tenant_id);
+    let timeline_path = conf.timelines_path(&tenant_shard_id);
    crashsafe::fsync_async(timeline_path)
        .await
        .context("fsync_pre_mark_remove")?;
@@ -284,7 +290,7 @@ async fn cleanup_remaining_timeline_fs_traces(
    // Remove delete mark
    // TODO: once we are confident that no more exist in the field, remove this
    // line.  It cleans up a legacy marker file that might in rare cases be present.
-    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id))
+    tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_shard_id, timeline_id))
        .await
        .or_else(fs_ext::ignore_not_found)
        .context("remove delete mark")
@@ -350,7 +356,7 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
+    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_shard_id.tenant_id, shard_id=%tenant.tenant_shard_id.shard_slug()))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
@@ -400,7 +406,6 @@ impl DeleteTimelineFlow {
        local_metadata: &TimelineMetadata,
        remote_client: Option<RemoteTimelineClient>,
        deletion_queue_client: DeletionQueueClient,
-        init_order: Option<&InitializationOrder>,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
@@ -413,7 +418,6 @@ impl DeleteTimelineFlow {
                    remote_client,
                    deletion_queue_client,
                },
-                init_order,
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
@@ -446,7 +450,8 @@ impl DeleteTimelineFlow {
        timeline_id: TimelineId,
    ) -> anyhow::Result<()> {
        let r =
-            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
+            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_shard_id, timeline_id)
+                .await;
        info!("Done");
        r
    }
@@ -517,13 +522,13 @@ impl DeleteTimelineFlow {
        tenant: Arc<Tenant>,
        timeline: Arc<Timeline>,
    ) {
-        let tenant_id = timeline.tenant_id;
+        let tenant_shard_id = timeline.tenant_shard_id;
        let timeline_id = timeline.timeline_id;

        task_mgr::spawn(
            task_mgr::BACKGROUND_RUNTIME.handle(),
            TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
+            Some(tenant_shard_id.tenant_id),
            Some(timeline_id),
            "timeline_delete",
            false,
@@ -536,7 +541,7 @@ impl DeleteTimelineFlow {
            }
            .instrument({
                let span =
-                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
+                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(),timeline_id=%timeline_id);
                span.follows_from(Span::current());
                span
            }),
@@ -549,13 +554,14 @@ impl DeleteTimelineFlow {
        tenant: &Tenant,
        timeline: &Timeline,
    ) -> Result<(), DeleteTimelineError> {
-        delete_local_layer_files(conf, tenant.tenant_id, timeline).await?;
+        delete_local_layer_files(conf, tenant.tenant_shard_id, timeline).await?;

        delete_remote_layers_and_index(timeline).await?;

        pausable_failpoint!("in_progress_delete");

-        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?;
+        cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_shard_id, timeline.timeline_id)
+            .await?;

        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -60,9 +60,12 @@ impl Timeline {
        task_mgr::spawn(
            BACKGROUND_RUNTIME.handle(),
            TaskKind::Eviction,
-            Some(self.tenant_id),
+            Some(self.tenant_shard_id.tenant_id),
            Some(self.timeline_id),
-            &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
+            &format!(
+                "layer eviction for {}/{}",
+                self.tenant_shard_id, self.timeline_id
+            ),
            false,
            async move {
                let cancel = task_mgr::shutdown_token();
@@ -77,7 +80,7 @@ impl Timeline {
        );
    }

-    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
+    #[instrument(skip_all, fields(tenant_id = %self.tenant_shard_id.tenant_id, shard_id = %self.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
        use crate::tenant::tasks::random_init_delay;
        {
@@ -296,7 +299,6 @@ impl Timeline {
                    stats.evicted += 1;
                }
                Some(Err(EvictionError::NotFound | EvictionError::Downloaded)) => {
-                    // compaction/gc removed the file while we were waiting on layer_removal_cs
                    stats.not_evictable += 1;
                }
            }
@@ -341,7 +343,7 @@ impl Timeline {
        // Make one of the tenant's timelines draw the short straw and run the calculation.
        // The others wait until the calculation is done so that they take into account the
        // imitated accesses that the winner made.
-        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_id, true) {
+        let tenant = match crate::tenant::mgr::get_tenant(self.tenant_shard_id.tenant_id, true) {
            Ok(t) => t,
            Err(_) => {
                return ControlFlow::Break(());
--- a/pageserver/src/tenant/timeline/init.rs
+++ b/pageserver/src/tenant/timeline/init.rs
@@ -13,6 +13,7 @@ use crate::{
 };
 use anyhow::Context;
 use camino::Utf8Path;
+use pageserver_api::shard::ShardIndex;
 use std::{collections::HashMap, str::FromStr};
 use utils::lsn::Lsn;

@@ -107,6 +108,7 @@ pub(super) fn reconcile(
    index_part: Option<&IndexPart>,
    disk_consistent_lsn: Lsn,
    generation: Generation,
+    shard: ShardIndex,
 ) -> Vec<(LayerFileName, Result<Decision, DismissedLayer>)> {
    use Decision::*;

@@ -118,10 +120,13 @@ pub(super) fn reconcile(
        .map(|(name, file_size)| {
            (
                name,
-                // The generation here will be corrected to match IndexPart in the merge below, unless
+                // The generation and shard here will be corrected to match IndexPart in the merge below, unless
                // it is not in IndexPart, in which case using our current generation makes sense
                // because it will be uploaded in this generation.
-                (Some(LayerFileMetadata::new(file_size, generation)), None),
+                (
+                    Some(LayerFileMetadata::new(file_size, generation, shard)),
+                    None,
+                ),
            )
        })
        .collect::<Collected>();
--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -1,8 +1,9 @@
 use anyhow::{bail, ensure, Context, Result};
+use pageserver_api::shard::TenantShardId;
 use std::{collections::HashMap, sync::Arc};
 use tracing::trace;
 use utils::{
-    id::{TenantId, TimelineId},
+    id::TimelineId,
    lsn::{AtomicLsn, Lsn},
 };

@@ -73,7 +74,7 @@ impl LayerManager {
        last_record_lsn: Lsn,
        conf: &'static PageServerConf,
        timeline_id: TimelineId,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
    ) -> Result<Arc<InMemoryLayer>> {
        ensure!(lsn.is_aligned());

@@ -109,7 +110,8 @@ impl LayerManager {
                lsn
            );

-            let new_layer = InMemoryLayer::create(conf, timeline_id, tenant_id, start_lsn).await?;
+            let new_layer =
+                InMemoryLayer::create(conf, timeline_id, tenant_shard_id, start_lsn).await?;
            let layer = Arc::new(new_layer);

            self.layer_map.open_layer = Some(layer.clone());
@@ -190,7 +192,6 @@ impl LayerManager {
    /// Called when compaction is completed.
    pub(crate) fn finish_compact_l0(
        &mut self,
-        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
        compact_from: &[Layer],
        compact_to: &[ResidentLayer],
        metrics: &TimelineMetrics,
@@ -201,25 +202,16 @@ impl LayerManager {
            metrics.record_new_file_metrics(l.layer_desc().file_size);
        }
        for l in compact_from {
-            Self::delete_historic_layer(layer_removal_cs, l, &mut updates, &mut self.layer_fmgr);
+            Self::delete_historic_layer(l, &mut updates, &mut self.layer_fmgr);
        }
        updates.flush();
    }

-    /// Called when garbage collect the timeline. Returns a guard that will apply the updates to the layer map.
-    pub(crate) fn finish_gc_timeline(
-        &mut self,
-        layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
-        gc_layers: Vec<Layer>,
-    ) {
+    /// Called when garbage collect has selected the layers to be removed.
+    pub(crate) fn finish_gc_timeline(&mut self, gc_layers: &[Layer]) {
        let mut updates = self.layer_map.batch_update();
        for doomed_layer in gc_layers {
-            Self::delete_historic_layer(
-                layer_removal_cs,
-                &doomed_layer,
-                &mut updates,
-                &mut self.layer_fmgr,
-            );
+            Self::delete_historic_layer(doomed_layer, &mut updates, &mut self.layer_fmgr);
        }
        updates.flush()
    }
@@ -238,7 +230,6 @@ impl LayerManager {
    /// Remote storage is not affected by this operation.
    fn delete_historic_layer(
        // we cannot remove layers otherwise, since gc and compaction will race
-        _layer_removal_cs: &Arc<tokio::sync::OwnedMutexGuard<()>>,
        layer: &Layer,
        updates: &mut BatchedUpdates<'_>,
        mapping: &mut LayerFileManager<Layer>,
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -1,11 +1,10 @@
 use anyhow::Context;
-use once_cell::sync::OnceCell;

-use tokio::sync::Semaphore;
+use once_cell::sync::OnceCell;
+use tokio_util::sync::CancellationToken;
 use utils::lsn::Lsn;

-use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
-use std::sync::Arc;
+use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};

 /// Internal structure to hold all data needed for logical size calculation.
 ///
@@ -23,10 +22,17 @@ pub(super) struct LogicalSize {
    ///
    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
    /// the initial size at a different LSN.
-    pub initial_logical_size: OnceCell<u64>,
+    pub initial_logical_size: OnceCell<(
+        u64,
+        crate::metrics::initial_logical_size::FinishedCalculationGuard,
+    )>,

-    /// Semaphore to track ongoing calculation of `initial_logical_size`.
-    pub initial_size_computation: Arc<tokio::sync::Semaphore>,
+    /// Cancellation for the best-effort logical size calculation.
+    ///
+    /// The token is kept in a once-cell so that we can error out if a higher priority
+    /// request comes in *before* we have started the normal logical size calculation.
+    pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
+        OnceCell<CancellationToken>,

    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
    pub initial_part_end: Option<Lsn>,
@@ -52,25 +58,57 @@ pub(super) struct LogicalSize {
    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
    /// to modify this, it will also keep the prometheus metric in sync.
    pub size_added_after_initial: AtomicI64,
+
+    /// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`].
+    pub(super) did_return_approximate_to_walreceiver: AtomicBool,
 }

 /// Normalized current size, that the data in pageserver occupies.
 #[derive(Debug, Clone, Copy)]
-pub(super) enum CurrentLogicalSize {
+pub(crate) enum CurrentLogicalSize {
    /// The size is not yet calculated to the end, this is an intermediate result,
    /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
    /// yet total logical size cannot be below 0.
-    Approximate(u64),
+    Approximate(Approximate),
    // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
    // available for observation without any calculations.
-    Exact(u64),
+    Exact(Exact),
+}
+
+#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+pub(crate) enum Accuracy {
+    Approximate,
+    Exact,
+}
+
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct Approximate(u64);
+#[derive(Debug, Clone, Copy)]
+pub(crate) struct Exact(u64);
+
+impl From<&Approximate> for u64 {
+    fn from(value: &Approximate) -> Self {
+        value.0
+    }
+}
+
+impl From<&Exact> for u64 {
+    fn from(val: &Exact) -> Self {
+        val.0
+    }
 }

 impl CurrentLogicalSize {
-    pub(super) fn size(&self) -> u64 {
-        *match self {
-            Self::Approximate(size) => size,
-            Self::Exact(size) => size,
+    pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
+        match self {
+            Self::Approximate(size) => size.into(),
+            Self::Exact(size) => size.into(),
+        }
+    }
+    pub(crate) fn accuracy(&self) -> Accuracy {
+        match self {
+            Self::Approximate(_) => Accuracy::Approximate,
+            Self::Exact(_) => Accuracy::Exact,
        }
    }
 }
@@ -78,36 +116,42 @@ impl CurrentLogicalSize {
 impl LogicalSize {
    pub(super) fn empty_initial() -> Self {
        Self {
-            initial_logical_size: OnceCell::with_value(0),
-            //  initial_logical_size already computed, so, don't admit any calculations
-            initial_size_computation: Arc::new(Semaphore::new(0)),
+            initial_logical_size: OnceCell::with_value((0, {
+                crate::metrics::initial_logical_size::START_CALCULATION
+                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
+                    .calculation_result_saved()
+            })),
+            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
            initial_part_end: None,
            size_added_after_initial: AtomicI64::new(0),
+            did_return_approximate_to_walreceiver: AtomicBool::new(false),
        }
    }

    pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
        Self {
            initial_logical_size: OnceCell::new(),
-            initial_size_computation: Arc::new(Semaphore::new(1)),
+            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
            initial_part_end: Some(compute_to),
            size_added_after_initial: AtomicI64::new(0),
+            did_return_approximate_to_walreceiver: AtomicBool::new(false),
        }
    }

-    pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
+    pub(super) fn current_size(&self) -> CurrentLogicalSize {
        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
        //                  ^^^ keep this type explicit so that the casts in this function break if
        //                  we change the type.
        match self.initial_logical_size.get() {
-            Some(initial_size) => {
-                initial_size.checked_add_signed(size_increment)
+            Some((initial_size, _)) => {
+                CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .map(CurrentLogicalSize::Exact)
+                    .unwrap()))
            }
            None => {
+
                let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
+                CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment))
            }
        }
    }
@@ -121,7 +165,7 @@ impl LogicalSize {
    /// available for re-use. This doesn't contain the incremental part.
    pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
        match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
+            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
            _ => None,
        }
    }
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -43,11 +43,11 @@ impl<'t> UninitializedTimeline<'t> {
    /// The caller is responsible for activating the timeline (function `.activate()`).
    pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
        let timeline_id = self.timeline_id;
-        let tenant_id = self.owning_tenant.tenant_id;
+        let tenant_shard_id = self.owning_tenant.tenant_shard_id;

        if self.raw_timeline.is_none() {
            return Err(anyhow::anyhow!(
-                "No timeline for initialization found for {tenant_id}/{timeline_id}"
+                "No timeline for initialization found for {tenant_shard_id}/{timeline_id}"
            ));
        }

@@ -61,13 +61,13 @@ impl<'t> UninitializedTimeline<'t> {

        anyhow::ensure!(
            new_disk_consistent_lsn.is_valid(),
-            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
+            "new timeline {tenant_shard_id}/{timeline_id} has invalid disk_consistent_lsn"
        );

        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
        match timelines.entry(timeline_id) {
            Entry::Occupied(_) => anyhow::bail!(
-                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
+                "Found freshly initialized timeline {tenant_shard_id}/{timeline_id} in the tenant map"
            ),
            Entry::Vacant(v) => {
                // after taking here should be no fallible operations, because the drop guard will not
@@ -79,7 +79,7 @@ impl<'t> UninitializedTimeline<'t> {
                // this should be an assertion.
                uninit_mark.remove_uninit_mark().with_context(|| {
                    format!(
-                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
+                        "Failed to remove uninit mark file for timeline {tenant_shard_id}/{timeline_id}"
                    )
                })?;
                v.insert(Arc::clone(&new_timeline));
@@ -134,7 +134,7 @@ impl<'t> UninitializedTimeline<'t> {
            .with_context(|| {
                format!(
                    "No raw timeline {}/{} found",
-                    self.owning_tenant.tenant_id, self.timeline_id
+                    self.owning_tenant.tenant_shard_id, self.timeline_id
                )
            })?
            .0)
@@ -144,7 +144,7 @@ impl<'t> UninitializedTimeline<'t> {
 impl Drop for UninitializedTimeline<'_> {
    fn drop(&mut self) {
        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
-            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_id, timeline_id = %self.timeline_id).entered();
+            let _entered = info_span!("drop_uninitialized_timeline", tenant_id = %self.owning_tenant.tenant_shard_id.tenant_id, shard_id = %self.owning_tenant.tenant_shard_id.shard_slug(), timeline_id = %self.timeline_id).entered();
            error!("Timeline got dropped without initializing, cleaning its files");
            cleanup_timeline_directory(uninit_mark);
        }
--- a/pageserver/src/tenant/timeline/walreceiver.rs
+++ b/pageserver/src/tenant/timeline/walreceiver.rs
@@ -71,7 +71,7 @@ impl WalReceiver {
        mut broker_client: BrokerClientChannel,
        ctx: &RequestContext,
    ) -> Self {
-        let tenant_id = timeline.tenant_id;
+        let tenant_id = timeline.tenant_shard_id.tenant_id;
        let timeline_id = timeline.timeline_id;
        let walreceiver_ctx =
            ctx.detached_child(TaskKind::WalReceiverManager, DownloadBehavior::Error);
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -75,7 +75,7 @@ pub(super) async fn connection_manager_loop_step(
    }

    let id = TenantTimelineId {
-        tenant_id: connection_manager_state.timeline.tenant_id,
+        tenant_id: connection_manager_state.timeline.tenant_shard_id.tenant_id,
        timeline_id: connection_manager_state.timeline.timeline_id,
    };

@@ -388,7 +388,7 @@ struct BrokerSkTimeline {
 impl ConnectionManagerState {
    pub(super) fn new(timeline: Arc<Timeline>, conf: WalReceiverConf) -> Self {
        let id = TenantTimelineId {
-            tenant_id: timeline.tenant_id,
+            tenant_id: timeline.tenant_shard_id.tenant_id,
            timeline_id: timeline.timeline_id,
        };
        Self {
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -163,7 +163,7 @@ pub(super) async fn handle_walreceiver_connection(
    task_mgr::spawn(
        WALRECEIVER_RUNTIME.handle(),
        TaskKind::WalReceiverConnectionPoller,
-        Some(timeline.tenant_id),
+        Some(timeline.tenant_shard_id.tenant_id),
        Some(timeline.timeline_id),
        "walreceiver connection",
        false,
@@ -396,11 +396,15 @@ pub(super) async fn handle_walreceiver_connection(

            // Send the replication feedback message.
            // Regular standby_status_update fields are put into this message.
-            let (timeline_logical_size, _) = timeline
-                .get_current_logical_size(&ctx)
-                .context("Status update creation failed to get current logical size")?;
+            let current_timeline_size = timeline
+                .get_current_logical_size(
+                    crate::tenant::timeline::GetLogicalSizePriority::User,
+                    &ctx,
+                )
+                // FIXME: https://github.com/neondatabase/neon/issues/5963
+                .size_dont_care_about_accuracy();
            let status_update = PageserverFeedback {
-                current_timeline_size: timeline_logical_size,
+                current_timeline_size,
                last_received_lsn,
                disk_consistent_lsn,
                remote_consistent_lsn,
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -1,6 +1,5 @@
 use super::storage_layer::LayerFileName;
 use super::storage_layer::ResidentLayer;
-use super::Generation;
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
@@ -15,6 +14,9 @@ use utils::lsn::AtomicLsn;
 use std::sync::atomic::AtomicU32;
 use utils::lsn::Lsn;

+#[cfg(feature = "testing")]
+use utils::generation::Generation;
+
 // clippy warns that Uninitialized is much smaller than Initialized, which wastes
 // memory for Uninitialized variants. Doesn't matter in practice, there are not
 // that many upload queues in a running pageserver, and most of them are initialized
@@ -88,6 +90,14 @@ pub(crate) struct UploadQueueInitialized {
    /// bug causing leaks, then it's better to not leave this enabled for production builds.
    #[cfg(feature = "testing")]
    pub(crate) dangling_files: HashMap<LayerFileName, Generation>,
+
+    /// Set to true when we have inserted the `UploadOp::Shutdown` into the `inprogress_tasks`.
+    pub(crate) shutting_down: bool,
+
+    /// Permitless semaphore on which any number of `RemoteTimelineClient::shutdown` futures can
+    /// wait on until one of them stops the queue. The semaphore is closed when
+    /// `RemoteTimelineClient::launch_queued_tasks` encounters `UploadOp::Shutdown`.
+    pub(crate) shutdown_ready: Arc<tokio::sync::Semaphore>,
 }

 impl UploadQueueInitialized {
@@ -146,6 +156,8 @@ impl UploadQueue {
            queued_operations: VecDeque::new(),
            #[cfg(feature = "testing")]
            dangling_files: HashMap::new(),
+            shutting_down: false,
+            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
        };

        *self = UploadQueue::Initialized(state);
@@ -193,6 +205,8 @@ impl UploadQueue {
            queued_operations: VecDeque::new(),
            #[cfg(feature = "testing")]
            dangling_files: HashMap::new(),
+            shutting_down: false,
+            shutdown_ready: Arc::new(tokio::sync::Semaphore::new(0)),
        };

        *self = UploadQueue::Initialized(state);
@@ -204,7 +218,13 @@ impl UploadQueue {
            UploadQueue::Uninitialized | UploadQueue::Stopped(_) => {
                anyhow::bail!("queue is in state {}", self.as_str())
            }
-            UploadQueue::Initialized(x) => Ok(x),
+            UploadQueue::Initialized(x) => {
+                if !x.shutting_down {
+                    Ok(x)
+                } else {
+                    anyhow::bail!("queue is shutting down")
+                }
+            }
        }
    }

@@ -232,7 +252,7 @@ pub(crate) struct UploadTask {
 /// for timeline deletion, which skips this queue and goes directly to DeletionQueue.
 #[derive(Debug)]
 pub(crate) struct Delete {
-    pub(crate) layers: Vec<(LayerFileName, Generation)>,
+    pub(crate) layers: Vec<(LayerFileName, LayerFileMetadata)>,
 }

 #[derive(Debug)]
@@ -248,6 +268,10 @@ pub(crate) enum UploadOp {

    /// Barrier. When the barrier operation is reached,
    Barrier(tokio::sync::watch::Sender<()>),
+
+    /// Shutdown; upon encountering this operation no new operations will be spawned, otherwise
+    /// this is the same as a Barrier.
+    Shutdown,
 }

 impl std::fmt::Display for UploadOp {
@@ -269,6 +293,7 @@ impl std::fmt::Display for UploadOp {
                write!(f, "Delete({} layers)", delete.layers.len())
            }
            UploadOp::Barrier(_) => write!(f, "Barrier"),
+            UploadOp::Shutdown => write!(f, "Shutdown"),
        }
    }
 }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -98,261 +98,258 @@ impl<'a> WalIngest<'a> {
            self.checkpoint_modified = true;
        }

-        // Heap AM records need some special handling, because they modify VM pages
-        // without registering them with the standard mechanism.
-        if decoded.xl_rmid == pg_constants::RM_HEAP_ID
-            || decoded.xl_rmid == pg_constants::RM_HEAP2_ID
-        {
-            self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
-                .await?;
-        }
-        if decoded.xl_rmid == pg_constants::RM_NEON_ID {
-            self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
-                .await?;
-        }
-        // Handle other special record types
-        if decoded.xl_rmid == pg_constants::RM_SMGR_ID
-            && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                == pg_constants::XLOG_SMGR_CREATE
-        {
-            let create = XlSmgrCreate::decode(&mut buf);
-            self.ingest_xlog_smgr_create(modification, &create, ctx)
-                .await?;
-        } else if decoded.xl_rmid == pg_constants::RM_SMGR_ID
-            && (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                == pg_constants::XLOG_SMGR_TRUNCATE
-        {
-            let truncate = XlSmgrTruncate::decode(&mut buf);
-            self.ingest_xlog_smgr_truncate(modification, &truncate, ctx)
-                .await?;
-        } else if decoded.xl_rmid == pg_constants::RM_DBASE_ID {
-            debug!(
-                "handle RM_DBASE_ID for Postgres version {:?}",
-                self.timeline.pg_version
-            );
-            if self.timeline.pg_version == 14 {
-                if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                    == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE
-                {
-                    let createdb = XlCreateDatabase::decode(&mut buf);
-                    debug!("XLOG_DBASE_CREATE v14");
+        match decoded.xl_rmid {
+            pg_constants::RM_HEAP_ID | pg_constants::RM_HEAP2_ID => {
+                // Heap AM records need some special handling, because they modify VM pages
+                // without registering them with the standard mechanism.
+                self.ingest_heapam_record(&mut buf, modification, decoded, ctx)
+                    .await?;
+            }
+            pg_constants::RM_NEON_ID => {
+                self.ingest_neonrmgr_record(&mut buf, modification, decoded, ctx)
+                    .await?;
+            }
+            // Handle other special record types
+            pg_constants::RM_SMGR_ID => {
+                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;

-                    self.ingest_xlog_dbase_create(modification, &createdb, ctx)
+                if info == pg_constants::XLOG_SMGR_CREATE {
+                    let create = XlSmgrCreate::decode(&mut buf);
+                    self.ingest_xlog_smgr_create(modification, &create, ctx)
                        .await?;
-                } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                    == postgres_ffi::v14::bindings::XLOG_DBASE_DROP
-                {
-                    let dropdb = XlDropDatabase::decode(&mut buf);
-                    for tablespace_id in dropdb.tablespace_ids {
-                        trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                        modification
-                            .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
-                            .await?;
-                    }
-                }
-            } else if self.timeline.pg_version == 15 {
-                if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                    == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG
-                {
-                    debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
-                } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                    == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY
-                {
-                    // The XLOG record was renamed between v14 and v15,
-                    // but the record format is the same.
-                    // So we can reuse XlCreateDatabase here.
-                    debug!("XLOG_DBASE_CREATE_FILE_COPY");
-                    let createdb = XlCreateDatabase::decode(&mut buf);
-                    self.ingest_xlog_dbase_create(modification, &createdb, ctx)
+                } else if info == pg_constants::XLOG_SMGR_TRUNCATE {
+                    let truncate = XlSmgrTruncate::decode(&mut buf);
+                    self.ingest_xlog_smgr_truncate(modification, &truncate, ctx)
                        .await?;
-                } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                    == postgres_ffi::v15::bindings::XLOG_DBASE_DROP
-                {
-                    let dropdb = XlDropDatabase::decode(&mut buf);
-                    for tablespace_id in dropdb.tablespace_ids {
-                        trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                        modification
-                            .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
-                            .await?;
-                    }
-                }
-            } else if self.timeline.pg_version == 16 {
-                if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                    == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG
-                {
-                    debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
-                } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                    == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY
-                {
-                    // The XLOG record was renamed between v14 and v15,
-                    // but the record format is the same.
-                    // So we can reuse XlCreateDatabase here.
-                    debug!("XLOG_DBASE_CREATE_FILE_COPY");
-                    let createdb = XlCreateDatabase::decode(&mut buf);
-                    self.ingest_xlog_dbase_create(modification, &createdb, ctx)
-                        .await?;
-                } else if (decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK)
-                    == postgres_ffi::v16::bindings::XLOG_DBASE_DROP
-                {
-                    let dropdb = XlDropDatabase::decode(&mut buf);
-                    for tablespace_id in dropdb.tablespace_ids {
-                        trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
-                        modification
-                            .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
-                            .await?;
-                    }
                }
            }
-        } else if decoded.xl_rmid == pg_constants::RM_TBLSPC_ID {
-            trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
-        } else if decoded.xl_rmid == pg_constants::RM_CLOG_ID {
-            let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
-            if info == pg_constants::CLOG_ZEROPAGE {
-                let pageno = buf.get_u32_le();
-                let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                self.put_slru_page_image(
-                    modification,
-                    SlruKind::Clog,
-                    segno,
-                    rpageno,
-                    ZERO_PAGE.clone(),
-                    ctx,
-                )
-                .await?;
-            } else {
-                assert!(info == pg_constants::CLOG_TRUNCATE);
-                let xlrec = XlClogTruncate::decode(&mut buf);
-                self.ingest_clog_truncate_record(modification, &xlrec, ctx)
-                    .await?;
-            }
-        } else if decoded.xl_rmid == pg_constants::RM_XACT_ID {
-            let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
-            if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
-                let parsed_xact =
-                    XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
-                self.ingest_xact_record(
-                    modification,
-                    &parsed_xact,
-                    info == pg_constants::XLOG_XACT_COMMIT,
-                    ctx,
-                )
-                .await?;
-            } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
-                || info == pg_constants::XLOG_XACT_ABORT_PREPARED
-            {
-                let parsed_xact =
-                    XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
-                self.ingest_xact_record(
-                    modification,
-                    &parsed_xact,
-                    info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
-                    ctx,
-                )
-                .await?;
-                // Remove twophase file. see RemoveTwoPhaseFile() in postgres code
-                trace!(
-                    "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
-                    decoded.xl_xid,
-                    parsed_xact.xid,
-                    lsn,
-                );
-                modification
-                    .drop_twophase_file(parsed_xact.xid, ctx)
-                    .await?;
-            } else if info == pg_constants::XLOG_XACT_PREPARE {
-                modification
-                    .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
-                    .await?;
-            }
-        } else if decoded.xl_rmid == pg_constants::RM_MULTIXACT_ID {
-            let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+            pg_constants::RM_DBASE_ID => {
+                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+                debug!(%info, pg_version=%self.timeline.pg_version, "handle RM_DBASE_ID");

-            if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
-                let pageno = buf.get_u32_le();
-                let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                self.put_slru_page_image(
-                    modification,
-                    SlruKind::MultiXactOffsets,
-                    segno,
-                    rpageno,
-                    ZERO_PAGE.clone(),
-                    ctx,
-                )
-                .await?;
-            } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
-                let pageno = buf.get_u32_le();
-                let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
-                let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
-                self.put_slru_page_image(
-                    modification,
-                    SlruKind::MultiXactMembers,
-                    segno,
-                    rpageno,
-                    ZERO_PAGE.clone(),
-                    ctx,
-                )
-                .await?;
-            } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
-                let xlrec = XlMultiXactCreate::decode(&mut buf);
-                self.ingest_multixact_create_record(modification, &xlrec)?;
-            } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
-                let xlrec = XlMultiXactTruncate::decode(&mut buf);
-                self.ingest_multixact_truncate_record(modification, &xlrec, ctx)
+                if self.timeline.pg_version == 14 {
+                    if info == postgres_ffi::v14::bindings::XLOG_DBASE_CREATE {
+                        let createdb = XlCreateDatabase::decode(&mut buf);
+                        debug!("XLOG_DBASE_CREATE v14");
+
+                        self.ingest_xlog_dbase_create(modification, &createdb, ctx)
+                            .await?;
+                    } else if info == postgres_ffi::v14::bindings::XLOG_DBASE_DROP {
+                        let dropdb = XlDropDatabase::decode(&mut buf);
+                        for tablespace_id in dropdb.tablespace_ids {
+                            trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
+                            modification
+                                .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
+                                .await?;
+                        }
+                    }
+                } else if self.timeline.pg_version == 15 {
+                    if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+                    } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                        // The XLOG record was renamed between v14 and v15,
+                        // but the record format is the same.
+                        // So we can reuse XlCreateDatabase here.
+                        debug!("XLOG_DBASE_CREATE_FILE_COPY");
+                        let createdb = XlCreateDatabase::decode(&mut buf);
+                        self.ingest_xlog_dbase_create(modification, &createdb, ctx)
+                            .await?;
+                    } else if info == postgres_ffi::v15::bindings::XLOG_DBASE_DROP {
+                        let dropdb = XlDropDatabase::decode(&mut buf);
+                        for tablespace_id in dropdb.tablespace_ids {
+                            trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
+                            modification
+                                .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
+                                .await?;
+                        }
+                    }
+                } else if self.timeline.pg_version == 16 {
+                    if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_WAL_LOG {
+                        debug!("XLOG_DBASE_CREATE_WAL_LOG: noop");
+                    } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_CREATE_FILE_COPY {
+                        // The XLOG record was renamed between v14 and v15,
+                        // but the record format is the same.
+                        // So we can reuse XlCreateDatabase here.
+                        debug!("XLOG_DBASE_CREATE_FILE_COPY");
+                        let createdb = XlCreateDatabase::decode(&mut buf);
+                        self.ingest_xlog_dbase_create(modification, &createdb, ctx)
+                            .await?;
+                    } else if info == postgres_ffi::v16::bindings::XLOG_DBASE_DROP {
+                        let dropdb = XlDropDatabase::decode(&mut buf);
+                        for tablespace_id in dropdb.tablespace_ids {
+                            trace!("Drop db {}, {}", tablespace_id, dropdb.db_id);
+                            modification
+                                .drop_dbdir(tablespace_id, dropdb.db_id, ctx)
+                                .await?;
+                        }
+                    }
+                }
+            }
+            pg_constants::RM_TBLSPC_ID => {
+                trace!("XLOG_TBLSPC_CREATE/DROP is not handled yet");
+            }
+            pg_constants::RM_CLOG_ID => {
+                let info = decoded.xl_info & !pg_constants::XLR_INFO_MASK;
+
+                if info == pg_constants::CLOG_ZEROPAGE {
+                    let pageno = buf.get_u32_le();
+                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    self.put_slru_page_image(
+                        modification,
+                        SlruKind::Clog,
+                        segno,
+                        rpageno,
+                        ZERO_PAGE.clone(),
+                        ctx,
+                    )
                    .await?;
-            }
-        } else if decoded.xl_rmid == pg_constants::RM_RELMAP_ID {
-            let xlrec = XlRelmapUpdate::decode(&mut buf);
-            self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
-                .await?;
-        } else if decoded.xl_rmid == pg_constants::RM_XLOG_ID {
-            let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-            if info == pg_constants::XLOG_NEXTOID {
-                let next_oid = buf.get_u32_le();
-                if self.checkpoint.nextOid != next_oid {
-                    self.checkpoint.nextOid = next_oid;
-                    self.checkpoint_modified = true;
+                } else {
+                    assert!(info == pg_constants::CLOG_TRUNCATE);
+                    let xlrec = XlClogTruncate::decode(&mut buf);
+                    self.ingest_clog_truncate_record(modification, &xlrec, ctx)
+                        .await?;
                }
-            } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
-                || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
-            {
-                let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
-                buf.copy_to_slice(&mut checkpoint_bytes);
-                let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
-                trace!(
-                    "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
-                    xlog_checkpoint.oldestXid,
-                    self.checkpoint.oldestXid
-                );
-                if (self
-                    .checkpoint
-                    .oldestXid
-                    .wrapping_sub(xlog_checkpoint.oldestXid) as i32)
-                    < 0
+            }
+            pg_constants::RM_XACT_ID => {
+                let info = decoded.xl_info & pg_constants::XLOG_XACT_OPMASK;
+
+                if info == pg_constants::XLOG_XACT_COMMIT || info == pg_constants::XLOG_XACT_ABORT {
+                    let parsed_xact =
+                        XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
+                    self.ingest_xact_record(
+                        modification,
+                        &parsed_xact,
+                        info == pg_constants::XLOG_XACT_COMMIT,
+                        ctx,
+                    )
+                    .await?;
+                } else if info == pg_constants::XLOG_XACT_COMMIT_PREPARED
+                    || info == pg_constants::XLOG_XACT_ABORT_PREPARED
                {
-                    self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
-                    self.checkpoint_modified = true;
-                }
-            }
-        } else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
-            let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
-            if info == pg_constants::XLOG_LOGICAL_MESSAGE {
-                let xlrec = XlLogicalMessage::decode(&mut buf);
-                let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
-                let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
-                if prefix == "neon-test" {
-                    // This is a convenient way to make the WAL ingestion pause at
-                    // particular point in the WAL. For more fine-grained control,
-                    // we could peek into the message and only pause if it contains
-                    // a particular string, for example, but this is enough for now.
-                    crate::failpoint_support::sleep_millis_async!(
-                        "wal-ingest-logical-message-sleep"
+                    let parsed_xact =
+                        XlXactParsedRecord::decode(&mut buf, decoded.xl_xid, decoded.xl_info);
+                    self.ingest_xact_record(
+                        modification,
+                        &parsed_xact,
+                        info == pg_constants::XLOG_XACT_COMMIT_PREPARED,
+                        ctx,
+                    )
+                    .await?;
+                    // Remove twophase file. see RemoveTwoPhaseFile() in postgres code
+                    trace!(
+                        "Drop twophaseFile for xid {} parsed_xact.xid {} here at {}",
+                        decoded.xl_xid,
+                        parsed_xact.xid,
+                        lsn,
                    );
-                } else if let Some(path) = prefix.strip_prefix("neon-file:") {
-                    modification.put_file(path, message, ctx).await?;
+                    modification
+                        .drop_twophase_file(parsed_xact.xid, ctx)
+                        .await?;
+                } else if info == pg_constants::XLOG_XACT_PREPARE {
+                    modification
+                        .put_twophase_file(decoded.xl_xid, Bytes::copy_from_slice(&buf[..]), ctx)
+                        .await?;
                }
            }
+            pg_constants::RM_MULTIXACT_ID => {
+                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+
+                if info == pg_constants::XLOG_MULTIXACT_ZERO_OFF_PAGE {
+                    let pageno = buf.get_u32_le();
+                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    self.put_slru_page_image(
+                        modification,
+                        SlruKind::MultiXactOffsets,
+                        segno,
+                        rpageno,
+                        ZERO_PAGE.clone(),
+                        ctx,
+                    )
+                    .await?;
+                } else if info == pg_constants::XLOG_MULTIXACT_ZERO_MEM_PAGE {
+                    let pageno = buf.get_u32_le();
+                    let segno = pageno / pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    let rpageno = pageno % pg_constants::SLRU_PAGES_PER_SEGMENT;
+                    self.put_slru_page_image(
+                        modification,
+                        SlruKind::MultiXactMembers,
+                        segno,
+                        rpageno,
+                        ZERO_PAGE.clone(),
+                        ctx,
+                    )
+                    .await?;
+                } else if info == pg_constants::XLOG_MULTIXACT_CREATE_ID {
+                    let xlrec = XlMultiXactCreate::decode(&mut buf);
+                    self.ingest_multixact_create_record(modification, &xlrec)?;
+                } else if info == pg_constants::XLOG_MULTIXACT_TRUNCATE_ID {
+                    let xlrec = XlMultiXactTruncate::decode(&mut buf);
+                    self.ingest_multixact_truncate_record(modification, &xlrec, ctx)
+                        .await?;
+                }
+            }
+            pg_constants::RM_RELMAP_ID => {
+                let xlrec = XlRelmapUpdate::decode(&mut buf);
+                self.ingest_relmap_page(modification, &xlrec, decoded, ctx)
+                    .await?;
+            }
+            pg_constants::RM_XLOG_ID => {
+                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+
+                if info == pg_constants::XLOG_NEXTOID {
+                    let next_oid = buf.get_u32_le();
+                    if self.checkpoint.nextOid != next_oid {
+                        self.checkpoint.nextOid = next_oid;
+                        self.checkpoint_modified = true;
+                    }
+                } else if info == pg_constants::XLOG_CHECKPOINT_ONLINE
+                    || info == pg_constants::XLOG_CHECKPOINT_SHUTDOWN
+                {
+                    let mut checkpoint_bytes = [0u8; SIZEOF_CHECKPOINT];
+                    buf.copy_to_slice(&mut checkpoint_bytes);
+                    let xlog_checkpoint = CheckPoint::decode(&checkpoint_bytes)?;
+                    trace!(
+                        "xlog_checkpoint.oldestXid={}, checkpoint.oldestXid={}",
+                        xlog_checkpoint.oldestXid,
+                        self.checkpoint.oldestXid
+                    );
+                    if (self
+                        .checkpoint
+                        .oldestXid
+                        .wrapping_sub(xlog_checkpoint.oldestXid) as i32)
+                        < 0
+                    {
+                        self.checkpoint.oldestXid = xlog_checkpoint.oldestXid;
+                        self.checkpoint_modified = true;
+                    }
+                }
+            }
+            pg_constants::RM_LOGICALMSG_ID => {
+                let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+
+                if info == pg_constants::XLOG_LOGICAL_MESSAGE {
+                    let xlrec = XlLogicalMessage::decode(&mut buf);
+                    let prefix = std::str::from_utf8(&buf[0..xlrec.prefix_size - 1])?;
+                    let message = &buf[xlrec.prefix_size..xlrec.prefix_size + xlrec.message_size];
+                    if prefix == "neon-test" {
+                        // This is a convenient way to make the WAL ingestion pause at
+                        // particular point in the WAL. For more fine-grained control,
+                        // we could peek into the message and only pause if it contains
+                        // a particular string, for example, but this is enough for now.
+                        crate::failpoint_support::sleep_millis_async!(
+                            "wal-ingest-logical-message-sleep"
+                        );
+                    } else if let Some(path) = prefix.strip_prefix("neon-file:") {
+                        modification.put_file(path, message, ctx).await?;
+                    }
+                }
+            }
+            _x => {
+                // TODO: should probably log & fail here instead of blindly
+                // doing something without understanding the protocol
+            }
        }

        // Iterate through all the blocks that the record modifies, and
@@ -1440,7 +1437,16 @@ impl<'a> WalIngest<'a> {
        // record.
        // TODO: would be nice if to be more explicit about it
        let last_lsn = modification.lsn;
-        let old_nblocks = if !self
+
+        // Get current size and put rel creation if rel doesn't exist
+        //
+        // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
+        //       check the cache too. This is because eagerly checking the cache results in
+        //       less work overall and 10% better performance. It's more work on cache miss
+        //       but cache miss is rare.
+        let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
+            nblocks
+        } else if !self
            .timeline
            .get_rel_exists(rel, last_lsn, true, ctx)
            .await?
@@ -2079,4 +2085,88 @@ mod tests {

        Ok(())
    }
+
+    /// Replay a wal segment file taken directly from safekeepers.
+    ///
+    /// This test is useful for benchmarking since it allows us to profile only
+    /// the walingest code in a single-threaded executor, and iterate more quickly
+    /// without waiting for unrelated steps.
+    #[tokio::test]
+    async fn test_ingest_real_wal() {
+        use crate::tenant::harness::*;
+        use postgres_ffi::waldecoder::WalStreamDecoder;
+        use postgres_ffi::WAL_SEGMENT_SIZE;
+
+        // Define test data path and constants.
+        //
+        // Steps to reconstruct the data, if needed:
+        // 1. Run the pgbench python test
+        // 2. Take the first wal segment file from safekeeper
+        // 3. Compress it using `zstd --long input_file`
+        // 4. Copy initdb.tar.zst from local_fs_remote_storage
+        // 5. Grep sk logs for "restart decoder" to get startpoint
+        // 6. Run just the decoder from this test to get the endpoint.
+        //    It's the last LSN the decoder will output.
+        let pg_version = 15; // The test data was generated by pg15
+        let path = "test_data/sk_wal_segment_from_pgbench";
+        let wal_segment_path = format!("{path}/000000010000000000000001.zst");
+        let startpoint = Lsn::from_hex("14AEC08").unwrap();
+        let endpoint = Lsn::from_hex("1FFFF98").unwrap();
+
+        // Bootstrap a real timeline. We can't use create_test_timeline because
+        // it doesn't create a real checkpoint, and Walingest::new tries to parse
+        // the garbage data.
+        //
+        // TODO use the initdb.tar.zst file stored with the test data to avoid
+        //      problems with inconsistent initdb results after pg minor version bumps.
+        let (tenant, ctx) = TenantHarness::create("test_ingest_real_wal")
+            .unwrap()
+            .load()
+            .await;
+        let tline = tenant
+            .bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
+            .await
+            .unwrap();
+
+        // We fully read and decompress this into memory before decoding
+        // to get a more accurate perf profile of the decoder.
+        let bytes = {
+            use async_compression::tokio::bufread::ZstdDecoder;
+            let file = tokio::fs::File::open(wal_segment_path).await.unwrap();
+            let reader = tokio::io::BufReader::new(file);
+            let decoder = ZstdDecoder::new(reader);
+            let mut reader = tokio::io::BufReader::new(decoder);
+            let mut buffer = Vec::new();
+            tokio::io::copy_buf(&mut reader, &mut buffer).await.unwrap();
+            buffer
+        };
+
+        // TODO start a profiler too
+        let started_at = std::time::Instant::now();
+
+        // Initialize walingest
+        let xlogoff: usize = startpoint.segment_offset(WAL_SEGMENT_SIZE);
+        let mut decoder = WalStreamDecoder::new(startpoint, pg_version);
+        let mut walingest = WalIngest::new(tline.as_ref(), startpoint, &ctx)
+            .await
+            .unwrap();
+        let mut modification = tline.begin_modification(endpoint);
+        let mut decoded = DecodedWALRecord::default();
+        println!("decoding {} bytes", bytes.len() - xlogoff);
+
+        // Decode and ingest wal. We process the wal in chunks because
+        // that's what happens when we get bytes from safekeepers.
+        for chunk in bytes[xlogoff..].chunks(50) {
+            decoder.feed_bytes(chunk);
+            while let Some((lsn, recdata)) = decoder.poll_decode().unwrap() {
+                walingest
+                    .ingest_record(recdata, lsn, &mut modification, &mut decoded, &ctx)
+                    .await
+                    .unwrap();
+            }
+        }
+
+        let duration = started_at.elapsed();
+        println!("done in {:?}", duration);
+    }
 }
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -34,17 +34,20 @@ use std::process::{Child, ChildStdin, ChildStdout, Command};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
-use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

 #[cfg(feature = "testing")]
 use std::sync::atomic::{AtomicUsize, Ordering};

+#[cfg(feature = "testing")]
+use pageserver_api::shard::TenantShardId;
+
 use crate::config::PageServerConf;
 use crate::metrics::{
    WalRedoKillCause, WAL_REDO_BYTES_HISTOGRAM, WAL_REDO_PROCESS_COUNTERS,
-    WAL_REDO_RECORDS_HISTOGRAM, WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
+    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM, WAL_REDO_RECORDS_HISTOGRAM,
+    WAL_REDO_RECORD_COUNTER, WAL_REDO_TIME,
 };
 use crate::pgdatadir_mapping::{key_to_rel_block, key_to_slru_block};
 use crate::repository::Key;
@@ -120,7 +123,9 @@ impl PostgresRedoManager {
    /// The WAL redo is handled by a separate thread, so this just sends a request
    /// to the thread and waits for response.
    ///
-    /// CANCEL SAFETY: NOT CANCEL SAFE.
+    /// # Cancel-Safety
+    ///
+    /// This method is cancellation-safe.
    pub async fn request_redo(
        &self,
        key: Key,
@@ -153,7 +158,6 @@ impl PostgresRedoManager {
                        self.conf.wal_redo_timeout,
                        pg_version,
                    )
-                    .await
                };
                img = Some(result?);

@@ -174,7 +178,6 @@ impl PostgresRedoManager {
                self.conf.wal_redo_timeout,
                pg_version,
            )
-            .await
        }
    }
 }
@@ -212,7 +215,7 @@ impl PostgresRedoManager {
    /// Process one request for WAL redo using wal-redo postgres
    ///
    #[allow(clippy::too_many_arguments)]
-    async fn apply_batch_postgres(
+    fn apply_batch_postgres(
        &self,
        key: Key,
        lsn: Lsn,
@@ -238,10 +241,13 @@ impl PostgresRedoManager {
                        let mut proc_guard = self.redo_process.write().unwrap();
                        match &*proc_guard {
                            None => {
+                                let timer =
+                                    WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM.start_timer();
                                let proc = Arc::new(
                                    WalRedoProcess::launch(self.conf, self.tenant_id, pg_version)
                                        .context("launch walredo process")?,
                                );
+                                timer.observe_duration();
                                *proc_guard = Some(Arc::clone(&proc));
                                proc
                            }
@@ -325,12 +331,7 @@ impl PostgresRedoManager {
                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
                // This probably needs revisiting at some later point.
-                let mut wait_done = proc.stderr_logger_task_done.clone();
                drop(proc);
-                wait_done
-                    .wait_for(|v| *v)
-                    .await
-                    .expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
            } else if n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
            }
@@ -642,8 +643,6 @@ struct WalRedoProcess {
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
    stdin: Mutex<ProcessInput>,
-    stderr_logger_cancel: CancellationToken,
-    stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
    /// Counter to separate same sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
@@ -692,6 +691,8 @@ impl WalRedoProcess {
        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();
+        let stderr = tokio::process::ChildStderr::from_std(stderr)
+            .context("convert to tokio::ChildStderr")?;
        macro_rules! set_nonblock_or_log_err {
            ($file:ident) => {{
                let res = set_nonblock($file.as_raw_fd());
@@ -703,69 +704,45 @@ impl WalRedoProcess {
        }
        set_nonblock_or_log_err!(stdin)?;
        set_nonblock_or_log_err!(stdout)?;
-        set_nonblock_or_log_err!(stderr)?;
-
-        let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;

        // all fallible operations post-spawn are complete, so get rid of the guard
        let child = scopeguard::ScopeGuard::into_inner(child);

-        let stderr_logger_cancel = CancellationToken::new();
-        let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
-            tokio::sync::watch::channel(false);
-        tokio::spawn({
-            let stderr_logger_cancel = stderr_logger_cancel.clone();
+        tokio::spawn(
            async move {
                scopeguard::defer! {
                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    let _ = stderr_logger_task_done_tx.send(true);
+                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
                }
                debug!("wal-redo-postgres stderr_logger_task started");
-                loop {
-                    // NB: we purposefully don't do a select! for the cancellation here.
-                    // The cancellation would likely cause us to miss stderr messages.
-                    // We can rely on this to return from .await because when we SIGKILL
-                    // the child, the writing end of the stderr pipe gets closed.
-                    match stderr.readable_mut().await {
-                        Ok(mut guard) => {
-                            let mut errbuf = [0; 16384];
-                            let res = guard.try_io(|fd| {
-                                use std::io::Read;
-                                fd.get_mut().read(&mut errbuf)
-                            });
-                            match res {
-                                Ok(Ok(0)) => {
-                                    // it closed the stderr pipe
-                                    break;
-                                }
-                                Ok(Ok(n)) => {
-                                    // The message might not be split correctly into lines here. But this is
-                                    // good enough, the important thing is to get the message to the log.
-                                    let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
-                                    error!(output, "received output");
-                                },
-                                Ok(Err(e)) => {
-                                    error!(error = ?e, "read() error, waiting for cancellation");
-                                    stderr_logger_cancel.cancelled().await;
-                                    error!(error = ?e, "read() error, cancellation complete");
-                                    break;
-                                }
-                                Err(e) => {
-                                    let _e: tokio::io::unix::TryIoError = e;
-                                    // the read() returned WouldBlock, that's expected
-                                }
-                            }
+                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
+
+                use tokio::io::AsyncBufReadExt;
+                let mut stderr_lines = tokio::io::BufReader::new(stderr);
+                let mut buf = Vec::new();
+                let res = loop {
+                    buf.clear();
+                    // TODO we don't trust the process to cap its stderr length.
+                    // Currently it can do unbounded Vec allocation.
+                    match stderr_lines.read_until(b'\n', &mut buf).await {
+                        Ok(0) => break Ok(()), // eof
+                        Ok(num_bytes) => {
+                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
+                            error!(%output, "received output");
                        }
                        Err(e) => {
-                            error!(error = ?e, "read() error, waiting for cancellation");
-                            stderr_logger_cancel.cancelled().await;
-                            error!(error = ?e, "read() error, cancellation complete");
-                            break;
+                            break Err(e);
                        }
                    }
+                };
+                match res {
+                    Ok(()) => (),
+                    Err(e) => {
+                        error!(error=?e, "failed to read from walredo stderr");
+                    }
                }
            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
-        });
+        );

        Ok(Self {
            conf,
@@ -780,8 +757,6 @@ impl WalRedoProcess {
                pending_responses: VecDeque::new(),
                n_processed_responses: 0,
            }),
-            stderr_logger_cancel,
-            stderr_logger_task_done: stderr_logger_task_done_rx,
            #[cfg(feature = "testing")]
            dump_sequence: AtomicUsize::default(),
        })
@@ -991,7 +966,11 @@ impl WalRedoProcess {
        // these files will be collected to an allure report
        let filename = format!("walredo-{millis}-{}-{seq}.walredo", writebuf.len());

-        let path = self.conf.tenant_path(&self.tenant_id).join(&filename);
+        // TODO(sharding): update this call when WalRedoProcess gets a TenantShardId.
+        let path = self
+            .conf
+            .tenant_path(&TenantShardId::unsharded(self.tenant_id))
+            .join(&filename);

        let res = std::fs::OpenOptions::new()
            .write(true)
@@ -1018,7 +997,6 @@ impl Drop for WalRedoProcess {
            .take()
            .expect("we only do this once")
            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
-        self.stderr_logger_cancel.cancel();
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
 }
@@ -1182,7 +1160,7 @@ mod tests {

    #[tokio::test]
    async fn short_v14_redo() {
-        let expected = std::fs::read("fixtures/short_v14_redo.page").unwrap();
+        let expected = std::fs::read("test_data/short_v14_redo.page").unwrap();

        let h = RedoHarness::new().unwrap();

--- a/pageserver/test_data/short_v14_redo.page
+++ b/pageserver/test_data/short_v14_redo.page
--- a/pageserver/test_data/sk_wal_segment_from_pgbench/000000010000000000000001.zst
+++ b/pageserver/test_data/sk_wal_segment_from_pgbench/000000010000000000000001.zst
--- a/pageserver/test_data/sk_wal_segment_from_pgbench/initdb.tar.zst
+++ b/pageserver/test_data/sk_wal_segment_from_pgbench/initdb.tar.zst
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -19,8 +19,8 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "storage/buf_internals.h"
-#include "storage/lwlock.h"
 #include "storage/ipc.h"
+#include "storage/pg_shmem.h"
 #include "c.h"
 #include "postmaster/interrupt.h"

@@ -36,6 +36,8 @@
 #include "walproposer.h"
 #include "neon_utils.h"

+#include <pthread.h>
+
 #define PageStoreTrace DEBUG5

 #define RECONNECT_INTERVAL_USEC 1000000
@@ -68,7 +70,7 @@ int			max_reconnect_attempts = 60;

 typedef struct
 {
-    LWLockId lock;
+    pthread_rwlock_t lock;
    pg_atomic_uint64 update_counter;
    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 } PagestoreShmemState;
@@ -87,6 +89,12 @@ bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) =
 static bool pageserver_flush(void);
 static void pageserver_disconnect(void);

+static bool
+PagestoreShmemIsValid()
+{
+    return pagestore_shared && UsedShmemSegAddr;
+}
+
 static bool
 CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 {
@@ -96,31 +104,40 @@ CheckPageserverConnstring(char **newval, void **extra, GucSource source)
 static void
 AssignPageserverConnstring(const char *newval, void *extra)
 {
-    if(!pagestore_shared)
+    if(!PagestoreShmemIsValid())
        return;
-    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
+    pthread_rwlock_wrlock(&pagestore_shared->lock);
    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-    LWLockRelease(pagestore_shared->lock);
+    pthread_rwlock_unlock(&pagestore_shared->lock);
 }

 static bool
 CheckConnstringUpdated()
 {
-    if(!pagestore_shared)
+    if(!PagestoreShmemIsValid())
        return false;
    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
 }

-static void
+/* Returns true if the connstring has changed and false if not */
+static bool
 ReloadConnstring()
 {
-    if(!pagestore_shared)
-        return;
-    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
+    if(!PagestoreShmemIsValid())
+        return false;
+    pthread_rwlock_rdlock(&pagestore_shared->lock);
+
+    if(strcmp(local_pageserver_connstring, pagestore_shared->pageserver_connstring) == 0)
+    {
+        pthread_rwlock_unlock(&pagestore_shared->lock);
+        return false;
+    }
+
    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-    LWLockRelease(pagestore_shared->lock);
+    pthread_rwlock_unlock(&pagestore_shared->lock);
+    return true;
 }

 static bool
@@ -283,7 +300,6 @@ pageserver_disconnect(void)
 	 */
 	if (connected)
 	{
-		neon_log(LOG, "dropping connection to page server due to error");
 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;
 		connected = false;
@@ -304,8 +320,12 @@ pageserver_send(NeonRequest * request)

        if(CheckConnstringUpdated())
        {
-            pageserver_disconnect();
-            ReloadConnstring();
+            bool should_disconnect = ReloadConnstring();
+            if(should_disconnect)
+            {
+                neon_log(LOG, "pageserver_send disconnect because connstring changed");
+                pageserver_disconnect();
+            }
        }

 	/* If the connection was lost for some reason, reconnect */
@@ -477,7 +497,12 @@ PagestoreShmemInit(void)
                                       &found);
    if(!found)
    {
-        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
+        pthread_rwlockattr_t attr;
+        pthread_rwlockattr_init(&attr);
+        pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); 
+        pthread_rwlock_init(&pagestore_shared->lock, &attr);
+        pthread_rwlockattr_destroy(&attr);
+
        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
        AssignPageserverConnstring(page_server_connstring, NULL);
    }
--- a/pgxn/neon/neon.control
+++ b/pgxn/neon/neon.control
@@ -2,3 +2,4 @@
 comment = 'cloud storage for PostgreSQL'
 default_version = '1.1'
 module_pathname = '$libdir/neon'
+relocatable = true
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -59,6 +59,7 @@
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
+#include "storage/fsm_internals.h"
 #include "storage/smgr.h"
 #include "storage/md.h"
 #include "pgstat.h"
@@ -2722,6 +2723,86 @@ smgr_init_neon(void)
 }


+static void
+neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
+{
+	BlockNumber relsize;
+	/* Extend the relation if we know its size */
+	if (get_cached_relsize(rinfo, forknum, &relsize))
+	{
+		if (relsize < blkno + 1)
+		{
+			update_cached_relsize(rinfo, forknum, blkno + 1);
+			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+		}
+	}
+	else
+	{
+		/*
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
+		 *
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
+		 */
+
+		NeonResponse *response;
+		NeonNblocksResponse *nbresponse;
+		NeonNblocksRequest request = {
+			.req = (NeonRequest) {
+				.lsn = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
+			},
+			.rinfo = rinfo,
+			.forknum = forknum,
+		};
+
+		response = page_server_request(&request);
+
+		Assert(response->tag == T_NeonNblocksResponse);
+		nbresponse = (NeonNblocksResponse *) response;
+
+		relsize = Max(nbresponse->n_blocks, blkno+1);
+
+		set_cached_relsize(rinfo, forknum, relsize);
+		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+
+		elog(SmgrTrace, "Set length to %d", relsize);
+	}
+}
+
+#define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)
+
+/*
+ * TODO: May be it is better to make correspondent fgunctio from freespace.c public?
+ */
+static BlockNumber
+get_fsm_physical_block(BlockNumber heapblk)
+{
+	BlockNumber pages;
+	int			leafno;
+	int			l;
+
+	/*
+	 * Calculate the logical page number of the first leaf page below the
+	 * given page.
+	 */
+	leafno = heapblk / SlotsPerFSMPage;
+
+	/* Count upper level nodes required to address the leaf page */
+	pages = 0;
+	for (l = 0; l < FSM_TREE_DEPTH; l++)
+	{
+		pages += leafno + 1;
+		leafno /= SlotsPerFSMPage;
+	}
+
+	/* Turn the page count into 0-based block number */
+	return pages - 1;
+}
+
+
 /*
 * Return whether we can skip the redo for this block.
 * 
@@ -2769,7 +2850,6 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	LWLock	   *partitionLock;
 	Buffer		buffer;
 	bool		no_redo_needed;
-	BlockNumber relsize;

 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
 		return true;
@@ -2819,49 +2899,10 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 	LWLockRelease(partitionLock);

-	/* Extend the relation if we know its size */
-	if (get_cached_relsize(rinfo, forknum, &relsize))
+	neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
+	if (forknum == MAIN_FORKNUM)
 	{
-		if (relsize < blkno + 1)
-		{
-			update_cached_relsize(rinfo, forknum, blkno + 1);
-			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-		}
+		neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
 	}
-	else
-	{
-		/*
-		 * Size was not cached. We populate the cache now, with the size of the
-		 * relation measured after this WAL record is applied.
-		 *
-		 * This length is later reused when we open the smgr to read the block,
-		 * which is fine and expected.
-		 */
-
-		NeonResponse *response;
-		NeonNblocksResponse *nbresponse;
-		NeonNblocksRequest request = {
-			.req = (NeonRequest) {
-				.lsn = end_recptr,
-				.latest = false,
-				.tag = T_NeonNblocksRequest,
-			},
-			.rinfo = rinfo,
-			.forknum = forknum,
-		};
-
-		response = page_server_request(&request);
-
-		Assert(response->tag == T_NeonNblocksResponse);
-		nbresponse = (NeonNblocksResponse *) response;
-
-		Assert(nbresponse->n_blocks > blkno);
-
-		set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
-		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-
-		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
-	}
-
 	return no_redo_needed;
 }
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,112 +1,100 @@
-# This file is automatically @generated by Poetry 1.6.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.7.1 and should not be changed by hand.

 [[package]]
 name = "aiohttp"
-version = "3.8.6"
+version = "3.9.0"
 description = "Async http client/server framework (asyncio)"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.8"
 files = [
-    {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:41d55fc043954cddbbd82503d9cc3f4814a40bcef30b3569bc7b5e34130718c1"},
-    {file = "aiohttp-3.8.6-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1d84166673694841d8953f0a8d0c90e1087739d24632fe86b1a08819168b4566"},
-    {file = "aiohttp-3.8.6-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:253bf92b744b3170eb4c4ca2fa58f9c4b87aeb1df42f71d4e78815e6e8b73c9e"},
-    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3fd194939b1f764d6bb05490987bfe104287bbf51b8d862261ccf66f48fb4096"},
-    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:6c5f938d199a6fdbdc10bbb9447496561c3a9a565b43be564648d81e1102ac22"},
-    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:2817b2f66ca82ee699acd90e05c95e79bbf1dc986abb62b61ec8aaf851e81c93"},
-    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0fa375b3d34e71ccccf172cab401cd94a72de7a8cc01847a7b3386204093bb47"},
-    {file = "aiohttp-3.8.6-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:9de50a199b7710fa2904be5a4a9b51af587ab24c8e540a7243ab737b45844543"},
-    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:e1d8cb0b56b3587c5c01de3bf2f600f186da7e7b5f7353d1bf26a8ddca57f965"},
-    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:8e31e9db1bee8b4f407b77fd2507337a0a80665ad7b6c749d08df595d88f1cf5"},
-    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:7bc88fc494b1f0311d67f29fee6fd636606f4697e8cc793a2d912ac5b19aa38d"},
-    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:ec00c3305788e04bf6d29d42e504560e159ccaf0be30c09203b468a6c1ccd3b2"},
-    {file = "aiohttp-3.8.6-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:ad1407db8f2f49329729564f71685557157bfa42b48f4b93e53721a16eb813ed"},
-    {file = "aiohttp-3.8.6-cp310-cp310-win32.whl", hash = "sha256:ccc360e87341ad47c777f5723f68adbb52b37ab450c8bc3ca9ca1f3e849e5fe2"},
-    {file = "aiohttp-3.8.6-cp310-cp310-win_amd64.whl", hash = "sha256:93c15c8e48e5e7b89d5cb4613479d144fda8344e2d886cf694fd36db4cc86865"},
-    {file = "aiohttp-3.8.6-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:6e2f9cc8e5328f829f6e1fb74a0a3a939b14e67e80832975e01929e320386b34"},
-    {file = "aiohttp-3.8.6-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:e6a00ffcc173e765e200ceefb06399ba09c06db97f401f920513a10c803604ca"},
-    {file = "aiohttp-3.8.6-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:41bdc2ba359032e36c0e9de5a3bd00d6fb7ea558a6ce6b70acedf0da86458321"},
-    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:14cd52ccf40006c7a6cd34a0f8663734e5363fd981807173faf3a017e202fec9"},
-    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2d5b785c792802e7b275c420d84f3397668e9d49ab1cb52bd916b3b3ffcf09ad"},
-    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1bed815f3dc3d915c5c1e556c397c8667826fbc1b935d95b0ad680787896a358"},
-    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:96603a562b546632441926cd1293cfcb5b69f0b4159e6077f7c7dbdfb686af4d"},
-    {file = "aiohttp-3.8.6-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d76e8b13161a202d14c9584590c4df4d068c9567c99506497bdd67eaedf36403"},
-    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:e3f1e3f1a1751bb62b4a1b7f4e435afcdade6c17a4fd9b9d43607cebd242924a"},
-    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:76b36b3124f0223903609944a3c8bf28a599b2cc0ce0be60b45211c8e9be97f8"},
-    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:a2ece4af1f3c967a4390c284797ab595a9f1bc1130ef8b01828915a05a6ae684"},
-    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:16d330b3b9db87c3883e565340d292638a878236418b23cc8b9b11a054aaa887"},
-    {file = "aiohttp-3.8.6-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:42c89579f82e49db436b69c938ab3e1559e5a4409eb8639eb4143989bc390f2f"},
-    {file = "aiohttp-3.8.6-cp311-cp311-win32.whl", hash = "sha256:efd2fcf7e7b9d7ab16e6b7d54205beded0a9c8566cb30f09c1abe42b4e22bdcb"},
-    {file = "aiohttp-3.8.6-cp311-cp311-win_amd64.whl", hash = "sha256:3b2ab182fc28e7a81f6c70bfbd829045d9480063f5ab06f6e601a3eddbbd49a0"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:fdee8405931b0615220e5ddf8cd7edd8592c606a8e4ca2a00704883c396e4479"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d25036d161c4fe2225d1abff2bd52c34ed0b1099f02c208cd34d8c05729882f0"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5d791245a894be071d5ab04bbb4850534261a7d4fd363b094a7b9963e8cdbd31"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:0cccd1de239afa866e4ce5c789b3032442f19c261c7d8a01183fd956b1935349"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1f13f60d78224f0dace220d8ab4ef1dbc37115eeeab8c06804fec11bec2bbd07"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:8a9b5a0606faca4f6cc0d338359d6fa137104c337f489cd135bb7fbdbccb1e39"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_aarch64.whl", hash = "sha256:13da35c9ceb847732bf5c6c5781dcf4780e14392e5d3b3c689f6d22f8e15ae31"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_i686.whl", hash = "sha256:4d4cbe4ffa9d05f46a28252efc5941e0462792930caa370a6efaf491f412bc66"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_ppc64le.whl", hash = "sha256:229852e147f44da0241954fc6cb910ba074e597f06789c867cb7fb0621e0ba7a"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_s390x.whl", hash = "sha256:713103a8bdde61d13490adf47171a1039fd880113981e55401a0f7b42c37d071"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-musllinux_1_1_x86_64.whl", hash = "sha256:45ad816b2c8e3b60b510f30dbd37fe74fd4a772248a52bb021f6fd65dff809b6"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-win32.whl", hash = "sha256:2b8d4e166e600dcfbff51919c7a3789ff6ca8b3ecce16e1d9c96d95dd569eb4c"},
-    {file = "aiohttp-3.8.6-cp36-cp36m-win_amd64.whl", hash = "sha256:0912ed87fee967940aacc5306d3aa8ba3a459fcd12add0b407081fbefc931e53"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:e2a988a0c673c2e12084f5e6ba3392d76c75ddb8ebc6c7e9ead68248101cd446"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ebf3fd9f141700b510d4b190094db0ce37ac6361a6806c153c161dc6c041ccda"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3161ce82ab85acd267c8f4b14aa226047a6bee1e4e6adb74b798bd42c6ae1f80"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d95fc1bf33a9a81469aa760617b5971331cdd74370d1214f0b3109272c0e1e3c"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c43ecfef7deaf0617cee936836518e7424ee12cb709883f2c9a1adda63cc460"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:ca80e1b90a05a4f476547f904992ae81eda5c2c85c66ee4195bb8f9c5fb47f28"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_aarch64.whl", hash = "sha256:90c72ebb7cb3a08a7f40061079817133f502a160561d0675b0a6adf231382c92"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_i686.whl", hash = "sha256:bb54c54510e47a8c7c8e63454a6acc817519337b2b78606c4e840871a3e15349"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_ppc64le.whl", hash = "sha256:de6a1c9f6803b90e20869e6b99c2c18cef5cc691363954c93cb9adeb26d9f3ae"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_s390x.whl", hash = "sha256:a3628b6c7b880b181a3ae0a0683698513874df63783fd89de99b7b7539e3e8a8"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-musllinux_1_1_x86_64.whl", hash = "sha256:fc37e9aef10a696a5a4474802930079ccfc14d9f9c10b4662169671ff034b7df"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-win32.whl", hash = "sha256:f8ef51e459eb2ad8e7a66c1d6440c808485840ad55ecc3cafefadea47d1b1ba2"},
-    {file = "aiohttp-3.8.6-cp37-cp37m-win_amd64.whl", hash = "sha256:b2fe42e523be344124c6c8ef32a011444e869dc5f883c591ed87f84339de5976"},
-    {file = "aiohttp-3.8.6-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:9e2ee0ac5a1f5c7dd3197de309adfb99ac4617ff02b0603fd1e65b07dc772e4b"},
-    {file = "aiohttp-3.8.6-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:01770d8c04bd8db568abb636c1fdd4f7140b284b8b3e0b4584f070180c1e5c62"},
-    {file = "aiohttp-3.8.6-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:3c68330a59506254b556b99a91857428cab98b2f84061260a67865f7f52899f5"},
-    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:89341b2c19fb5eac30c341133ae2cc3544d40d9b1892749cdd25892bbc6ac951"},
-    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:71783b0b6455ac8f34b5ec99d83e686892c50498d5d00b8e56d47f41b38fbe04"},
-    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:f628dbf3c91e12f4d6c8b3f092069567d8eb17814aebba3d7d60c149391aee3a"},
-    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:b04691bc6601ef47c88f0255043df6f570ada1a9ebef99c34bd0b72866c217ae"},
-    {file = "aiohttp-3.8.6-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:7ee912f7e78287516df155f69da575a0ba33b02dd7c1d6614dbc9463f43066e3"},
-    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:9c19b26acdd08dd239e0d3669a3dddafd600902e37881f13fbd8a53943079dbc"},
-    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:99c5ac4ad492b4a19fc132306cd57075c28446ec2ed970973bbf036bcda1bcc6"},
-    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:f0f03211fd14a6a0aed2997d4b1c013d49fb7b50eeb9ffdf5e51f23cfe2c77fa"},
-    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:8d399dade330c53b4106160f75f55407e9ae7505263ea86f2ccca6bfcbdb4921"},
-    {file = "aiohttp-3.8.6-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:ec4fd86658c6a8964d75426517dc01cbf840bbf32d055ce64a9e63a40fd7b771"},
-    {file = "aiohttp-3.8.6-cp38-cp38-win32.whl", hash = "sha256:33164093be11fcef3ce2571a0dccd9041c9a93fa3bde86569d7b03120d276c6f"},
-    {file = "aiohttp-3.8.6-cp38-cp38-win_amd64.whl", hash = "sha256:bdf70bfe5a1414ba9afb9d49f0c912dc524cf60141102f3a11143ba3d291870f"},
-    {file = "aiohttp-3.8.6-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:d52d5dc7c6682b720280f9d9db41d36ebe4791622c842e258c9206232251ab2b"},
-    {file = "aiohttp-3.8.6-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:4ac39027011414dbd3d87f7edb31680e1f430834c8cef029f11c66dad0670aa5"},
-    {file = "aiohttp-3.8.6-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3f5c7ce535a1d2429a634310e308fb7d718905487257060e5d4598e29dc17f0b"},
-    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b30e963f9e0d52c28f284d554a9469af073030030cef8693106d918b2ca92f54"},
-    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:918810ef188f84152af6b938254911055a72e0f935b5fbc4c1a4ed0b0584aed1"},
-    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:002f23e6ea8d3dd8d149e569fd580c999232b5fbc601c48d55398fbc2e582e8c"},
-    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4fcf3eabd3fd1a5e6092d1242295fa37d0354b2eb2077e6eb670accad78e40e1"},
-    {file = "aiohttp-3.8.6-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:255ba9d6d5ff1a382bb9a578cd563605aa69bec845680e21c44afc2670607a95"},
-    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:d67f8baed00870aa390ea2590798766256f31dc5ed3ecc737debb6e97e2ede78"},
-    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:86f20cee0f0a317c76573b627b954c412ea766d6ada1a9fcf1b805763ae7feeb"},
-    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:39a312d0e991690ccc1a61f1e9e42daa519dcc34ad03eb6f826d94c1190190dd"},
-    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:e827d48cf802de06d9c935088c2924e3c7e7533377d66b6f31ed175c1620e05e"},
-    {file = "aiohttp-3.8.6-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:bd111d7fc5591ddf377a408ed9067045259ff2770f37e2d94e6478d0f3fc0c17"},
-    {file = "aiohttp-3.8.6-cp39-cp39-win32.whl", hash = "sha256:caf486ac1e689dda3502567eb89ffe02876546599bbf915ec94b1fa424eeffd4"},
-    {file = "aiohttp-3.8.6-cp39-cp39-win_amd64.whl", hash = "sha256:3f0e27e5b733803333bb2371249f41cf42bae8884863e8e8965ec69bebe53132"},
-    {file = "aiohttp-3.8.6.tar.gz", hash = "sha256:b0cf2a4501bff9330a8a5248b4ce951851e415bdcce9dc158e76cfd55e15085c"},
+    {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:6896b8416be9ada4d22cd359d7cb98955576ce863eadad5596b7cdfbf3e17c6c"},
+    {file = "aiohttp-3.9.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:1736d87dad8ef46a8ec9cddd349fa9f7bd3a064c47dd6469c0d6763d3d49a4fc"},
+    {file = "aiohttp-3.9.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:8c9e5f4d7208cda1a2bb600e29069eecf857e6980d0ccc922ccf9d1372c16f4b"},
+    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8488519aa05e636c5997719fe543c8daf19f538f4fa044f3ce94bee608817cff"},
+    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:5ab16c254e2312efeb799bc3c06897f65a133b38b69682bf75d1f1ee1a9c43a9"},
+    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:7a94bde005a8f926d0fa38b88092a03dea4b4875a61fbcd9ac6f4351df1b57cd"},
+    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4b777c9286b6c6a94f50ddb3a6e730deec327e9e2256cb08b5530db0f7d40fd8"},
+    {file = "aiohttp-3.9.0-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:571760ad7736b34d05597a1fd38cbc7d47f7b65deb722cb8e86fd827404d1f6b"},
+    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_aarch64.whl", hash = "sha256:deac0a32aec29608eb25d730f4bc5a261a65b6c48ded1ed861d2a1852577c932"},
+    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_i686.whl", hash = "sha256:4ee1b4152bc3190cc40ddd6a14715e3004944263ea208229ab4c297712aa3075"},
+    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_ppc64le.whl", hash = "sha256:3607375053df58ed6f23903aa10cf3112b1240e8c799d243bbad0f7be0666986"},
+    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_s390x.whl", hash = "sha256:65b0a70a25456d329a5e1426702dde67be0fb7a4ead718005ba2ca582d023a94"},
+    {file = "aiohttp-3.9.0-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:5a2eb5311a37fe105aa35f62f75a078537e1a9e4e1d78c86ec9893a3c97d7a30"},
+    {file = "aiohttp-3.9.0-cp310-cp310-win32.whl", hash = "sha256:2cbc14a13fb6b42d344e4f27746a4b03a2cb0c1c3c5b932b0d6ad8881aa390e3"},
+    {file = "aiohttp-3.9.0-cp310-cp310-win_amd64.whl", hash = "sha256:ac9669990e2016d644ba8ae4758688534aabde8dbbc81f9af129c3f5f01ca9cd"},
+    {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:f8e05f5163528962ce1d1806fce763ab893b1c5b7ace0a3538cd81a90622f844"},
+    {file = "aiohttp-3.9.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:4afa8f71dba3a5a2e1e1282a51cba7341ae76585345c43d8f0e624882b622218"},
+    {file = "aiohttp-3.9.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:f929f4c9b9a00f3e6cc0587abb95ab9c05681f8b14e0fe1daecfa83ea90f8318"},
+    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:28185e36a78d247c55e9fbea2332d16aefa14c5276a582ce7a896231c6b1c208"},
+    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:a486ddf57ab98b6d19ad36458b9f09e6022de0381674fe00228ca7b741aacb2f"},
+    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:70e851f596c00f40a2f00a46126c95c2e04e146015af05a9da3e4867cfc55911"},
+    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c5b7bf8fe4d39886adc34311a233a2e01bc10eb4e842220235ed1de57541a896"},
+    {file = "aiohttp-3.9.0-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c67a51ea415192c2e53e4e048c78bab82d21955b4281d297f517707dc836bf3d"},
+    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:694df243f394629bcae2d8ed94c589a181e8ba8604159e6e45e7b22e58291113"},
+    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:3dd8119752dd30dd7bca7d4bc2a92a59be6a003e4e5c2cf7e248b89751b8f4b7"},
+    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_ppc64le.whl", hash = "sha256:eb6dfd52063186ac97b4caa25764cdbcdb4b10d97f5c5f66b0fa95052e744eb7"},
+    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_s390x.whl", hash = "sha256:d97c3e286d0ac9af6223bc132dc4bad6540b37c8d6c0a15fe1e70fb34f9ec411"},
+    {file = "aiohttp-3.9.0-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:816f4db40555026e4cdda604a1088577c1fb957d02f3f1292e0221353403f192"},
+    {file = "aiohttp-3.9.0-cp311-cp311-win32.whl", hash = "sha256:3abf0551874fecf95f93b58f25ef4fc9a250669a2257753f38f8f592db85ddea"},
+    {file = "aiohttp-3.9.0-cp311-cp311-win_amd64.whl", hash = "sha256:e18d92c3e9e22553a73e33784fcb0ed484c9874e9a3e96c16a8d6a1e74a0217b"},
+    {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_universal2.whl", hash = "sha256:99ae01fb13a618b9942376df77a1f50c20a281390dad3c56a6ec2942e266220d"},
+    {file = "aiohttp-3.9.0-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:05857848da443c8c12110d99285d499b4e84d59918a21132e45c3f0804876994"},
+    {file = "aiohttp-3.9.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:317719d7f824eba55857fe0729363af58e27c066c731bc62cd97bc9c3d9c7ea4"},
+    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1e3b3c107ccb0e537f309f719994a55621acd2c8fdf6d5ce5152aed788fb940"},
+    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:45820ddbb276113ead8d4907a7802adb77548087ff5465d5c554f9aa3928ae7d"},
+    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:05a183f1978802588711aed0dea31e697d760ce9055292db9dc1604daa9a8ded"},
+    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:51a4cd44788ea0b5e6bb8fa704597af3a30be75503a7ed1098bc5b8ffdf6c982"},
+    {file = "aiohttp-3.9.0-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:673343fbc0c1ac44d0d2640addc56e97a052504beacd7ade0dc5e76d3a4c16e8"},
+    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_aarch64.whl", hash = "sha256:7e8a3b79b6d186a9c99761fd4a5e8dd575a48d96021f220ac5b5fa856e5dd029"},
+    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:6777a390e41e78e7c45dab43a4a0196c55c3b8c30eebe017b152939372a83253"},
+    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:7ae5f99a32c53731c93ac3075abd3e1e5cfbe72fc3eaac4c27c9dd64ba3b19fe"},
+    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_s390x.whl", hash = "sha256:f1e4f254e9c35d8965d377e065c4a8a55d396fe87c8e7e8429bcfdeeb229bfb3"},
+    {file = "aiohttp-3.9.0-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:11ca808f9a6b63485059f5f6e164ef7ec826483c1212a44f268b3653c91237d8"},
+    {file = "aiohttp-3.9.0-cp312-cp312-win32.whl", hash = "sha256:de3cc86f4ea8b4c34a6e43a7306c40c1275e52bfa9748d869c6b7d54aa6dad80"},
+    {file = "aiohttp-3.9.0-cp312-cp312-win_amd64.whl", hash = "sha256:ca4fddf84ac7d8a7d0866664936f93318ff01ee33e32381a115b19fb5a4d1202"},
+    {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:f09960b5bb1017d16c0f9e9f7fc42160a5a49fa1e87a175fd4a2b1a1833ea0af"},
+    {file = "aiohttp-3.9.0-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:8303531e2c17b1a494ffaeba48f2da655fe932c4e9a2626c8718403c83e5dd2b"},
+    {file = "aiohttp-3.9.0-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:4790e44f46a4aa07b64504089def5744d3b6780468c4ec3a1a36eb7f2cae9814"},
+    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:a1d7edf74a36de0e5ca50787e83a77cf352f5504eb0ffa3f07000a911ba353fb"},
+    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:94697c7293199c2a2551e3e3e18438b4cba293e79c6bc2319f5fd652fccb7456"},
+    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:a1b66dbb8a7d5f50e9e2ea3804b01e766308331d0cac76eb30c563ac89c95985"},
+    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:9623cfd9e85b76b83ef88519d98326d4731f8d71869867e47a0b979ffec61c73"},
+    {file = "aiohttp-3.9.0-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:f32c86dc967ab8c719fd229ce71917caad13cc1e8356ee997bf02c5b368799bf"},
+    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_aarch64.whl", hash = "sha256:f50b4663c3e0262c3a361faf440761fbef60ccdde5fe8545689a4b3a3c149fb4"},
+    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_i686.whl", hash = "sha256:dcf71c55ec853826cd70eadb2b6ac62ec577416442ca1e0a97ad875a1b3a0305"},
+    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_ppc64le.whl", hash = "sha256:42fe4fd9f0dfcc7be4248c162d8056f1d51a04c60e53366b0098d1267c4c9da8"},
+    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_s390x.whl", hash = "sha256:76a86a9989ebf82ee61e06e2bab408aec4ea367dc6da35145c3352b60a112d11"},
+    {file = "aiohttp-3.9.0-cp38-cp38-musllinux_1_1_x86_64.whl", hash = "sha256:f9e09a1c83521d770d170b3801eea19b89f41ccaa61d53026ed111cb6f088887"},
+    {file = "aiohttp-3.9.0-cp38-cp38-win32.whl", hash = "sha256:a00ce44c21612d185c5275c5cba4bab8d7c1590f248638b667ed8a782fa8cd6f"},
+    {file = "aiohttp-3.9.0-cp38-cp38-win_amd64.whl", hash = "sha256:d5b9345ab92ebe6003ae11d8092ce822a0242146e6fa270889b9ba965457ca40"},
+    {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:98d21092bf2637c5fa724a428a69e8f5955f2182bff61f8036827cf6ce1157bf"},
+    {file = "aiohttp-3.9.0-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:35a68cd63ca6aaef5707888f17a70c36efe62b099a4e853d33dc2e9872125be8"},
+    {file = "aiohttp-3.9.0-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:3d7f6235c7475658acfc1769d968e07ab585c79f6ca438ddfecaa9a08006aee2"},
+    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:db04d1de548f7a62d1dd7e7cdf7c22893ee168e22701895067a28a8ed51b3735"},
+    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:536b01513d67d10baf6f71c72decdf492fb7433c5f2f133e9a9087379d4b6f31"},
+    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:87c8b0a6487e8109427ccf638580865b54e2e3db4a6e0e11c02639231b41fc0f"},
+    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:7276fe0017664414fdc3618fca411630405f1aaf0cc3be69def650eb50441787"},
+    {file = "aiohttp-3.9.0-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:23170247ef89ffa842a02bbfdc425028574d9e010611659abeb24d890bc53bb8"},
+    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_aarch64.whl", hash = "sha256:b1a2ea8252cacc7fd51df5a56d7a2bb1986ed39be9397b51a08015727dfb69bd"},
+    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_i686.whl", hash = "sha256:2d71abc15ff7047412ef26bf812dfc8d0d1020d664617f4913df2df469f26b76"},
+    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_ppc64le.whl", hash = "sha256:2d820162c8c2bdbe97d328cd4f417c955ca370027dce593345e437b2e9ffdc4d"},
+    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_s390x.whl", hash = "sha256:2779f5e7c70f7b421915fd47db332c81de365678180a9f3ab404088f87ba5ff9"},
+    {file = "aiohttp-3.9.0-cp39-cp39-musllinux_1_1_x86_64.whl", hash = "sha256:366bc870d7ac61726f32a489fbe3d1d8876e87506870be66b01aeb84389e967e"},
+    {file = "aiohttp-3.9.0-cp39-cp39-win32.whl", hash = "sha256:1df43596b826022b14998f0460926ce261544fedefe0d2f653e1b20f49e96454"},
+    {file = "aiohttp-3.9.0-cp39-cp39-win_amd64.whl", hash = "sha256:9c196b30f1b1aa3363a69dd69079ae9bec96c2965c4707eaa6914ba099fb7d4f"},
+    {file = "aiohttp-3.9.0.tar.gz", hash = "sha256:09f23292d29135025e19e8ff4f0a68df078fe4ee013bca0105b2e803989de92d"},
 ]

 [package.dependencies]
 aiosignal = ">=1.1.2"
-async-timeout = ">=4.0.0a3,<5.0"
+async-timeout = {version = ">=4.0,<5.0", markers = "python_version < \"3.11\""}
 attrs = ">=17.3.0"
-charset-normalizer = ">=2.0,<4.0"
 frozenlist = ">=1.1.1"
 multidict = ">=4.5,<7.0"
 yarl = ">=1.0,<2.0"

 [package.extras]
-speedups = ["Brotli", "aiodns", "cchardet"]
+speedups = ["Brotli", "aiodns", "brotlicffi"]

 [[package]]
 name = "aiopg"
@@ -887,34 +875,34 @@ files = [

 [[package]]
 name = "cryptography"
-version = "41.0.4"
+version = "41.0.6"
 description = "cryptography is a package which provides cryptographic recipes and primitives to Python developers."
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:80907d3faa55dc5434a16579952ac6da800935cd98d14dbd62f6f042c7f5e839"},
-    {file = "cryptography-41.0.4-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:35c00f637cd0b9d5b6c6bd11b6c3359194a8eba9c46d4e875a3660e3b400005f"},
-    {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cecfefa17042941f94ab54f769c8ce0fe14beff2694e9ac684176a2535bf9714"},
-    {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e40211b4923ba5a6dc9769eab704bdb3fbb58d56c5b336d30996c24fcf12aadb"},
-    {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:23a25c09dfd0d9f28da2352503b23e086f8e78096b9fd585d1d14eca01613e13"},
-    {file = "cryptography-41.0.4-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:2ed09183922d66c4ec5fdaa59b4d14e105c084dd0febd27452de8f6f74704143"},
-    {file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5a0f09cefded00e648a127048119f77bc2b2ec61e736660b5789e638f43cc397"},
-    {file = "cryptography-41.0.4-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:9eeb77214afae972a00dee47382d2591abe77bdae166bda672fb1e24702a3860"},
-    {file = "cryptography-41.0.4-cp37-abi3-win32.whl", hash = "sha256:3b224890962a2d7b57cf5eeb16ccaafba6083f7b811829f00476309bce2fe0fd"},
-    {file = "cryptography-41.0.4-cp37-abi3-win_amd64.whl", hash = "sha256:c880eba5175f4307129784eca96f4e70b88e57aa3f680aeba3bab0e980b0f37d"},
-    {file = "cryptography-41.0.4-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:004b6ccc95943f6a9ad3142cfabcc769d7ee38a3f60fb0dddbfb431f818c3a67"},
-    {file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:86defa8d248c3fa029da68ce61fe735432b047e32179883bdb1e79ed9bb8195e"},
-    {file = "cryptography-41.0.4-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:37480760ae08065437e6573d14be973112c9e6dcaf5f11d00147ee74f37a3829"},
-    {file = "cryptography-41.0.4-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:b5f4dfe950ff0479f1f00eda09c18798d4f49b98f4e2006d644b3301682ebdca"},
-    {file = "cryptography-41.0.4-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:7e53db173370dea832190870e975a1e09c86a879b613948f09eb49324218c14d"},
-    {file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:5b72205a360f3b6176485a333256b9bcd48700fc755fef51c8e7e67c4b63e3ac"},
-    {file = "cryptography-41.0.4-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:93530900d14c37a46ce3d6c9e6fd35dbe5f5601bf6b3a5c325c7bffc030344d9"},
-    {file = "cryptography-41.0.4-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:efc8ad4e6fc4f1752ebfb58aefece8b4e3c4cae940b0994d43649bdfce8d0d4f"},
-    {file = "cryptography-41.0.4-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:c3391bd8e6de35f6f1140e50aaeb3e2b3d6a9012536ca23ab0d9c35ec18c8a91"},
-    {file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:0d9409894f495d465fe6fda92cb70e8323e9648af912d5b9141d616df40a87b8"},
-    {file = "cryptography-41.0.4-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:8ac4f9ead4bbd0bc8ab2d318f97d85147167a488be0e08814a37eb2f439d5cf6"},
-    {file = "cryptography-41.0.4-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:047c4603aeb4bbd8db2756e38f5b8bd7e94318c047cfe4efeb5d715e08b49311"},
-    {file = "cryptography-41.0.4.tar.gz", hash = "sha256:7febc3094125fc126a7f6fb1f420d0da639f3f32cb15c8ff0dc3997c4549f51a"},
+    {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_universal2.whl", hash = "sha256:0f27acb55a4e77b9be8d550d762b0513ef3fc658cd3eb15110ebbcbd626db12c"},
+    {file = "cryptography-41.0.6-cp37-abi3-macosx_10_12_x86_64.whl", hash = "sha256:ae236bb8760c1e55b7a39b6d4d32d2279bc6c7c8500b7d5a13b6fb9fc97be35b"},
+    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afda76d84b053923c27ede5edc1ed7d53e3c9f475ebaf63c68e69f1403c405a8"},
+    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:da46e2b5df770070412c46f87bac0849b8d685c5f2679771de277a422c7d0b86"},
+    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_aarch64.whl", hash = "sha256:ff369dd19e8fe0528b02e8df9f2aeb2479f89b1270d90f96a63500afe9af5cae"},
+    {file = "cryptography-41.0.6-cp37-abi3-manylinux_2_28_x86_64.whl", hash = "sha256:b648fe2a45e426aaee684ddca2632f62ec4613ef362f4d681a9a6283d10e079d"},
+    {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_aarch64.whl", hash = "sha256:5daeb18e7886a358064a68dbcaf441c036cbdb7da52ae744e7b9207b04d3908c"},
+    {file = "cryptography-41.0.6-cp37-abi3-musllinux_1_1_x86_64.whl", hash = "sha256:068bc551698c234742c40049e46840843f3d98ad7ce265fd2bd4ec0d11306596"},
+    {file = "cryptography-41.0.6-cp37-abi3-win32.whl", hash = "sha256:2132d5865eea673fe6712c2ed5fb4fa49dba10768bb4cc798345748380ee3660"},
+    {file = "cryptography-41.0.6-cp37-abi3-win_amd64.whl", hash = "sha256:48783b7e2bef51224020efb61b42704207dde583d7e371ef8fc2a5fb6c0aabc7"},
+    {file = "cryptography-41.0.6-pp310-pypy310_pp73-macosx_10_12_x86_64.whl", hash = "sha256:8efb2af8d4ba9dbc9c9dd8f04d19a7abb5b49eab1f3694e7b5a16a5fc2856f5c"},
+    {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:c5a550dc7a3b50b116323e3d376241829fd326ac47bc195e04eb33a8170902a9"},
+    {file = "cryptography-41.0.6-pp310-pypy310_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:85abd057699b98fce40b41737afb234fef05c67e116f6f3650782c10862c43da"},
+    {file = "cryptography-41.0.6-pp310-pypy310_pp73-win_amd64.whl", hash = "sha256:f39812f70fc5c71a15aa3c97b2bbe213c3f2a460b79bd21c40d033bb34a9bf36"},
+    {file = "cryptography-41.0.6-pp38-pypy38_pp73-macosx_10_12_x86_64.whl", hash = "sha256:742ae5e9a2310e9dade7932f9576606836ed174da3c7d26bc3d3ab4bd49b9f65"},
+    {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:35f3f288e83c3f6f10752467c48919a7a94b7d88cc00b0668372a0d2ad4f8ead"},
+    {file = "cryptography-41.0.6-pp38-pypy38_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:4d03186af98b1c01a4eda396b137f29e4e3fb0173e30f885e27acec8823c1b09"},
+    {file = "cryptography-41.0.6-pp38-pypy38_pp73-win_amd64.whl", hash = "sha256:b27a7fd4229abef715e064269d98a7e2909ebf92eb6912a9603c7e14c181928c"},
+    {file = "cryptography-41.0.6-pp39-pypy39_pp73-macosx_10_12_x86_64.whl", hash = "sha256:398ae1fc711b5eb78e977daa3cbf47cec20f2c08c5da129b7a296055fbb22aed"},
+    {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_aarch64.whl", hash = "sha256:7e00fb556bda398b99b0da289ce7053639d33b572847181d6483ad89835115f6"},
+    {file = "cryptography-41.0.6-pp39-pypy39_pp73-manylinux_2_28_x86_64.whl", hash = "sha256:60e746b11b937911dc70d164060d28d273e31853bb359e2b2033c9e93e6f3c43"},
+    {file = "cryptography-41.0.6-pp39-pypy39_pp73-win_amd64.whl", hash = "sha256:3288acccef021e3c3c10d58933f44e8602cf04dba96d9796d70d537bb2f4bbc4"},
+    {file = "cryptography-41.0.6.tar.gz", hash = "sha256:422e3e31d63743855e43e5a6fcc8b4acab860f560f9321b0ee6269cc7ed70cc3"},
 ]

 [package.dependencies]
@@ -1979,18 +1967,18 @@ pytest = [

 [[package]]
 name = "pytest-rerunfailures"
-version = "11.1.2"
+version = "13.0"
 description = "pytest plugin to re-run tests to eliminate flaky failures"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
-    {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
+    {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"},
+    {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"},
 ]

 [package.dependencies]
 packaging = ">=17.1"
-pytest = ">=5.3"
+pytest = ">=7"

 [[package]]
 name = "pytest-split"
@@ -2488,16 +2476,6 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
-    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
-    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
-    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
-    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
-    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2719,4 +2697,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "0834e5cb69e5457741d4f476c3e49a4dc83598b5730685c8755da651b96ad3ec"
+content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -24,6 +24,7 @@ hostname.workspace = true
 humantime.workspace = true
 hyper-tungstenite.workspace = true
 hyper.workspace = true
+ipnet.workspace = true
 itertools.workspace = true
 md5.workspace = true
 metrics.workspace = true
@@ -68,6 +69,7 @@ webpki-roots.workspace = true
 x509-parser.workspace = true
 native-tls.workspace = true
 postgres-native-tls.workspace = true
+smol_str.workspace = true

 workspace_hack.workspace = true
 tokio-util.workspace = true
@@ -76,3 +78,4 @@ tokio-util.workspace = true
 rcgen.workspace = true
 rstest.workspace = true
 tokio-postgres-rustls.workspace = true
+postgres-protocol.workspace = true
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -4,7 +4,7 @@ pub mod backend;
 pub use backend::BackendType;

 mod credentials;
-pub use credentials::ClientCredentials;
+pub use credentials::{check_peer_addr_is_in_list, ClientCredentials};

 mod password_hack;
 pub use password_hack::parse_endpoint_param;
@@ -56,6 +56,12 @@ pub enum AuthErrorImpl {
    /// Errors produced by e.g. [`crate::stream::PqStream`].
    #[error(transparent)]
    Io(#[from] io::Error),
+
+    #[error(
+        "This IP address is not allowed to connect to this endpoint. \
+        Please add it to the allowed list in the Neon console."
+    )]
+    IpAddressNotAllowed,
 }

 #[derive(Debug, Error)]
@@ -70,6 +76,10 @@ impl AuthError {
    pub fn auth_failed(user: impl Into<Box<str>>) -> Self {
        AuthErrorImpl::AuthFailed(user.into()).into()
    }
+
+    pub fn ip_address_not_allowed() -> Self {
+        AuthErrorImpl::IpAddressNotAllowed.into()
+    }
 }

 impl<E: Into<AuthErrorImpl>> From<E> for AuthError {
@@ -91,6 +101,7 @@ impl UserFacingError for AuthError {
            MalformedPassword(_) => self.to_string(),
            MissingEndpointName => self.to_string(),
            Io(_) => "Internal error".to_string(),
+            IpAddressNotAllowed => self.to_string(),
        }
    }
 }
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -5,7 +5,13 @@ mod link;
 pub use link::LinkAuthError;
 use tokio_postgres::config::AuthKeys;

+use crate::auth::credentials::check_peer_addr_is_in_list;
+use crate::console::errors::GetAuthInfoError;
+use crate::console::provider::AuthInfo;
+use crate::console::AuthSecret;
 use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
+use crate::scram;
+use crate::stream::Stream;
 use crate::{
    auth::{self, ClientCredentials},
    config::AuthenticationConfig,
@@ -19,6 +25,7 @@ use crate::{
 use futures::TryFutureExt;
 use std::borrow::Cow;
 use std::ops::ControlFlow;
+use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{error, info, warn};

@@ -63,6 +70,7 @@ pub enum BackendType<'a, T> {

 pub trait TestBackend: Send + Sync + 'static {
    fn wake_compute(&self) -> Result<CachedNodeInfo, console::errors::WakeComputeError>;
+    fn get_allowed_ips(&self) -> Result<Arc<Vec<String>>, console::errors::GetAuthInfoError>;
 }

 impl std::fmt::Display for BackendType<'_, ()> {
@@ -131,7 +139,7 @@ async fn auth_quirks_creds(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
-    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
@@ -139,14 +147,38 @@ async fn auth_quirks_creds(
    // If there's no project so far, that entails that client doesn't
    // support SNI or other means of passing the endpoint (project) name.
    // We now expect to see a very specific payload in the place of password.
-    if creds.project.is_none() {
+    let maybe_success = if creds.project.is_none() {
        // Password will be checked by the compute node later.
-        return hacks::password_hack(creds, client, latency_timer).await;
-    }
+        Some(hacks::password_hack(creds, client, latency_timer).await?)
+    } else {
+        None
+    };

    // Password hack should set the project name.
    // TODO: make `creds.project` more type-safe.
    assert!(creds.project.is_some());
+    info!("fetching user's authentication info");
+    // TODO(anna): this will slow down both "hacks" below; we probably need a cache.
+    let AuthInfo {
+        secret,
+        allowed_ips,
+    } = api.get_auth_info(extra, creds).await?;
+
+    // check allowed list
+    if !check_peer_addr_is_in_list(&creds.peer_addr.ip(), &allowed_ips) {
+        return Err(auth::AuthError::ip_address_not_allowed());
+    }
+    let secret = secret.unwrap_or_else(|| {
+        // If we don't have an authentication secret, we mock one to
+        // prevent malicious probing (possible due to missing protocol steps).
+        // This mocked secret will never lead to successful authentication.
+        info!("authentication info not found, mocking it");
+        AuthSecret::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
+    });
+
+    if let Some(success) = maybe_success {
+        return Ok(success);
+    }

    // Perform cleartext auth if we're allowed to do that.
    // Currently, we use it for websocket connections (latency).
@@ -156,7 +188,7 @@ async fn auth_quirks_creds(
    }

    // Finally, proceed with the main auth flow (SCRAM-based).
-    classic::authenticate(api, extra, creds, client, config, latency_timer).await
+    classic::authenticate(creds, client, config, latency_timer, secret).await
 }

 /// True to its name, this function encapsulates our current auth trade-offs.
@@ -165,7 +197,7 @@ async fn auth_quirks(
    api: &impl console::Api,
    extra: &ConsoleReqExtra<'_>,
    creds: &mut ClientCredentials<'_>,
-    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    allow_cleartext: bool,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
@@ -241,7 +273,7 @@ impl BackendType<'_, ClientCredentials<'_>> {
    pub async fn authenticate(
        &mut self,
        extra: &ConsoleReqExtra<'_>,
-        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+        client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
        allow_cleartext: bool,
        config: &'static AuthenticationConfig,
        latency_timer: &mut LatencyTimer,
@@ -304,6 +336,19 @@ impl BackendType<'_, ClientCredentials<'_>> {
        Ok(res)
    }

+    pub async fn get_allowed_ips(
+        &self,
+        extra: &ConsoleReqExtra<'_>,
+    ) -> Result<Arc<Vec<String>>, GetAuthInfoError> {
+        use BackendType::*;
+        match self {
+            Console(api, creds) => api.get_allowed_ips(extra, creds).await,
+            Postgres(api, creds) => api.get_allowed_ips(extra, creds).await,
+            Link(_) => Ok(Arc::new(vec![])),
+            Test(x) => x.get_allowed_ips(),
+        }
+    }
+
    /// When applicable, wake the compute node, gaining its connection info in the process.
    /// The link auth flow doesn't support this, so we return [`None`] in that case.
    pub async fn wake_compute(
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -3,38 +3,28 @@ use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    compute,
    config::AuthenticationConfig,
-    console::{self, AuthInfo, ConsoleReqExtra},
+    console::AuthSecret,
    proxy::LatencyTimer,
-    sasl, scram,
-    stream::PqStream,
+    sasl,
+    stream::{PqStream, Stream},
 };
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};

 pub(super) async fn authenticate(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
    creds: &ClientCredentials<'_>,
-    client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    client: &mut PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    config: &'static AuthenticationConfig,
    latency_timer: &mut LatencyTimer,
+    secret: AuthSecret,
 ) -> auth::Result<AuthSuccess<ComputeCredentials>> {
-    info!("fetching user's authentication info");
-    let info = api.get_auth_info(extra, creds).await?.unwrap_or_else(|| {
-        // If we don't have an authentication secret, we mock one to
-        // prevent malicious probing (possible due to missing protocol steps).
-        // This mocked secret will never lead to successful authentication.
-        info!("authentication info not found, mocking it");
-        AuthInfo::Scram(scram::ServerSecret::mock(creds.user, rand::random()))
-    });
-
    let flow = AuthFlow::new(client);
-    let scram_keys = match info {
-        AuthInfo::Md5(_) => {
+    let scram_keys = match secret {
+        AuthSecret::Md5(_) => {
            info!("auth endpoint chooses MD5");
            return Err(auth::AuthError::bad_auth_method("MD5"));
        }
-        AuthInfo::Scram(secret) => {
+        AuthSecret::Scram(secret) => {
            info!("auth endpoint chooses SCRAM");
            let scram = auth::Scram(&secret);

--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -2,7 +2,7 @@ use super::{AuthSuccess, ComputeCredentials};
 use crate::{
    auth::{self, AuthFlow, ClientCredentials},
    proxy::LatencyTimer,
-    stream,
+    stream::{self, Stream},
 };
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};
@@ -12,7 +12,7 @@ use tracing::{info, warn};
 /// These properties are benefical for serverless JS workers, so we
 /// use this mechanism for websocket connections.
 pub async fn cleartext_hack(
-    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    warn!("cleartext auth flow override is enabled, proceeding");
@@ -37,7 +37,7 @@ pub async fn cleartext_hack(
 /// Very similar to [`cleartext_hack`], but there's a specific password format.
 pub async fn password_hack(
    creds: &mut ClientCredentials<'_>,
-    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    client: &mut stream::PqStream<Stream<impl AsyncRead + AsyncWrite + Unpin>>,
    latency_timer: &mut LatencyTimer,
 ) -> auth::Result<AuthSuccess<ComputeCredentials>> {
    warn!("project not specified, resorting to the password hack auth flow");
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -106,7 +106,7 @@ pub(super) async fn authenticate(
        reported_auth_ok: true,
        value: NodeInfo {
            config,
-            aux: db_info.aux.into(),
+            aux: db_info.aux,
            allow_self_signed_compute: false, // caller may override
        },
    })
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -7,9 +7,12 @@ use crate::{
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
-use std::collections::HashSet;
+use std::{
+    collections::HashSet,
+    net::{IpAddr, SocketAddr},
+};
 use thiserror::Error;
-use tracing::info;
+use tracing::{info, warn};

 #[derive(Debug, Error, PartialEq, Eq, Clone)]
 pub enum ClientCredsParseError {
@@ -44,6 +47,7 @@ pub struct ClientCredentials<'a> {
    pub project: Option<String>,

    pub cache_key: String,
+    pub peer_addr: SocketAddr,
 }

 impl ClientCredentials<'_> {
@@ -54,19 +58,11 @@ impl ClientCredentials<'_> {
 }

 impl<'a> ClientCredentials<'a> {
-    #[cfg(test)]
-    pub fn new_noop() -> Self {
-        ClientCredentials {
-            user: "",
-            project: None,
-            cache_key: "".to_string(),
-        }
-    }
-
    pub fn parse(
        params: &'a StartupMessageParams,
        sni: Option<&str>,
        common_names: Option<HashSet<String>>,
+        peer_addr: SocketAddr,
    ) -> Result<Self, ClientCredsParseError> {
        use ClientCredsParseError::*;

@@ -153,10 +149,59 @@ impl<'a> ClientCredentials<'a> {
            user,
            project,
            cache_key,
+            peer_addr,
        })
    }
 }

+pub fn check_peer_addr_is_in_list(peer_addr: &IpAddr, ip_list: &Vec<String>) -> bool {
+    if ip_list.is_empty() {
+        return true;
+    }
+    for ip in ip_list {
+        // We expect that all ip addresses from control plane are correct.
+        // However, if some of them are broken, we still can check the others.
+        match parse_ip_pattern(ip) {
+            Ok(pattern) => {
+                if check_ip(peer_addr, &pattern) {
+                    return true;
+                }
+            }
+            Err(err) => warn!("Cannot parse ip: {}; err: {}", ip, err),
+        }
+    }
+    false
+}
+
+#[derive(Debug, Clone, Eq, PartialEq)]
+enum IpPattern {
+    Subnet(ipnet::IpNet),
+    Range(IpAddr, IpAddr),
+    Single(IpAddr),
+}
+
+fn parse_ip_pattern(pattern: &str) -> anyhow::Result<IpPattern> {
+    if pattern.contains('/') {
+        let subnet: ipnet::IpNet = pattern.parse()?;
+        return Ok(IpPattern::Subnet(subnet));
+    }
+    if let Some((start, end)) = pattern.split_once('-') {
+        let start: IpAddr = start.parse()?;
+        let end: IpAddr = end.parse()?;
+        return Ok(IpPattern::Range(start, end));
+    }
+    let addr: IpAddr = pattern.parse()?;
+    Ok(IpPattern::Single(addr))
+}
+
+fn check_ip(ip: &IpAddr, pattern: &IpPattern) -> bool {
+    match pattern {
+        IpPattern::Subnet(subnet) => subnet.contains(ip),
+        IpPattern::Range(start, end) => start <= ip && ip <= end,
+        IpPattern::Single(addr) => addr == ip,
+    }
+}
+
 fn project_name_valid(name: &str) -> bool {
    name.chars().all(|c| c.is_alphanumeric() || c == '-')
 }
@@ -176,8 +221,8 @@ mod tests {
    fn parse_bare_minimum() -> anyhow::Result<()> {
        // According to postgresql, only `user` should be required.
        let options = StartupMessageParams::new([("user", "john_doe")]);
-
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);

@@ -191,8 +236,8 @@ mod tests {
            ("database", "world"), // should be ignored
            ("foo", "bar"),        // should be ignored
        ]);
-
-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project, None);

@@ -206,7 +251,8 @@ mod tests {
        let sni = Some("foo.localhost");
        let common_names = Some(["localhost".into()].into());

-        let creds = ClientCredentials::parse(&options, sni, common_names)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("foo"));
        assert_eq!(creds.cache_key, "foo");
@@ -221,7 +267,8 @@ mod tests {
            ("options", "-ckey=1 project=bar -c geqo=off"),
        ]);

-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("bar"));

@@ -235,7 +282,8 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
        ]);

-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("bar"));

@@ -252,7 +300,8 @@ mod tests {
            ),
        ]);

-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert!(creds.project.is_none());

@@ -266,7 +315,8 @@ mod tests {
            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
        ]);

-        let creds = ClientCredentials::parse(&options, None, None)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, None, None, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert!(creds.project.is_none());

@@ -280,7 +330,8 @@ mod tests {
        let sni = Some("baz.localhost");
        let common_names = Some(["localhost".into()].into());

-        let creds = ClientCredentials::parse(&options, sni, common_names)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.user, "john_doe");
        assert_eq!(creds.project.as_deref(), Some("baz"));

@@ -293,12 +344,14 @@ mod tests {

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.a.com");
-        let creds = ClientCredentials::parse(&options, sni, common_names)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.project.as_deref(), Some("p1"));

        let common_names = Some(["a.com".into(), "b.com".into()].into());
        let sni = Some("p1.b.com");
-        let creds = ClientCredentials::parse(&options, sni, common_names)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.project.as_deref(), Some("p1"));

        Ok(())
@@ -312,7 +365,9 @@ mod tests {
        let sni = Some("second.localhost");
        let common_names = Some(["localhost".into()].into());

-        let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
+            .expect_err("should fail");
        match err {
            InconsistentProjectNames { domain, option } => {
                assert_eq!(option, "first");
@@ -329,7 +384,9 @@ mod tests {
        let sni = Some("project.localhost");
        let common_names = Some(["example.com".into()].into());

-        let err = ClientCredentials::parse(&options, sni, common_names).expect_err("should fail");
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let err = ClientCredentials::parse(&options, sni, common_names, peer_addr)
+            .expect_err("should fail");
        match err {
            UnknownCommonName { cn } => {
                assert_eq!(cn, "localhost");
@@ -347,7 +404,8 @@ mod tests {

        let sni = Some("project.localhost");
        let common_names = Some(["localhost".into()].into());
-        let creds = ClientCredentials::parse(&options, sni, common_names)?;
+        let peer_addr = SocketAddr::from(([127, 0, 0, 1], 1234));
+        let creds = ClientCredentials::parse(&options, sni, common_names, peer_addr)?;
        assert_eq!(creds.project.as_deref(), Some("project"));
        assert_eq!(
            creds.cache_key,
@@ -356,4 +414,91 @@ mod tests {

        Ok(())
    }
+
+    #[test]
+    fn test_check_peer_addr_is_in_list() {
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        assert!(check_peer_addr_is_in_list(&peer_addr, &vec![]));
+        assert!(check_peer_addr_is_in_list(
+            &peer_addr,
+            &vec!["127.0.0.1".into()]
+        ));
+        assert!(!check_peer_addr_is_in_list(
+            &peer_addr,
+            &vec!["8.8.8.8".into()]
+        ));
+        // If there is an incorrect address, it will be skipped.
+        assert!(check_peer_addr_is_in_list(
+            &peer_addr,
+            &vec!["88.8.8".into(), "127.0.0.1".into()]
+        ));
+    }
+    #[test]
+    fn test_parse_ip_v4() -> anyhow::Result<()> {
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        // Ok
+        assert_eq!(parse_ip_pattern("127.0.0.1")?, IpPattern::Single(peer_addr));
+        assert_eq!(
+            parse_ip_pattern("127.0.0.1/31")?,
+            IpPattern::Subnet(ipnet::IpNet::new(peer_addr, 31)?)
+        );
+        assert_eq!(
+            parse_ip_pattern("0.0.0.0-200.0.1.2")?,
+            IpPattern::Range(IpAddr::from([0, 0, 0, 0]), IpAddr::from([200, 0, 1, 2]))
+        );
+
+        // Error
+        assert!(parse_ip_pattern("300.0.1.2").is_err());
+        assert!(parse_ip_pattern("30.1.2").is_err());
+        assert!(parse_ip_pattern("127.0.0.1/33").is_err());
+        assert!(parse_ip_pattern("127.0.0.1-127.0.3").is_err());
+        assert!(parse_ip_pattern("1234.0.0.1-127.0.3.0").is_err());
+        Ok(())
+    }
+
+    #[test]
+    fn test_check_ipv4() -> anyhow::Result<()> {
+        let peer_addr = IpAddr::from([127, 0, 0, 1]);
+        let peer_addr_next = IpAddr::from([127, 0, 0, 2]);
+        let peer_addr_prev = IpAddr::from([127, 0, 0, 0]);
+        // Success
+        assert!(check_ip(&peer_addr, &IpPattern::Single(peer_addr)));
+        assert!(check_ip(
+            &peer_addr,
+            &IpPattern::Subnet(ipnet::IpNet::new(peer_addr_prev, 31)?)
+        ));
+        assert!(check_ip(
+            &peer_addr,
+            &IpPattern::Subnet(ipnet::IpNet::new(peer_addr_next, 30)?)
+        ));
+        assert!(check_ip(
+            &peer_addr,
+            &IpPattern::Range(IpAddr::from([0, 0, 0, 0]), IpAddr::from([200, 0, 1, 2]))
+        ));
+        assert!(check_ip(
+            &peer_addr,
+            &IpPattern::Range(peer_addr, peer_addr)
+        ));
+
+        // Not success
+        assert!(!check_ip(&peer_addr, &IpPattern::Single(peer_addr_prev)));
+        assert!(!check_ip(
+            &peer_addr,
+            &IpPattern::Subnet(ipnet::IpNet::new(peer_addr_next, 31)?)
+        ));
+        assert!(!check_ip(
+            &peer_addr,
+            &IpPattern::Range(IpAddr::from([0, 0, 0, 0]), peer_addr_prev)
+        ));
+        assert!(!check_ip(
+            &peer_addr,
+            &IpPattern::Range(peer_addr_next, IpAddr::from([128, 0, 0, 0]))
+        ));
+        // There is no check that for range start <= end. But it's fine as long as for all this cases the result is false.
+        assert!(!check_ip(
+            &peer_addr,
+            &IpPattern::Range(peer_addr, peer_addr_prev)
+        ));
+        Ok(())
+    }
 }
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -1,16 +1,21 @@
 //! Main authentication flow.

 use super::{AuthErrorImpl, PasswordHackPayload};
-use crate::{sasl, scram, stream::PqStream};
+use crate::{
+    config::TlsServerEndPoint,
+    sasl, scram,
+    stream::{PqStream, Stream},
+};
 use pq_proto::{BeAuthenticationSaslMessage, BeMessage, BeMessage as Be};
 use std::io;
 use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::info;

 /// Every authentication selector is supposed to implement this trait.
 pub trait AuthMethod {
    /// Any authentication selector should provide initial backend message
    /// containing auth method name and parameters, e.g. md5 salt.
-    fn first_message(&self) -> BeMessage<'_>;
+    fn first_message(&self, channel_binding: bool) -> BeMessage<'_>;
 }

 /// Initial state of [`AuthFlow`].
@@ -21,8 +26,14 @@ pub struct Scram<'a>(pub &'a scram::ServerSecret);

 impl AuthMethod for Scram<'_> {
    #[inline(always)]
-    fn first_message(&self) -> BeMessage<'_> {
-        Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS))
+    fn first_message(&self, channel_binding: bool) -> BeMessage<'_> {
+        if channel_binding {
+            Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(scram::METHODS))
+        } else {
+            Be::AuthenticationSasl(BeAuthenticationSaslMessage::Methods(
+                scram::METHODS_WITHOUT_PLUS,
+            ))
+        }
    }
 }

@@ -32,7 +43,7 @@ pub struct PasswordHack;

 impl AuthMethod for PasswordHack {
    #[inline(always)]
-    fn first_message(&self) -> BeMessage<'_> {
+    fn first_message(&self, _channel_binding: bool) -> BeMessage<'_> {
        Be::AuthenticationCleartextPassword
    }
 }
@@ -43,37 +54,44 @@ pub struct CleartextPassword;

 impl AuthMethod for CleartextPassword {
    #[inline(always)]
-    fn first_message(&self) -> BeMessage<'_> {
+    fn first_message(&self, _channel_binding: bool) -> BeMessage<'_> {
        Be::AuthenticationCleartextPassword
    }
 }

 /// This wrapper for [`PqStream`] performs client authentication.
 #[must_use]
-pub struct AuthFlow<'a, Stream, State> {
+pub struct AuthFlow<'a, S, State> {
    /// The underlying stream which implements libpq's protocol.
-    stream: &'a mut PqStream<Stream>,
+    stream: &'a mut PqStream<Stream<S>>,
    /// State might contain ancillary data (see [`Self::begin`]).
    state: State,
+    tls_server_end_point: TlsServerEndPoint,
 }

 /// Initial state of the stream wrapper.
-impl<'a, S: AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
+impl<'a, S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'a, S, Begin> {
    /// Create a new wrapper for client authentication.
-    pub fn new(stream: &'a mut PqStream<S>) -> Self {
+    pub fn new(stream: &'a mut PqStream<Stream<S>>) -> Self {
+        let tls_server_end_point = stream.get_ref().tls_server_end_point();
+
        Self {
            stream,
            state: Begin,
+            tls_server_end_point,
        }
    }

    /// Move to the next step by sending auth method's name & params to client.
    pub async fn begin<M: AuthMethod>(self, method: M) -> io::Result<AuthFlow<'a, S, M>> {
-        self.stream.write_message(&method.first_message()).await?;
+        self.stream
+            .write_message(&method.first_message(self.tls_server_end_point.supported()))
+            .await?;

        Ok(AuthFlow {
            stream: self.stream,
            state: method,
+            tls_server_end_point: self.tls_server_end_point,
        })
    }
 }
@@ -123,9 +141,15 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, Scram<'_>> {
            return Err(super::AuthError::bad_auth_method(sasl.method));
        }

+        info!("client chooses {}", sasl.method);
+
        let secret = self.state.0;
        let outcome = sasl::SaslStream::new(self.stream, sasl.message)
-            .authenticate(scram::Exchange::new(secret, rand::random, None))
+            .authenticate(scram::Exchange::new(
+                secret,
+                rand::random,
+                self.tls_server_end_point,
+            ))
            .await?;

        Ok(outcome)
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -6,6 +6,8 @@
 use std::{net::SocketAddr, sync::Arc};

 use futures::future::Either;
+use itertools::Itertools;
+use proxy::config::TlsServerEndPoint;
 use tokio::net::TcpListener;

 use anyhow::{anyhow, bail, ensure, Context};
@@ -65,7 +67,7 @@ async fn main() -> anyhow::Result<()> {
    let destination: String = args.get_one::<String>("dest").unwrap().parse()?;

    // Configure TLS
-    let tls_config: Arc<rustls::ServerConfig> = match (
+    let (tls_config, tls_server_end_point): (Arc<rustls::ServerConfig>, TlsServerEndPoint) = match (
        args.get_one::<String>("tls-key"),
        args.get_one::<String>("tls-cert"),
    ) {
@@ -89,16 +91,22 @@ async fn main() -> anyhow::Result<()> {
                    ))?
                    .into_iter()
                    .map(rustls::Certificate)
-                    .collect()
+                    .collect_vec()
            };

-            rustls::ServerConfig::builder()
+            // needed for channel bindings
+            let first_cert = cert_chain.first().context("missing certificate")?;
+            let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
+
+            let tls_config = rustls::ServerConfig::builder()
                .with_safe_default_cipher_suites()
                .with_safe_default_kx_groups()
                .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
                .with_no_client_auth()
                .with_single_cert(cert_chain, key)?
-                .into()
+                .into();
+
+            (tls_config, tls_server_end_point)
        }
        _ => bail!("tls-key and tls-cert must be specified"),
    };
@@ -113,6 +121,7 @@ async fn main() -> anyhow::Result<()> {
    let main = tokio::spawn(task_main(
        Arc::new(destination),
        tls_config,
+        tls_server_end_point,
        proxy_listener,
        cancellation_token.clone(),
    ));
@@ -134,6 +143,7 @@ async fn main() -> anyhow::Result<()> {
 async fn task_main(
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
+    tls_server_end_point: TlsServerEndPoint,
    listener: tokio::net::TcpListener,
    cancellation_token: CancellationToken,
 ) -> anyhow::Result<()> {
@@ -159,7 +169,7 @@ async fn task_main(
                            .context("failed to set socket option")?;

                        info!(%peer_addr, "serving");
-                        handle_client(dest_suffix, tls_config, socket).await
+                        handle_client(dest_suffix, tls_config, tls_server_end_point, socket).await
                    }
                    .unwrap_or_else(|e| {
                        // Acknowledge that the task has finished with an error.
@@ -207,6 +217,7 @@ const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmod
 async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
    raw_stream: S,
    tls_config: Arc<rustls::ServerConfig>,
+    tls_server_end_point: TlsServerEndPoint,
 ) -> anyhow::Result<Stream<S>> {
    let mut stream = PqStream::new(Stream::from_raw(raw_stream));

@@ -231,7 +242,11 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
            if !read_buf.is_empty() {
                bail!("data is sent before server replied with EncryptionResponse");
            }
-            Ok(raw.upgrade(tls_config).await?)
+
+            Ok(Stream::Tls {
+                tls: Box::new(raw.upgrade(tls_config).await?),
+                tls_server_end_point,
+            })
        }
        unexpected => {
            info!(
@@ -246,9 +261,10 @@ async fn ssl_handshake<S: AsyncRead + AsyncWrite + Unpin>(
 async fn handle_client(
    dest_suffix: Arc<String>,
    tls_config: Arc<rustls::ServerConfig>,
+    tls_server_end_point: TlsServerEndPoint,
    stream: impl AsyncRead + AsyncWrite + Unpin,
 ) -> anyhow::Result<()> {
-    let tls_stream = ssl_handshake(stream, tls_config).await?;
+    let tls_stream = ssl_handshake(stream, tls_config, tls_server_end_point).await?;

    // Cut off first part of the SNI domain
    // We receive required destination details in the format of
@@ -268,5 +284,5 @@ async fn handle_client(
    let client = tokio::net::TcpStream::connect(destination).await?;

    let metrics_aux: MetricsAuxInfo = Default::default();
-    proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await
+    proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -1,8 +1,11 @@
 use futures::future::Either;
 use proxy::auth;
 use proxy::config::AuthenticationConfig;
+use proxy::config::CacheOptions;
 use proxy::config::HttpConfig;
 use proxy::console;
+use proxy::console::provider::AllowedIpsCache;
+use proxy::console::provider::NodeInfoCache;
 use proxy::http;
 use proxy::rate_limiter::RateLimiterConfig;
 use proxy::usage_metrics;
@@ -90,6 +93,9 @@ struct ProxyCliArgs {
    /// timeout for http connections
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    sql_over_http_timeout: tokio::time::Duration,
+    /// Whether the SQL over http pool is opt-in
+    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    sql_over_http_pool_opt_in: bool,
    /// timeout for scram authentication protocol
    #[clap(long, default_value = "15s", value_parser = humantime::parse_duration)]
    scram_protocol_timeout: tokio::time::Duration,
@@ -97,7 +103,7 @@ struct ProxyCliArgs {
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
    /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
-    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_dynamic_rate_limiter: bool,
    /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
    #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
@@ -110,6 +116,12 @@ struct ProxyCliArgs {
    initial_limit: usize,
    #[clap(flatten)]
    aimd_config: proxy::rate_limiter::AimdConfig,
+    /// cache for `allowed_ips` (use `size=0` to disable)
+    #[clap(long, default_value = config::CacheOptions::DEFAULT_OPTIONS_NODE_INFO)]
+    allowed_ips_cache: String,
+    /// disable ip check for http requests. If it is too time consuming, it could be turned off.
+    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    disable_ip_check_for_http: bool,
 }

 #[tokio::main]
@@ -238,11 +250,24 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {

    let auth_backend = match &args.auth_backend {
        AuthBackend::Console => {
-            let config::CacheOptions { size, ttl } = args.wake_compute_cache.parse()?;
+            let wake_compute_cache_config: CacheOptions = args.wake_compute_cache.parse()?;
+            let allowed_ips_cache_config: CacheOptions = args.allowed_ips_cache.parse()?;

-            info!("Using NodeInfoCache (wake_compute) with size={size} ttl={ttl:?}");
+            info!("Using NodeInfoCache (wake_compute) with options={wake_compute_cache_config:?}");
+            info!("Using AllowedIpsCache (wake_compute) with options={allowed_ips_cache_config:?}");
            let caches = Box::leak(Box::new(console::caches::ApiCaches {
-                node_info: console::caches::NodeInfoCache::new("node_info_cache", size, ttl),
+                node_info: NodeInfoCache::new(
+                    "node_info_cache",
+                    wake_compute_cache_config.size,
+                    wake_compute_cache_config.ttl,
+                    true,
+                ),
+                allowed_ips: AllowedIpsCache::new(
+                    "allowed_ips_cache",
+                    allowed_ips_cache_config.size,
+                    allowed_ips_cache_config.ttl,
+                    false,
+                ),
            }));

            let config::WakeComputeLockOptions {
@@ -275,7 +300,8 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        }
    };
    let http_config = HttpConfig {
-        sql_over_http_timeout: args.sql_over_http_timeout,
+        timeout: args.sql_over_http_timeout,
+        pool_opt_in: args.sql_over_http_pool_opt_in,
    };
    let authentication_config = AuthenticationConfig {
        scram_protocol_timeout: args.scram_protocol_timeout,
@@ -288,6 +314,7 @@ fn build_config(args: &ProxyCliArgs) -> anyhow::Result<&'static ProxyConfig> {
        http_config,
        authentication_config,
        require_client_ip: args.require_client_ip,
+        disable_ip_check_for_http: args.disable_ip_check_for_http,
    }));

    Ok(config)
--- a/proxy/src/cache.rs
+++ b/proxy/src/cache.rs
@@ -55,7 +55,7 @@ pub mod timed_lru {
    /// * Whenever a new entry is inserted, the least recently accessed one is evicted.
    ///   The cache also keeps track of entry's insertion time (`created_at`) and TTL (`expires_at`).
    ///
-    /// * When the entry is about to be retrieved, we check its expiration timestamp.
+    /// * If `update_ttl_on_retrieval` is `true`. When the entry is about to be retrieved, we check its expiration timestamp.
    ///   If the entry has expired, we remove it from the cache; Otherwise we bump the
    ///   expiration timestamp (e.g. +5mins) and change its place in LRU list to prolong
    ///   its existence.
@@ -79,6 +79,8 @@ pub mod timed_lru {

        /// Default time-to-live of a single entry.
        ttl: Duration,
+
+        update_ttl_on_retrieval: bool,
    }

    impl<K: Hash + Eq, V> Cache for TimedLru<K, V> {
@@ -99,11 +101,17 @@ pub mod timed_lru {

    impl<K: Hash + Eq, V> TimedLru<K, V> {
        /// Construct a new LRU cache with timed entries.
-        pub fn new(name: &'static str, capacity: usize, ttl: Duration) -> Self {
+        pub fn new(
+            name: &'static str,
+            capacity: usize,
+            ttl: Duration,
+            update_ttl_on_retrieval: bool,
+        ) -> Self {
            Self {
                name,
                cache: LruCache::new(capacity).into(),
                ttl,
+                update_ttl_on_retrieval,
            }
        }

@@ -165,7 +173,9 @@ pub mod timed_lru {
            let (created_at, expires_at) = (entry.created_at, entry.expires_at);

            // Update the deadline and the entry's position in the LRU list.
-            raw_entry.get_mut().expires_at = deadline;
+            if self.update_ttl_on_retrieval {
+                raw_entry.get_mut().expires_at = deadline;
+            }
            raw_entry.to_back();

            drop(cache); // drop lock before logging
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,9 +1,6 @@
 use crate::{
-    auth::parse_endpoint_param,
-    cancellation::CancelClosure,
-    console::errors::WakeComputeError,
-    error::{io_error, UserFacingError},
-    proxy::is_neon_param,
+    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
+    error::UserFacingError, proxy::is_neon_param,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
@@ -28,12 +25,9 @@ pub enum ConnectionError {

    #[error("{COULD_NOT_CONNECT}: {0}")]
    TlsError(#[from] native_tls::Error),
-}

-impl From<WakeComputeError> for ConnectionError {
-    fn from(value: WakeComputeError) -> Self {
-        io_error(value).into()
-    }
+    #[error("{COULD_NOT_CONNECT}: {0}")]
+    WakeComputeError(#[from] WakeComputeError),
 }

 impl UserFacingError for ConnectionError {
@@ -46,6 +40,7 @@ impl UserFacingError for ConnectionError {
                Some(err) => err.message().to_owned(),
                None => err.to_string(),
            },
+            WakeComputeError(err) => err.to_string_client(),
            _ => COULD_NOT_CONNECT.to_owned(),
        }
    }
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -1,12 +1,15 @@
 use crate::auth;
 use anyhow::{bail, ensure, Context, Ok};
-use rustls::sign;
+use rustls::{sign, Certificate, PrivateKey};
+use sha2::{Digest, Sha256};
 use std::{
    collections::{HashMap, HashSet},
    str::FromStr,
    sync::Arc,
    time::Duration,
 };
+use tracing::{error, info};
+use x509_parser::oid_registry;

 pub struct ProxyConfig {
    pub tls_config: Option<TlsConfig>,
@@ -16,6 +19,7 @@ pub struct ProxyConfig {
    pub http_config: HttpConfig,
    pub authentication_config: AuthenticationConfig,
    pub require_client_ip: bool,
+    pub disable_ip_check_for_http: bool,
 }

 #[derive(Debug)]
@@ -27,10 +31,12 @@ pub struct MetricCollectionConfig {
 pub struct TlsConfig {
    pub config: Arc<rustls::ServerConfig>,
    pub common_names: Option<HashSet<String>>,
+    pub cert_resolver: Arc<CertResolver>,
 }

 pub struct HttpConfig {
-    pub sql_over_http_timeout: tokio::time::Duration,
+    pub timeout: tokio::time::Duration,
+    pub pool_opt_in: bool,
 }

 pub struct AuthenticationConfig {
@@ -52,7 +58,7 @@ pub fn configure_tls(
    let mut cert_resolver = CertResolver::new();

    // add default certificate
-    cert_resolver.add_cert(key_path, cert_path, true)?;
+    cert_resolver.add_cert_path(key_path, cert_path, true)?;

    // add extra certificates
    if let Some(certs_dir) = certs_dir {
@@ -64,7 +70,7 @@ pub fn configure_tls(
                let key_path = path.join("tls.key");
                let cert_path = path.join("tls.crt");
                if key_path.exists() && cert_path.exists() {
-                    cert_resolver.add_cert(
+                    cert_resolver.add_cert_path(
                        &key_path.to_string_lossy(),
                        &cert_path.to_string_lossy(),
                        false,
@@ -76,35 +82,97 @@ pub fn configure_tls(

    let common_names = cert_resolver.get_common_names();

+    let cert_resolver = Arc::new(cert_resolver);
+
    let config = rustls::ServerConfig::builder()
        .with_safe_default_cipher_suites()
        .with_safe_default_kx_groups()
        // allow TLS 1.2 to be compatible with older client libraries
        .with_protocol_versions(&[&rustls::version::TLS13, &rustls::version::TLS12])?
        .with_no_client_auth()
-        .with_cert_resolver(Arc::new(cert_resolver))
+        .with_cert_resolver(cert_resolver.clone())
        .into();

    Ok(TlsConfig {
        config,
        common_names: Some(common_names),
+        cert_resolver,
    })
 }

-struct CertResolver {
-    certs: HashMap<String, Arc<rustls::sign::CertifiedKey>>,
-    default: Option<Arc<rustls::sign::CertifiedKey>>,
+/// Channel binding parameter
+///
+/// <https://www.rfc-editor.org/rfc/rfc5929#section-4>
+/// Description: The hash of the TLS server's certificate as it
+/// appears, octet for octet, in the server's Certificate message.  Note
+/// that the Certificate message contains a certificate_list, in which
+/// the first element is the server's certificate.
+///
+/// The hash function is to be selected as follows:
+///
+/// * if the certificate's signatureAlgorithm uses a single hash
+///   function, and that hash function is either MD5 or SHA-1, then use SHA-256;
+///
+/// * if the certificate's signatureAlgorithm uses a single hash
+///   function and that hash function neither MD5 nor SHA-1, then use
+///   the hash function associated with the certificate's
+///   signatureAlgorithm;
+///
+/// * if the certificate's signatureAlgorithm uses no hash functions or
+///   uses multiple hash functions, then this channel binding type's
+///   channel bindings are undefined at this time (updates to is channel
+///   binding type may occur to address this issue if it ever arises).
+#[derive(Debug, Clone, Copy)]
+pub enum TlsServerEndPoint {
+    Sha256([u8; 32]),
+    Undefined,
 }

-impl CertResolver {
-    fn new() -> Self {
-        Self {
-            certs: HashMap::new(),
-            default: None,
+impl TlsServerEndPoint {
+    pub fn new(cert: &Certificate) -> anyhow::Result<Self> {
+        let sha256_oids = [
+            // I'm explicitly not adding MD5 or SHA1 here... They're bad.
+            oid_registry::OID_SIG_ECDSA_WITH_SHA256,
+            oid_registry::OID_PKCS1_SHA256WITHRSA,
+        ];
+
+        let pem = x509_parser::parse_x509_certificate(&cert.0)
+            .context("Failed to parse PEM object from cerficiate")?
+            .1;
+
+        info!(subject = %pem.subject, "parsing TLS certificate");
+
+        let reg = oid_registry::OidRegistry::default().with_all_crypto();
+        let oid = pem.signature_algorithm.oid();
+        let alg = reg.get(oid);
+        if sha256_oids.contains(oid) {
+            let tls_server_end_point: [u8; 32] =
+                Sha256::new().chain_update(&cert.0).finalize().into();
+            info!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), tls_server_end_point = %base64::encode(tls_server_end_point), "determined channel binding");
+            Ok(Self::Sha256(tls_server_end_point))
+        } else {
+            error!(subject = %pem.subject, signature_algorithm = alg.map(|a| a.description()), "unknown channel binding");
+            Ok(Self::Undefined)
        }
    }

-    fn add_cert(
+    pub fn supported(&self) -> bool {
+        !matches!(self, TlsServerEndPoint::Undefined)
+    }
+}
+
+#[derive(Default)]
+pub struct CertResolver {
+    certs: HashMap<String, (Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
+    default: Option<(Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)>,
+}
+
+impl CertResolver {
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    fn add_cert_path(
        &mut self,
        key_path: &str,
        cert_path: &str,
@@ -120,57 +188,65 @@ impl CertResolver {
            keys.pop().map(rustls::PrivateKey).unwrap()
        };

-        let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
-
        let cert_chain_bytes = std::fs::read(cert_path)
            .context(format!("Failed to read TLS cert file at '{cert_path}.'"))?;

        let cert_chain = {
            rustls_pemfile::certs(&mut &cert_chain_bytes[..])
-                .context(format!(
+                .with_context(|| {
+                    format!(
                    "Failed to read TLS certificate chain from bytes from file at '{cert_path}'."
-                ))?
+                )
+                })?
                .into_iter()
                .map(rustls::Certificate)
                .collect()
        };

-        let common_name = {
-            let pem = x509_parser::pem::parse_x509_pem(&cert_chain_bytes)
-                .context(format!(
-                    "Failed to parse PEM object from bytes from file at '{cert_path}'."
-                ))?
-                .1;
-            let common_name = pem.parse_x509()?.subject().to_string();
+        self.add_cert(priv_key, cert_chain, is_default)
+    }

-            // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as
-            // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so
-            // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names
-            // and passed None instead, which blows up number of cases downstream code should handle. Proper coding
-            // here should better avoid Option for common_names, and do wildcard-based certificate selection instead
-            // of cutting off '*.' parts.
-            if common_name.starts_with("CN=*.") {
-                common_name.strip_prefix("CN=*.").map(|s| s.to_string())
-            } else {
-                common_name.strip_prefix("CN=").map(|s| s.to_string())
-            }
+    pub fn add_cert(
+        &mut self,
+        priv_key: PrivateKey,
+        cert_chain: Vec<Certificate>,
+        is_default: bool,
+    ) -> anyhow::Result<()> {
+        let key = sign::any_supported_type(&priv_key).context("invalid private key")?;
+
+        let first_cert = &cert_chain[0];
+        let tls_server_end_point = TlsServerEndPoint::new(first_cert)?;
+        let pem = x509_parser::parse_x509_certificate(&first_cert.0)
+            .context("Failed to parse PEM object from cerficiate")?
+            .1;
+
+        let common_name = pem.subject().to_string();
+
+        // We only use non-wildcard certificates in link proxy so it seems okay to treat them the same as
+        // wildcard ones as we don't use SNI there. That treatment only affects certificate selection, so
+        // verify-full will still check wildcard match. Old coding here just ignored non-wildcard common names
+        // and passed None instead, which blows up number of cases downstream code should handle. Proper coding
+        // here should better avoid Option for common_names, and do wildcard-based certificate selection instead
+        // of cutting off '*.' parts.
+        let common_name = if common_name.starts_with("CN=*.") {
+            common_name.strip_prefix("CN=*.").map(|s| s.to_string())
+        } else {
+            common_name.strip_prefix("CN=").map(|s| s.to_string())
        }
-        .context(format!(
-            "Failed to parse common name from certificate at '{cert_path}'."
-        ))?;
+        .context("Failed to parse common name from certificate")?;

        let cert = Arc::new(rustls::sign::CertifiedKey::new(cert_chain, key));

        if is_default {
-            self.default = Some(cert.clone());
+            self.default = Some((cert.clone(), tls_server_end_point));
        }

-        self.certs.insert(common_name, cert);
+        self.certs.insert(common_name, (cert, tls_server_end_point));

        Ok(())
    }

-    fn get_common_names(&self) -> HashSet<String> {
+    pub fn get_common_names(&self) -> HashSet<String> {
        self.certs.keys().map(|s| s.to_string()).collect()
    }
 }
@@ -178,15 +254,24 @@ impl CertResolver {
 impl rustls::server::ResolvesServerCert for CertResolver {
    fn resolve(
        &self,
-        _client_hello: rustls::server::ClientHello,
+        client_hello: rustls::server::ClientHello,
    ) -> Option<Arc<rustls::sign::CertifiedKey>> {
+        self.resolve(client_hello.server_name()).map(|x| x.0)
+    }
+}
+
+impl CertResolver {
+    pub fn resolve(
+        &self,
+        server_name: Option<&str>,
+    ) -> Option<(Arc<rustls::sign::CertifiedKey>, TlsServerEndPoint)> {
        // loop here and cut off more and more subdomains until we find
        // a match to get a proper wildcard support. OTOH, we now do not
        // use nested domains, so keep this simple for now.
        //
        // With the current coding foo.com will match *.foo.com and that
        // repeats behavior of the old code.
-        if let Some(mut sni_name) = _client_hello.server_name() {
+        if let Some(mut sni_name) = server_name {
            loop {
                if let Some(cert) = self.certs.get(sni_name) {
                    return Some(cert.clone());
@@ -214,6 +299,7 @@ impl rustls::server::ResolvesServerCert for CertResolver {
 }

 /// Helper for cmdline cache options parsing.
+#[derive(Debug)]
 pub struct CacheOptions {
    /// Max number of entries.
    pub size: usize,
--- a/proxy/src/console.rs
+++ b/proxy/src/console.rs
@@ -6,7 +6,7 @@ pub mod messages;

 /// Wrappers for console APIs and their mocks.
 pub mod provider;
-pub use provider::{errors, Api, AuthInfo, CachedNodeInfo, ConsoleReqExtra, NodeInfo};
+pub use provider::{errors, Api, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo};

 /// Various cache-related types.
 pub mod caches {
--- a/Show More
+++ b/Show More