Merge with #5837

Fix flushing prefetch requests in page_server_request
DNM: script for sharding demo
2026-05-30 03:20:36 +00:00 · 2023-12-05 12:05:27 +00:00 · 2023-12-05 12:05:27 +00:00 · 2023-12-05 12:05:27 +00:00 · 2023-12-05 12:05:27 +00:00 · 2023-12-05 12:05:27 +00:00
39 changed files with 1228 additions and 1002 deletions
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -387,20 +387,10 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN case "${PG_VERSION}" in \
-      "v14" | "v15") \
-        export TIMESCALEDB_VERSION=2.10.1 \
-        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
-        ;; \
-      *) \
-        export TIMESCALEDB_VERSION=2.13.0 \
-        export TIMESCALEDB_CHECKSUM=584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d \
-        ;; \
-    esac && \
-    apt-get update && \
+RUN apt-get update && \
    apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
-    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \
+    echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -274,13 +274,7 @@ fn main() -> Result<()> {
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
-            // Notify others that Postgres failed to start. In case of configuring the
-            // empty compute, it's likely that API handler is still waiting for compute
-            // state change. With this we will notify it that compute is in Failed state,
-            // so control plane will know about it earlier and record proper error instead
-            // of timeout.
-            compute.state_changed.notify_all();
-            drop(state); // unlock
+            drop(state);
            delay_exit = true;
            None
        }
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -15,7 +15,8 @@ use control_plane::pageserver::{PageServerNode, PAGESERVER_REMOTE_STORAGE_DIR};
 use control_plane::safekeeper::SafekeeperNode;
 use control_plane::tenant_migration::migrate_tenant;
 use control_plane::{broker, local_env};
-use pageserver_api::models::TimelineInfo;
+use pageserver_api::models::{LocationConfig, LocationConfigMode, TimelineInfo};
+use pageserver_api::shard::{ShardCount, ShardNumber, TenantShardId};
 use pageserver_api::{
    DEFAULT_HTTP_LISTEN_PORT as DEFAULT_PAGESERVER_HTTP_PORT,
    DEFAULT_PG_LISTEN_PORT as DEFAULT_PAGESERVER_PG_PORT,
@@ -26,6 +27,7 @@ use safekeeper_api::{
    DEFAULT_PG_LISTEN_PORT as DEFAULT_SAFEKEEPER_PG_PORT,
 };
 use std::collections::{BTreeSet, HashMap};
+use std::num::ParseIntError;
 use std::path::PathBuf;
 use std::process::exit;
 use std::str::FromStr;
@@ -97,6 +99,22 @@ struct TimelineTreeEl {
    pub children: BTreeSet<TimelineId>,
 }

+/// Helper for CLI args that contain a comma-separated list of NodeId.
+/// Returns `Ok(None)` when `arg` is absent, so each caller applies its own default.
+fn parse_ids_arg(
+    matches: &ArgMatches,
+    arg: &str,
+) -> Result<Option<Vec<NodeId>>, std::num::ParseIntError> {
+    // Absent maps to `None`, not a pageserver default: `--safekeepers` uses this too.
+    matches
+        .get_one::<String>(arg)
+        .map(|id_str| {
+            let ids = id_str.split(',').map(|id| u64::from_str(id.trim()).map(NodeId));
+            ids.collect::<Result<Vec<_>, ParseIntError>>()
+        })
+        .transpose()
+}
+
 // Main entry point for the 'neon_local' CLI utility
 //
 // This utility helps to manage neon installation. That includes following:
@@ -374,9 +392,10 @@ fn pageserver_config_overrides(init_match: &ArgMatches) -> Vec<&str> {
 }

 fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> anyhow::Result<()> {
-    let pageserver = get_default_pageserver(env);
    match tenant_match.subcommand() {
        Some(("list", _)) => {
+            // TODO: make command aware of multiple pageservers
+            let pageserver = get_default_pageserver(env);
            for t in pageserver.tenant_list()? {
                println!("{} {:?}", t.id, t.state);
            }
@@ -387,38 +406,94 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                .unwrap_or_default();

+            let shard_count: u8 = create_match
+                .get_one::<u8>("shard-count")
+                .cloned()
+                .unwrap_or(1);
+
            // If tenant ID was not specified, generate one
            let tenant_id = parse_tenant_id(create_match)?.unwrap_or_else(TenantId::generate);

-            let generation = if env.control_plane_api.is_some() {
-                // We must register the tenant with the attachment service, so
-                // that when the pageserver restarts, it will be re-attached.
-                let attachment_service = AttachmentService::from_env(env);
-                attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
-            } else {
-                None
-            };
-
-            pageserver.tenant_create(tenant_id, generation, tenant_conf)?;
-            println!("tenant {tenant_id} successfully created on the pageserver");
-
-            // Create an initial timeline for the new tenant
-            let new_timeline_id = parse_timeline_id(create_match)?;
+            // We will create an initial timeline for the new tenant
+            let new_timeline_id =
+                parse_timeline_id(create_match)?.unwrap_or(TimelineId::generate());
            let pg_version = create_match
                .get_one::<u32>("pg-version")
                .copied()
                .context("Failed to parse postgres version from the argument string")?;

-            let timeline_info = pageserver.timeline_create(
-                tenant_id,
-                new_timeline_id,
-                None,
-                None,
-                Some(pg_version),
-                None,
-            )?;
-            let new_timeline_id = timeline_info.timeline_id;
-            let last_record_lsn = timeline_info.last_record_lsn;
+            // TODO: implement ability for one pageserver to hold multiple
+            // shards for the same tenant.  Until then, we must place each
+            // shard on a different pageserver.
+            assert!(env.pageservers.len() >= shard_count as usize);
+
+            let cfg_shard_count = if shard_count > 1 {
+                shard_count
+            } else {
+                // For single-sharded mode, use the legacy unsharded configuration.  This avoids
+                // breaking any existing tests that assume legacy unsharded storage paths
+                0
+            };
+
+            for shard_number in 0..shard_count {
+                let ps_conf = env.pageservers.get(shard_number as usize).unwrap();
+                let pageserver = PageServerNode::from_env(env, ps_conf);
+
+                // TODO: per-shard generations
+                let generation = if env.control_plane_api.is_some() {
+                    // We must register the tenant with the attachment service, so
+                    // that when the pageserver restarts, it will be re-attached.
+                    let attachment_service = AttachmentService::from_env(env);
+                    attachment_service.attach_hook(tenant_id, pageserver.conf.id)?
+                } else {
+                    None
+                };
+
+                // TODO: shard-aware POST /v1/tenant.  Currently tenant creation on the
+                // pageserver is a no-op, but we shouldn't skip the command entirely.
+
+                let tenant_conf = PageServerNode::build_config(tenant_conf.clone())?;
+
+                let tenant_shard_id = TenantShardId {
+                    shard_number: ShardNumber(shard_number),
+                    shard_count: ShardCount(cfg_shard_count),
+                    tenant_id,
+                };
+
+                let location_conf = LocationConfig {
+                    shard_count: cfg_shard_count,
+                    shard_number,
+                    shard_stripe_size: 32768,
+                    mode: LocationConfigMode::AttachedSingle,
+                    generation,
+                    secondary_conf: None,
+                    tenant_conf,
+                };
+                pageserver.location_config(tenant_shard_id, location_conf, None)?;
+                println!(
+                    "tenant {tenant_id} successfully created on pageserver {}",
+                    pageserver.conf.id
+                );
+            }
+
+            for shard_number in 0..shard_count {
+                let ps_conf = env.pageservers.get(shard_number as usize).unwrap();
+                let pageserver = PageServerNode::from_env(env, ps_conf);
+                let tenant_shard_id = TenantShardId {
+                    shard_number: ShardNumber(shard_number),
+                    shard_count: ShardCount(cfg_shard_count),
+                    tenant_id,
+                };
+
+                pageserver.timeline_create(
+                    tenant_shard_id,
+                    Some(new_timeline_id),
+                    None,
+                    None,
+                    Some(pg_version),
+                    None,
+                )?;
+            }

            env.register_branch_mapping(
                DEFAULT_BRANCH_NAME.to_string(),
@@ -426,9 +501,7 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                new_timeline_id,
            )?;

-            println!(
-                "Created an initial timeline '{new_timeline_id}' at Lsn {last_record_lsn} for tenant: {tenant_id}",
-            );
+            println!("Created an initial timeline '{new_timeline_id}' for tenant: {tenant_id}",);

            if create_match.get_flag("set-default") {
                println!("Setting tenant {tenant_id} as a default one");
@@ -448,6 +521,8 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                .map(|vals| vals.flat_map(|c| c.split_once(':')).collect())
                .unwrap_or_default();

+            // TODO: make command aware of multiple pageservers
+            let pageserver = get_default_pageserver(env);
            pageserver
                .tenant_config(tenant_id, tenant_conf)
                .with_context(|| format!("Tenant config failed for tenant with id {tenant_id}"))?;
@@ -491,7 +566,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
            let new_timeline_id_opt = parse_timeline_id(create_match)?;

            let timeline_info = pageserver.timeline_create(
-                tenant_id,
+                TenantShardId::unsharded(tenant_id),
                new_timeline_id_opt,
                None,
                None,
@@ -554,7 +629,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                None,
                pg_version,
                ComputeMode::Primary,
-                DEFAULT_PAGESERVER_ID,
+                vec![DEFAULT_PAGESERVER_ID],
            )?;
            println!("Done");
        }
@@ -579,7 +654,7 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                .transpose()
                .context("Failed to parse ancestor start Lsn from the request")?;
            let timeline_info = pageserver.timeline_create(
-                tenant_id,
+                TenantShardId::unsharded(tenant_id),
                None,
                start_lsn,
                Some(ancestor_timeline_id),
@@ -704,13 +779,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .copied()
                .unwrap_or(false);

-            let pageserver_id =
-                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    NodeId(id_str.parse().context("while parsing pageserver id")?)
-                } else {
-                    DEFAULT_PAGESERVER_ID
-                };
-
+            let pageserver_ids = parse_ids_arg(sub_args, "endpoint-pageserver-id")?
+                .unwrap_or(vec![DEFAULT_PAGESERVER_ID]);
            let mode = match (lsn, hot_standby) {
                (Some(lsn), false) => ComputeMode::Static(lsn),
                (None, true) => ComputeMode::Replica,
@@ -738,7 +808,7 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                http_port,
                pg_version,
                mode,
-                pageserver_id,
+                pageserver_ids,
            )?;
        }
        "start" => {
@@ -746,29 +816,14 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .get_one::<String>("endpoint_id")
                .ok_or_else(|| anyhow!("No endpoint ID was provided to start"))?;

-            let pageserver_id =
-                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    NodeId(id_str.parse().context("while parsing pageserver id")?)
-                } else {
-                    DEFAULT_PAGESERVER_ID
-                };
+            let pageservers = parse_ids_arg(sub_args, "endpoint-pageserver-id")?
+                .unwrap_or(vec![DEFAULT_PAGESERVER_ID]);

            let remote_ext_config = sub_args.get_one::<String>("remote-ext-config");

            // If --safekeepers argument is given, use only the listed safekeeper nodes.
-            let safekeepers =
-                if let Some(safekeepers_str) = sub_args.get_one::<String>("safekeepers") {
-                    let mut safekeepers: Vec<NodeId> = Vec::new();
-                    for sk_id in safekeepers_str.split(',').map(str::trim) {
-                        let sk_id = NodeId(u64::from_str(sk_id).map_err(|_| {
-                            anyhow!("invalid node ID \"{sk_id}\" in --safekeepers list")
-                        })?);
-                        safekeepers.push(sk_id);
-                    }
-                    safekeepers
-                } else {
-                    env.safekeepers.iter().map(|sk| sk.id).collect()
-                };
+            let safekeepers = parse_ids_arg(sub_args, "safekeepers")?
+                .unwrap_or_else(|| env.safekeepers.iter().map(|sk| sk.id).collect());

            let endpoint = cplane
                .endpoints
@@ -781,7 +836,8 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                endpoint.timeline_id,
            )?;

-            let ps_conf = env.get_pageserver_conf(pageserver_id)?;
+            // We assume that all pageservers have the same auth conf
+            let ps_conf = env.get_pageserver_conf(pageservers[0])?;
            let auth_token = if matches!(ps_conf.pg_auth_type, AuthType::NeonJWT) {
                let claims = Claims::new(Some(endpoint.tenant_id), Scope::Tenant);

@@ -801,15 +857,21 @@ fn handle_endpoint(ep_match: &ArgMatches, env: &local_env::LocalEnv) -> Result<(
                .endpoints
                .get(endpoint_id.as_str())
                .with_context(|| format!("postgres endpoint {endpoint_id} is not found"))?;
-            let pageserver_id =
-                if let Some(id_str) = sub_args.get_one::<String>("endpoint-pageserver-id") {
-                    Some(NodeId(
-                        id_str.parse().context("while parsing pageserver id")?,
-                    ))
-                } else {
-                    None
-                };
-            endpoint.reconfigure(pageserver_id)?;
+            let pageserver_ids: Option<Result<Vec<NodeId>, _>> = sub_args
+                .get_many::<String>("endpoint-pageserver-id")
+                .map(|ids| {
+                    ids.map(|id_str| id_str.parse().context("while parsing pageserver id"))
+                        .map(|r| r.map(NodeId))
+                        .collect()
+                });
+
+            let pageserver_ids = match pageserver_ids {
+                Some(Ok(v)) => Ok(Some(v)),
+                Some(Err(e)) => Err(e),
+                None => Ok(None),
+            }?;
+
+            endpoint.reconfigure(pageserver_ids)?;
        }
        "stop" => {
            let endpoint_id = sub_args
@@ -1313,6 +1375,7 @@ fn cli() -> Command {
                .arg(pg_version_arg.clone())
                .arg(Arg::new("set-default").long("set-default").action(ArgAction::SetTrue).required(false)
                    .help("Use this tenant in future CLI commands where tenant_id is needed, but not specified"))
+                .arg(Arg::new("shard-count").value_parser(value_parser!(u8)).long("shard-count").action(ArgAction::Set).help("Number of shards in the new tenant (default 1)"))
                )
            .subcommand(Command::new("set-default").arg(tenant_id_arg.clone().required(true))
                .about("Set a particular tenant as default in future CLI commands where tenant_id is needed, but not specified"))
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -67,7 +67,7 @@ pub struct EndpointConf {
    http_port: u16,
    pg_version: u32,
    skip_pg_catalog_updates: bool,
-    pageserver_id: NodeId,
+    pageservers: Vec<NodeId>,
 }

 //
@@ -82,6 +82,33 @@ pub struct ComputeControlPlane {
    env: LocalEnv,
 }

+/// Builds a `PageServerNode` for every ID in `pageserver_ids`, preserving order.
+/// Returns the first error from `env.get_pageserver_conf` if an ID cannot be resolved.
+fn load_pageservers(
+    env: &LocalEnv,
+    pageserver_ids: &[NodeId],
+) -> anyhow::Result<Vec<PageServerNode>> {
+    let mut pageservers = Vec::with_capacity(pageserver_ids.len());
+    for ps_id in pageserver_ids {
+        let conf = env.get_pageserver_conf(*ps_id)?;
+        pageservers.push(PageServerNode::from_env(env, conf));
+    }
+    Ok(pageservers)
+}
+
+/// Renders one `postgresql://` URL per pageserver, in order, joined with commas.
+fn build_pageserver_connstr(pageservers: &[PageServerNode]) -> String {
+    pageservers
+        .iter()
+        .map(|ps| {
+            let config = &ps.pg_connection_config;
+            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
+            format!("postgresql://no_user@{}:{}", config.host(), config.port())
+        })
+        .collect::<Vec<_>>()
+        .join(",")
+}
+
 impl ComputeControlPlane {
    // Load current endpoints from the endpoints/ subdirectories
    pub fn load(env: LocalEnv) -> Result<ComputeControlPlane> {
@@ -119,19 +146,16 @@ impl ComputeControlPlane {
        http_port: Option<u16>,
        pg_version: u32,
        mode: ComputeMode,
-        pageserver_id: NodeId,
+        pageservers: Vec<NodeId>,
    ) -> Result<Arc<Endpoint>> {
        let pg_port = pg_port.unwrap_or_else(|| self.get_port());
        let http_port = http_port.unwrap_or_else(|| self.get_port() + 1);
-        let pageserver =
-            PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
-
        let ep = Arc::new(Endpoint {
            endpoint_id: endpoint_id.to_owned(),
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), http_port),
            env: self.env.clone(),
-            pageserver,
+            pageservers: load_pageservers(&self.env, &pageservers)?,
            timeline_id,
            mode,
            tenant_id,
@@ -157,7 +181,7 @@ impl ComputeControlPlane {
                pg_port,
                pg_version,
                skip_pg_catalog_updates: true,
-                pageserver_id,
+                pageservers,
            })?,
        )?;
        std::fs::write(
@@ -216,7 +240,7 @@ pub struct Endpoint {
    // These are not part of the endpoint as such, but the environment
    // the endpoint runs in.
    pub env: LocalEnv,
-    pageserver: PageServerNode,
+    pageservers: Vec<PageServerNode>,

    // Optimizations
    skip_pg_catalog_updates: bool,
@@ -239,15 +263,14 @@ impl Endpoint {
        let conf: EndpointConf =
            serde_json::from_slice(&std::fs::read(entry.path().join("endpoint.json"))?)?;

-        let pageserver =
-            PageServerNode::from_env(env, env.get_pageserver_conf(conf.pageserver_id)?);
+        let pageservers: Vec<PageServerNode> = load_pageservers(env, &conf.pageservers)?;

        Ok(Endpoint {
            pg_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.pg_port),
            http_address: SocketAddr::new("127.0.0.1".parse().unwrap(), conf.http_port),
            endpoint_id,
            env: env.clone(),
-            pageserver,
+            pageservers,
            timeline_id: conf.timeline_id,
            mode: conf.mode,
            tenant_id: conf.tenant_id,
@@ -482,13 +505,7 @@ impl Endpoint {
            std::fs::remove_dir_all(self.pgdata())?;
        }

-        let pageserver_connstring = {
-            let config = &self.pageserver.pg_connection_config;
-            let (host, port) = (config.host(), config.port());
-
-            // NOTE: avoid spaces in connection string, because it is less error prone if we forward it somewhere.
-            format!("postgresql://no_user@{host}:{port}")
-        };
+        let pageserver_connstring = build_pageserver_connstr(&self.pageservers);
        let mut safekeeper_connstrings = Vec::new();
        if self.mode == ComputeMode::Primary {
            for sk_id in safekeepers {
@@ -658,7 +675,7 @@ impl Endpoint {
        }
    }

-    pub fn reconfigure(&self, pageserver_id: Option<NodeId>) -> Result<()> {
+    pub fn reconfigure(&self, pageservers: Option<Vec<NodeId>>) -> Result<()> {
        let mut spec: ComputeSpec = {
            let spec_path = self.endpoint_path().join("spec.json");
            let file = std::fs::File::open(spec_path)?;
@@ -668,23 +685,20 @@ impl Endpoint {
        let postgresql_conf = self.read_postgresql_conf()?;
        spec.cluster.postgresql_conf = Some(postgresql_conf);

-        if let Some(pageserver_id) = pageserver_id {
+        if let Some(pageservers) = pageservers {
            let endpoint_config_path = self.endpoint_path().join("endpoint.json");
            let mut endpoint_conf: EndpointConf = {
                let file = std::fs::File::open(&endpoint_config_path)?;
                serde_json::from_reader(file)?
            };
-            endpoint_conf.pageserver_id = pageserver_id;
+            endpoint_conf.pageservers = pageservers.clone();
            std::fs::write(
                endpoint_config_path,
                serde_json::to_string_pretty(&endpoint_conf)?,
            )?;

-            let pageserver =
-                PageServerNode::from_env(&self.env, self.env.get_pageserver_conf(pageserver_id)?);
-            let ps_http_conf = &pageserver.pg_connection_config;
-            let (host, port) = (ps_http_conf.host(), ps_http_conf.port());
-            spec.pageserver_connstring = Some(format!("postgresql://no_user@{host}:{port}"));
+            let pageservers = load_pageservers(&self.env, &pageservers)?;
+            spec.pageserver_connstring = Some(build_pageserver_connstr(&pageservers));
        }

        let client = reqwest::blocking::Client::new();
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -340,15 +340,8 @@ impl PageServerNode {
            .json()?)
    }

-    pub fn tenant_create(
-        &self,
-        new_tenant_id: TenantId,
-        generation: Option<u32>,
-        settings: HashMap<&str, &str>,
-    ) -> anyhow::Result<TenantId> {
-        let mut settings = settings.clone();
-
-        let config = models::TenantConfig {
+    pub fn build_config(mut settings: HashMap<&str, &str>) -> anyhow::Result<models::TenantConfig> {
+        Ok(models::TenantConfig {
            checkpoint_distance: settings
                .remove("checkpoint_distance")
                .map(|x| x.parse::<u64>())
@@ -407,8 +400,16 @@ impl PageServerNode {
                .map(|x| x.parse::<bool>())
                .transpose()
                .context("Failed to parse 'gc_feedback' as bool")?,
-        };
+        })
+    }

+    pub fn tenant_create(
+        &self,
+        new_tenant_id: TenantId,
+        generation: Option<u32>,
+        settings: HashMap<&str, &str>,
+    ) -> anyhow::Result<TenantId> {
+        let config = Self::build_config(settings.clone())?;
        let request = models::TenantCreateRequest {
            new_tenant_id: TenantShardId::unsharded(new_tenant_id),
            generation,
@@ -521,15 +522,18 @@ impl PageServerNode {

    pub fn location_config(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        config: LocationConfig,
        flush_ms: Option<Duration>,
    ) -> anyhow::Result<()> {
-        let req_body = TenantLocationConfigRequest { tenant_id, config };
+        let req_body = TenantLocationConfigRequest {
+            tenant_shard_id,
+            config,
+        };

        let path = format!(
            "{}/tenant/{}/location_config",
-            self.http_base_url, tenant_id
+            self.http_base_url, tenant_shard_id
        );
        let path = if let Some(flush_ms) = flush_ms {
            format!("{}?flush_ms={}", path, flush_ms.as_millis())
@@ -560,7 +564,7 @@ impl PageServerNode {

    pub fn timeline_create(
        &self,
-        tenant_id: TenantId,
+        tenant_shard_id: TenantShardId,
        new_timeline_id: Option<TimelineId>,
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
@@ -572,7 +576,7 @@ impl PageServerNode {

        self.http_request(
            Method::POST,
-            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_id),
+            format!("{}/tenant/{}/timeline", self.http_base_url, tenant_shard_id),
        )?
        .json(&models::TimelineCreateRequest {
            new_timeline_id,
@@ -585,11 +589,11 @@ impl PageServerNode {
        .error_from_body()?
        .json::<Option<TimelineInfo>>()
        .with_context(|| {
-            format!("Failed to parse timeline creation response for tenant id: {tenant_id}")
+            format!("Failed to parse timeline creation response for tenant id: {tenant_shard_id}")
        })?
        .with_context(|| {
            format!(
-                "No timeline id was found in the timeline creation response for tenant {tenant_id}"
+                "No timeline id was found in the timeline creation response for tenant {tenant_shard_id}"
            )
        })
    }
--- a/control_plane/src/tenant_migration.rs
+++ b/control_plane/src/tenant_migration.rs
@@ -11,6 +11,7 @@ use crate::{
 use pageserver_api::models::{
    LocationConfig, LocationConfigMode, LocationConfigSecondary, TenantConfig,
 };
+use pageserver_api::shard::TenantShardId;
 use std::collections::HashMap;
 use std::time::Duration;
 use utils::{
@@ -108,6 +109,9 @@ pub fn migrate_tenant(
        }
    }

+    // No support for sharding in this function yet
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let previous = attachment_service.inspect(tenant_id)?;
    let mut baseline_lsns = None;
    if let Some((generation, origin_ps_id)) = &previous {
@@ -117,7 +121,7 @@ pub fn migrate_tenant(
            println!("🔁 Already attached to {origin_ps_id}, freshening...");
            let gen = attachment_service.attach_hook(tenant_id, dest_ps.conf.id)?;
            let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-            dest_ps.location_config(tenant_id, dest_conf, None)?;
+            dest_ps.location_config(tenant_shard_id, dest_conf, None)?;
            println!("✅ Migration complete");
            return Ok(());
        }
@@ -126,7 +130,7 @@ pub fn migrate_tenant(

        let stale_conf =
            build_location_config(LocationConfigMode::AttachedStale, Some(*generation), None);
-        origin_ps.location_config(tenant_id, stale_conf, Some(Duration::from_secs(10)))?;
+        origin_ps.location_config(tenant_shard_id, stale_conf, Some(Duration::from_secs(10)))?;

        baseline_lsns = Some(get_lsns(tenant_id, &origin_ps)?);
    }
@@ -135,7 +139,7 @@ pub fn migrate_tenant(
    let dest_conf = build_location_config(LocationConfigMode::AttachedMulti, gen, None);

    println!("🔁 Attaching to pageserver {}", dest_ps.conf.id);
-    dest_ps.location_config(tenant_id, dest_conf, None)?;
+    dest_ps.location_config(tenant_shard_id, dest_conf, None)?;

    if let Some(baseline) = baseline_lsns {
        println!("🕑 Waiting for LSN to catch up...");
@@ -149,7 +153,7 @@ pub fn migrate_tenant(
                "🔁 Reconfiguring endpoint {} to use pageserver {}",
                endpoint_name, dest_ps.conf.id
            );
-            endpoint.reconfigure(Some(dest_ps.conf.id))?;
+            endpoint.reconfigure(Some(vec![dest_ps.conf.id]))?;
        }
    }

@@ -181,7 +185,7 @@ pub fn migrate_tenant(
            "💤 Switching to secondary mode on pageserver {}",
            other_ps.conf.id
        );
-        other_ps.location_config(tenant_id, secondary_conf, None)?;
+        other_ps.location_config(tenant_shard_id, secondary_conf, None)?;
    }

    println!(
@@ -189,7 +193,7 @@ pub fn migrate_tenant(
        dest_ps.conf.id
    );
    let dest_conf = build_location_config(LocationConfigMode::AttachedSingle, gen, None);
-    dest_ps.location_config(tenant_id, dest_conf, None)?;
+    dest_ps.location_config(tenant_shard_id, dest_conf, None)?;

    println!("✅ Migration complete");

--- a/demo_sharding.sh
+++ b/demo_sharding.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+# Demo: wipe any local .neon env, start one pageserver per shard, create a sharded tenant, load pgbench data.
+export RUST_LOG=DEBUG
+SHARDS=4
+PAGESERVERS=$(seq -s , 1 "$SHARDS")
+SCALE=10
+ARGS=--features=testing
+
+# Best-effort cleanup of a previous run: these commands may fail, so errexit is off for this step only.
+
+set +e
+cargo neon $ARGS stop ; killall -9 storage_broker ; killall -9 safekeeper ; killall -9 pageserver ; killall -9 postgres ; killall -9 attachment_service ; rm -rf .neon
+set -e
+
+cargo build --package=pageserver && cargo neon $ARGS init --num-pageservers=$SHARDS && RUST_LOG=debug cargo neon $ARGS start && cargo neon $ARGS tenant create --shard-count=$SHARDS --tenant-id=1f359dd625e519a1a4e8d7509690f6fc --timeline-id=3d34095be52fec4c44a92e774c573b57 --set-default
+
+cargo neon $ARGS endpoint create --pageserver-id=$PAGESERVERS && cargo neon $ARGS endpoint start --pageserver-id=$PAGESERVERS ep-main
+
+pgbench postgres -i -h 127.0.0.1 -p 55432 -U cloud_admin -s $SCALE
+
+du -sh .neon/local_fs_remote_storage/pageserver/tenants/1f359dd625e519a1a4e8d7509690f6fc*
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -293,7 +293,7 @@ pub struct StatusResponse {
 #[derive(Serialize, Deserialize, Debug)]
 #[serde(deny_unknown_fields)]
 pub struct TenantLocationConfigRequest {
-    pub tenant_id: TenantId,
+    pub tenant_shard_id: TenantShardId,
    #[serde(flatten)]
    pub config: LocationConfig, // as we have a flattened field, we should reject all unknown fields in it
 }
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -73,28 +73,19 @@ impl TenantShardId {
        )
    }

-    pub fn shard_slug(&self) -> impl std::fmt::Display + '_ {
-        ShardSlug(self)
-    }
-}
-
-/// Formatting helper
-struct ShardSlug<'a>(&'a TenantShardId);
-
-impl<'a> std::fmt::Display for ShardSlug<'a> {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(
-            f,
-            "{:02x}{:02x}",
-            self.0.shard_number.0, self.0.shard_count.0
-        )
+    pub fn shard_slug(&self) -> String {
+        format!("{:02x}{:02x}", self.shard_number.0, self.shard_count.0)
    }
 }

 impl std::fmt::Display for TenantShardId {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        if self.shard_count != ShardCount(0) {
-            write!(f, "{}-{}", self.tenant_id, self.shard_slug())
+            write!(
+                f,
+                "{}-{:02x}{:02x}",
+                self.tenant_id, self.shard_number.0, self.shard_count.0
+            )
        } else {
            // Legacy case (shard_count == 0) -- format as just the tenant id.  Note that this
            // is distinct from the normal single shard case (shard count == 1).
@@ -420,6 +411,12 @@ impl ShardIdentity {
            String::new()
        }
    }
+
+    /// Convenience for checking if this identity is the 0th shard in a tenant,
+    /// for special cases on shard 0 such as ingesting relation sizes.
+    pub fn is_zero(&self) -> bool {
+        self.number == ShardNumber(0)
+    }
 }

 impl Serialize for ShardIndex {
--- a/libs/utils/src/simple_rcu.rs
+++ b/libs/utils/src/simple_rcu.rs
@@ -1,10 +1,10 @@
 //!
 //! RCU stands for Read-Copy-Update. It's a synchronization mechanism somewhat
 //! similar to a lock, but it allows readers to "hold on" to an old value of RCU
-//! without blocking writers, and allows writing a new value without blocking
-//! readers. When you update the value, the new value is immediately visible
+//! without blocking writers, and allows writing a new value without blocking
+//! readers. When you update the value, the new value is immediately visible
 //! to new readers, but the update waits until all existing readers have
-//! finished, so that on return, no one sees the old value anymore.
+//! finished, so that no one sees the old value anymore.
 //!
 //! This implementation isn't wait-free; it uses an RwLock that is held for a
 //! short duration when the value is read or updated.
@@ -26,7 +26,6 @@
 //! Increment the value by one, and wait for old readers to finish:
 //!
 //! ```
-//! # async fn dox() {
 //! # let rcu = utils::simple_rcu::Rcu::new(1);
 //! let write_guard = rcu.lock_for_write();
 //!
@@ -37,17 +36,15 @@
 //!
 //! // Concurrent reads and writes are now possible again. Wait for all the readers
 //! // that still observe the old value to finish.
-//! waitlist.wait().await;
-//! # }
+//! waitlist.wait();
 //! ```
 //!
 #![warn(missing_docs)]

 use std::ops::Deref;
+use std::sync::mpsc::{sync_channel, Receiver, SyncSender};
 use std::sync::{Arc, Weak};
-use std::sync::{RwLock, RwLockWriteGuard};
-
-use tokio::sync::watch;
+use std::sync::{Mutex, RwLock, RwLockWriteGuard};

 ///
 /// Rcu allows multiple readers to read and hold onto a value without blocking
@@ -71,21 +68,22 @@ struct RcuCell<V> {
    value: V,

    /// A dummy channel. We never send anything to this channel. The point is
-    /// that when the RcuCell is dropped, any subscribed Receivers will be notified
+    /// that when the RcuCell is dropped, any cloned Senders will be notified
    /// that the channel is closed. Updaters can use this to wait out until the
    /// RcuCell has been dropped, i.e. until the old value is no longer in use.
    ///
-    /// We never send anything to this, we just need to hold onto it so that the
-    /// Receivers will be notified when it's dropped.
-    watch: watch::Sender<()>,
+    /// We never do anything with the receiver, we just need to hold onto it so
+    /// that the Senders will be notified when it's dropped. But because it's
+    /// not Sync, we need a Mutex on it.
+    watch: (SyncSender<()>, Mutex<Receiver<()>>),
 }

 impl<V> RcuCell<V> {
    fn new(value: V) -> Self {
-        let (watch_sender, _) = watch::channel(());
+        let (watch_sender, watch_receiver) = sync_channel(0);
        RcuCell {
            value,
-            watch: watch_sender,
+            watch: (watch_sender, Mutex::new(watch_receiver)),
        }
    }
 }
@@ -143,10 +141,10 @@ impl<V> Deref for RcuReadGuard<V> {
 ///
 /// Write guard returned by `write`
 ///
-/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so it should only be
-/// held for a short duration!
+/// NB: Holding this guard blocks all concurrent `read` and `write` calls, so
+/// it should only be held for a short duration!
 ///
-/// Calling [`Self::store_and_unlock`] consumes the guard, making new reads and new writes possible
+/// Calling `store` consumes the guard, making new reads and new writes possible
 /// again.
 ///
 pub struct RcuWriteGuard<'a, V> {
@@ -181,7 +179,7 @@ impl<'a, V> RcuWriteGuard<'a, V> {
            // the watches for any that do.
            self.inner.old_cells.retain(|weak| {
                if let Some(cell) = weak.upgrade() {
-                    watches.push(cell.watch.subscribe());
+                    watches.push(cell.watch.0.clone());
                    true
                } else {
                    false
@@ -195,20 +193,20 @@ impl<'a, V> RcuWriteGuard<'a, V> {
 ///
 /// List of readers who can still see old values.
 ///
-pub struct RcuWaitList(Vec<watch::Receiver<()>>);
+pub struct RcuWaitList(Vec<SyncSender<()>>);

 impl RcuWaitList {
    ///
    /// Wait for old readers to finish.
    ///
-    pub async fn wait(mut self) {
+    pub fn wait(mut self) {
        // after all the old_cells are no longer in use, we're done
        for w in self.0.iter_mut() {
            // This will block until the Receiver is closed. That happens when
            // the RcuCell is dropped.
            #[allow(clippy::single_match)]
-            match w.changed().await {
-                Ok(_) => panic!("changed() unexpectedly succeeded on dummy channel"),
+            match w.send(()) {
+                Ok(_) => panic!("send() unexpectedly succeeded on dummy channel"),
                Err(_) => {
                    // closed, which means that the cell has been dropped, and
                    // its value is no longer in use
@@ -222,10 +220,11 @@ impl RcuWaitList {
 mod tests {
    use super::*;
    use std::sync::{Arc, Mutex};
+    use std::thread::{sleep, spawn};
    use std::time::Duration;

-    #[tokio::test]
-    async fn two_writers() {
+    #[test]
+    fn two_writers() {
        let rcu = Rcu::new(1);

        let read1 = rcu.read();
@@ -249,35 +248,33 @@ mod tests {
        assert_eq!(*read1, 1);

        let log = Arc::new(Mutex::new(Vec::new()));
-        // Wait for the old readers to finish in separate tasks.
+        // Wait for the old readers to finish in separate threads.
        let log_clone = Arc::clone(&log);
-        let task2 = tokio::spawn(async move {
-            wait2.wait().await;
+        let thread2 = spawn(move || {
+            wait2.wait();
            log_clone.lock().unwrap().push("wait2 done");
        });
        let log_clone = Arc::clone(&log);
-        let task3 = tokio::spawn(async move {
-            wait3.wait().await;
+        let thread3 = spawn(move || {
+            wait3.wait();
            log_clone.lock().unwrap().push("wait3 done");
        });

        // without this sleep the test can pass on accident if the writer is slow
-        tokio::time::sleep(Duration::from_millis(100)).await;
+        sleep(Duration::from_millis(500));

        // Release first reader. This allows first write to finish, but calling
-        // wait() on the 'task3' would still block.
+        // wait() on the second one would still block.
        log.lock().unwrap().push("dropping read1");
        drop(read1);
-        task2.await.unwrap();
+        thread2.join().unwrap();

-        assert!(!task3.is_finished());
-
-        tokio::time::sleep(Duration::from_millis(100)).await;
+        sleep(Duration::from_millis(500));

        // Release second reader, and finish second writer.
        log.lock().unwrap().push("dropping read2");
        drop(read2);
-        task3.await.unwrap();
+        thread3.join().unwrap();

        assert_eq!(
            log.lock().unwrap().as_slice(),
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -3,6 +3,7 @@ use pageserver::repository::Key;
 use pageserver::tenant::layer_map::LayerMap;
 use pageserver::tenant::storage_layer::LayerFileName;
 use pageserver::tenant::storage_layer::PersistentLayerDesc;
+use pageserver_api::shard::TenantShardId;
 use rand::prelude::{SeedableRng, SliceRandom, StdRng};
 use std::cmp::{max, min};
 use std::fs::File;
@@ -10,6 +11,7 @@ use std::io::{BufRead, BufReader};
 use std::path::PathBuf;
 use std::str::FromStr;
 use std::time::Instant;
+use utils::id::{TenantId, TimelineId};

 use utils::lsn::Lsn;

@@ -209,8 +211,13 @@ fn bench_sequential(c: &mut Criterion) {
    for i in 0..100_000 {
        let i32 = (i as u32) % 100;
        let zero = Key::from_hex("000000000000000000000000000000000000").unwrap();
-        let layer =
-            PersistentLayerDesc::new_img(zero.add(10 * i32)..zero.add(10 * i32 + 1), Lsn(i), 0);
+        let layer = PersistentLayerDesc::new_img(
+            TenantShardId::unsharded(TenantId::generate()),
+            TimelineId::generate(),
+            zero.add(10 * i32)..zero.add(10 * i32 + 1),
+            Lsn(i),
+            0,
+        );
        updates.insert_historic(layer);
    }
    updates.flush();
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -310,8 +310,8 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
                .unwrap()
                .as_micros(),
            partition,
-            candidate.timeline.tenant_shard_id,
-            candidate.timeline.timeline_id,
+            desc.tenant_shard_id,
+            desc.timeline_id,
            candidate.layer,
        );
    }
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -709,26 +709,6 @@ async fn tenant_detach_handler(
    json_response(StatusCode::OK, ())
 }

-async fn tenant_reset_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-    let state = get_state(&request);
-    state
-        .tenant_manager
-        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn tenant_load_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -844,7 +824,7 @@ async fn tenant_delete_handler(
    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_shard_id)
        .instrument(info_span!("tenant_delete_handler",
            tenant_id = %tenant_shard_id.tenant_id,
-            shard = %tenant_shard_id.shard_slug()
+            shard = tenant_shard_id.shard_slug()
        ))
        .await?;

@@ -1193,7 +1173,7 @@ async fn put_tenant_location_config_handler(
            mgr::detach_tenant(conf, tenant_shard_id, true, &state.deletion_queue_client)
                .instrument(info_span!("tenant_detach",
                    tenant_id = %tenant_shard_id.tenant_id,
-                    shard = %tenant_shard_id.shard_slug()
+                    shard = tenant_shard_id.shard_slug()
                ))
                .await
        {
@@ -1848,9 +1828,6 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/detach", |r| {
            api_handler(r, tenant_detach_handler)
        })
-        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
-            api_handler(r, tenant_reset_handler)
-        })
        .post("/v1/tenant/:tenant_id/load", |r| {
            api_handler(r, tenant_load_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -205,7 +205,7 @@ async fn timed<Fut: std::future::Future>(
    match tokio::time::timeout(warn_at, &mut fut).await {
        Ok(ret) => {
            tracing::info!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed"
            );
@@ -213,7 +213,7 @@ async fn timed<Fut: std::future::Future>(
        }
        Err(_) => {
            tracing::info!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "still waiting, taking longer than expected..."
            );
@@ -222,7 +222,7 @@ async fn timed<Fut: std::future::Future>(

            // this has a global allowed_errors
            tracing::warn!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed, took longer than expected"
            );
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1167,6 +1167,30 @@ pub(crate) static DELETION_QUEUE: Lazy<DeletionQueueMetrics> = Lazy::new(|| {
 }
 });

+pub(crate) struct WalIngestMetrics {
+    pub(crate) records_received: IntCounter,
+    pub(crate) records_committed: IntCounter,
+    pub(crate) records_filtered: IntCounter,
+}
+
+pub(crate) static WAL_INGEST: Lazy<WalIngestMetrics> = Lazy::new(|| WalIngestMetrics {
+    records_received: register_int_counter!(
+        "pageserver_wal_ingest_records_received",
+        "Number of WAL records received from safekeeper"
+    )
+    .expect("failed to define a metric"),
+    records_committed: register_int_counter!(
+        "pageserver_wal_ingest_records_committed",
+        "Number of WAL records which resulted in writes to pageserver storage"
+    )
+    .expect("failed to define a metric"),
+    records_filtered: register_int_counter!(
+        "pageserver_wal_ingest_records_filtered",
+        "Number of WAL records filtered out due to sharding"
+    )
+    .expect("failed to define a metric"),
+});
+
 #[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
 pub enum RemoteOpKind {
    Upload,
@@ -2094,8 +2118,6 @@ pub fn preinitialize_metrics() {
    // Tenant manager stats
    Lazy::force(&TENANT_MANAGER);

-    Lazy::force(&crate::tenant::storage_layer::layer::LAYER_IMPL_METRICS);
-
    // countervecs
    [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT]
        .into_iter()
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -67,9 +67,9 @@ use crate::trace::Tracer;
 use postgres_ffi::pg_constants::DEFAULTTABLESPACE_OID;
 use postgres_ffi::BLCKSZ;

-// How long we may wait for a [`TenantSlot::InProgress`]` and/or a [`Tenant`] which
+// How long we may block waiting for a [`TenantSlot::InProgress`] and/or a [`Tenant`] which
 // is not yet in state [`TenantState::Active`].
-const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(30000);
+const ACTIVE_TENANT_TIMEOUT: Duration = Duration::from_millis(5000);

 /// Read the end of a tar archive.
 ///
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -1368,6 +1368,10 @@ impl<'a> DatadirModification<'a> {
        Ok(())
    }

+    pub(crate) fn is_empty(&self) -> bool {
+        self.pending_updates.is_empty() && self.pending_deletions.is_empty()
+    }
+
    // Internal helper functions to batch the modifications

    async fn get(&self, key: Key, ctx: &RequestContext) -> Result<Bytes, PageReconstructError> {
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2515,7 +2515,7 @@ impl Tenant {
            }
        }

-        debug!("persisting tenantconf to {config_path}");
+        info!("persisting tenantconf to {config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
@@ -2550,7 +2550,7 @@ impl Tenant {
        target_config_path: &Utf8Path,
        tenant_conf: &TenantConfOpt,
    ) -> anyhow::Result<()> {
-        debug!("persisting tenantconf to {target_config_path}");
+        info!("persisting tenantconf to {target_config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -270,6 +270,49 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

+/// Create a directory, including any missing parents.  This does no fsyncs and makes no
+/// guarantees about the persistence of the resulting metadata: only use it for directories
+/// that may safely be recreated, such as caches.  Concurrent creation of the same path is fine.
+async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
+    let mut dirs_to_create = Vec::new();
+    let mut path: &Utf8Path = path.as_ref();
+
+    // Walk up from the leaf, collecting components until we find an existing directory.
+    loop {
+        match tokio::fs::metadata(path).await {
+            Ok(metadata) if metadata.is_dir() => break,
+            Ok(_) => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::AlreadyExists,
+                    format!("non-directory found in path: {path}"),
+                ));
+            }
+            Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => return Err(e),
+        }
+        dirs_to_create.push(path);
+
+        let Some(parent) = path.parent() else {
+            return Err(std::io::Error::new(
+                std::io::ErrorKind::InvalidInput,
+                format!("can't find parent of path '{path}'"),
+            ));
+        };
+        path = parent;
+    }
+
+    // Create directories from parent to child.  Like `create_dir_all`, tolerate a concurrent
+    // creator winning the race: only fail if the path still is not a directory afterwards.
+    for &dir in dirs_to_create.iter().rev() {
+        if let Err(e) = tokio::fs::create_dir(dir).await {
+            if !matches!(tokio::fs::metadata(dir).await, Ok(m) if m.is_dir()) {
+                return Err(e);
+            }
+        }
+    }
+
+    Ok(())
+}
+
 /// The TenantManager is responsible for storing and mutating the collection of all tenants
 /// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
 /// lives inside the TenantManager.
@@ -603,13 +646,7 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    info!(
-        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
-        generation = ?location_conf.location.generation,
-        attach_mode = ?location_conf.location.attach_mode,
-        "Attaching tenant"
-    );
+    info!("Attaching tenant {tenant_shard_id}");
    let tenant = match Tenant::spawn(
        conf,
        tenant_shard_id,
@@ -998,7 +1035,7 @@ impl TenantManager {
            LocationMode::Secondary(_) => {
                // Directory doesn't need to be fsync'd because if we crash it can
                // safely be recreated next time this tenant location is configured.
-                tokio::fs::create_dir_all(&tenant_path)
+                unsafe_create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {tenant_path}"))?;

@@ -1014,7 +1051,7 @@ impl TenantManager {
                // Directory doesn't need to be fsync'd because we do not depend on
                // it to exist after crashes: it may be recreated when tenant is
                // re-attached, see https://github.com/neondatabase/neon/issues/5550
-                tokio::fs::create_dir_all(&tenant_path)
+                unsafe_create_dir_all(&timelines_path)
                    .await
                    .with_context(|| format!("Creating {timelines_path}"))?;

@@ -1044,81 +1081,6 @@ impl TenantManager {

        Ok(())
    }
-
-    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
-    /// LocationConf that was last used to attach it.  Optionally, the local file cache may be
-    /// dropped before re-attaching.
-    ///
-    /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
-    /// where an issue is identified that would go away with a restart of the tenant.
-    ///
-    /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
-    /// to respect the cancellation tokens used in normal shutdown().
-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
-    pub(crate) async fn reset_tenant(
-        &self,
-        tenant_shard_id: TenantShardId,
-        drop_cache: bool,
-        ctx: RequestContext,
-    ) -> anyhow::Result<()> {
-        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        let Some(old_slot) = slot_guard.get_old_value() else {
-            anyhow::bail!("Tenant not found when trying to reset");
-        };
-
-        let Some(tenant) = old_slot.get_attached() else {
-            slot_guard.revert();
-            anyhow::bail!("Tenant is not in attached state");
-        };
-
-        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, false).await {
-            Ok(()) => {
-                slot_guard.drop_old_value()?;
-            }
-            Err(_barrier) => {
-                slot_guard.revert();
-                anyhow::bail!("Cannot reset Tenant, already shutting down");
-            }
-        }
-
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-
-        if drop_cache {
-            tracing::info!("Dropping local file cache");
-
-            match tokio::fs::read_dir(&timelines_path).await {
-                Err(e) => {
-                    tracing::warn!("Failed to list timelines while dropping cache: {}", e);
-                }
-                Ok(mut entries) => {
-                    while let Some(entry) = entries.next_entry().await? {
-                        tokio::fs::remove_dir_all(entry.path()).await?;
-                    }
-                }
-            }
-        }
-
-        let shard_identity = config.shard;
-        let tenant = tenant_spawn(
-            self.conf,
-            tenant_shard_id,
-            &tenant_path,
-            self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
-            shard_identity,
-            None,
-            self.tenants,
-            SpawnMode::Normal,
-            &ctx,
-        )?;
-
-        slot_guard.upsert(TenantSlot::Attached(tenant))?;
-
-        Ok(())
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1327,7 +1289,8 @@ pub(crate) async fn delete_tenant(
    // See https://github.com/neondatabase/neon/issues/5080

    // TODO(sharding): make delete API sharding-aware
-    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+    let mut slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

    // unwrap is safe because we used MustExist mode when acquiring
    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1654,10 +1617,9 @@ pub enum TenantSlotUpsertError {
    MapState(#[from] TenantMapError),
 }

-#[derive(Debug, thiserror::Error)]
+#[derive(Debug)]
 enum TenantSlotDropError {
    /// It is only legal to drop a TenantSlot if its contents are fully shut down
-    #[error("Tenant was not shut down")]
    NotShutdown,
 }

@@ -1717,9 +1679,9 @@ impl SlotGuard {
        }
    }

-    /// Get any value that was present in the slot before we acquired ownership
+    /// Borrow any value that was present in the slot before we acquired ownership
    /// of it: in state transitions, this will be the old state.
-    fn get_old_value(&self) -> &Option<TenantSlot> {
+    fn get_old_value(&mut self) -> &Option<TenantSlot> {
        &self.old_value
    }

@@ -1937,7 +1899,7 @@ fn tenant_map_acquire_slot_impl(
    METRICS.tenant_slot_writes.inc();

    let mut locked = tenants.write().unwrap();
-    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard = %tenant_shard_id.shard_slug());
+    let span = tracing::info_span!("acquire_slot", tenant_id=%tenant_shard_id.tenant_id, shard=tenant_shard_id.shard_slug());
    let _guard = span.enter();

    let m = match &mut *locked {
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -1271,12 +1271,11 @@ impl RemoteTimelineClient {

            let upload_result: anyhow::Result<()> = match &task.op {
                UploadOp::UploadLayer(ref layer, ref layer_metadata) => {
-                    let path = layer.local_path_from_id(&self.tenant_shard_id, &self.timeline_id);
-
+                    let path = layer.local_path();
                    upload::upload_timeline_layer(
                        self.conf,
                        &self.storage_impl,
-                        &path,
+                        path,
                        layer_metadata,
                        self.generation,
                    )
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -4,7 +4,7 @@ pub mod delta_layer;
 mod filename;
 pub mod image_layer;
 mod inmemory_layer;
-pub(crate) mod layer;
+mod layer;
 mod layer_desc;

 use crate::context::{AccessStatsBehavior, RequestContext};
@@ -24,7 +24,7 @@ use tracing::warn;
 use utils::history_buffer::HistoryBufferWithDropCounter;
 use utils::rate_limit::RateLimit;

-use utils::lsn::Lsn;
+use utils::{id::TimelineId, lsn::Lsn};

 pub use delta_layer::{DeltaLayer, DeltaLayerWriter, ValueRef};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
@@ -301,17 +301,31 @@ pub trait AsLayerDesc {
 }

 pub mod tests {
+    use pageserver_api::shard::TenantShardId;
+
    use super::*;

    impl From<DeltaFileName> for PersistentLayerDesc {
        fn from(value: DeltaFileName) -> Self {
-            PersistentLayerDesc::new_delta(value.key_range, value.lsn_range, 233)
+            PersistentLayerDesc::new_delta(
+                TenantShardId::from([0; 18]),
+                TimelineId::from_array([0; 16]),
+                value.key_range,
+                value.lsn_range,
+                233,
+            )
        }
    }

    impl From<ImageFileName> for PersistentLayerDesc {
        fn from(value: ImageFileName) -> Self {
-            PersistentLayerDesc::new_img(value.key_range, value.lsn, 233)
+            PersistentLayerDesc::new_img(
+                TenantShardId::from([0; 18]),
+                TimelineId::from_array([0; 16]),
+                value.key_range,
+                value.lsn,
+                233,
+            )
        }
    }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -84,6 +84,17 @@ pub struct Summary {
    pub index_root_blk: u32,
 }

+impl From<&DeltaLayer> for Summary {
+    fn from(layer: &DeltaLayer) -> Self {
+        Self::expected(
+            layer.desc.tenant_shard_id.tenant_id,
+            layer.desc.timeline_id,
+            layer.desc.key_range.clone(),
+            layer.desc.lsn_range.clone(),
+        )
+    }
+}
+
 impl Summary {
    pub(super) fn expected(
        tenant_id: TenantId,
@@ -309,9 +320,15 @@ impl DeltaLayer {
            .metadata()
            .context("get file metadata to determine size")?;

+        // TODO(sharding): we must get the TenantShardId from the path instead of reading the Summary.
+        // we should also validate the path against the Summary, as both should contain the same tenant, timeline, key, lsn.
+        let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
+
        Ok(DeltaLayer {
            path: path.to_path_buf(),
            desc: PersistentLayerDesc::new_delta(
+                tenant_shard_id,
+                summary.timeline_id,
                summary.key_range,
                summary.lsn_range,
                metadata.len(),
@@ -488,6 +505,8 @@ impl DeltaLayerWriterInner {
        // set inner.file here. The first read will have to re-open it.

        let desc = PersistentLayerDesc::new_delta(
+            self.tenant_shard_id,
+            self.timeline_id,
            self.key_start..key_end,
            self.lsn_range.clone(),
            metadata.len(),
@@ -498,7 +517,7 @@ impl DeltaLayerWriterInner {

        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        trace!("created delta layer {}", self.path);
+        trace!("created delta layer {}", layer.local_path());

        Ok(layer)
    }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -85,6 +85,17 @@ pub struct Summary {
    // the 'values' part starts after the summary header, on block 1.
 }

+impl From<&ImageLayer> for Summary {
+    fn from(layer: &ImageLayer) -> Self {
+        Self::expected(
+            layer.desc.tenant_shard_id.tenant_id,
+            layer.desc.timeline_id,
+            layer.desc.key_range.clone(),
+            layer.lsn,
+        )
+    }
+}
+
 impl Summary {
    pub(super) fn expected(
        tenant_id: TenantId,
@@ -267,9 +278,19 @@ impl ImageLayer {
            .metadata()
            .context("get file metadata to determine size")?;

+        // TODO(sharding): we should get TenantShardId from path.
+        // OR, not at all: any layer we load from disk should also get reconciled with remote IndexPart.
+        let tenant_shard_id = TenantShardId::unsharded(summary.tenant_id);
+
        Ok(ImageLayer {
            path: path.to_path_buf(),
-            desc: PersistentLayerDesc::new_img(summary.key_range, summary.lsn, metadata.len()), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
+            desc: PersistentLayerDesc::new_img(
+                tenant_shard_id,
+                summary.timeline_id,
+                summary.key_range,
+                summary.lsn,
+                metadata.len(),
+            ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
            lsn: summary.lsn,
            access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
            inner: OnceCell::new(),
@@ -560,7 +581,13 @@ impl ImageLayerWriterInner {
            .await
            .context("get metadata to determine file size")?;

-        let desc = PersistentLayerDesc::new_img(self.key_range.clone(), self.lsn, metadata.len());
+        let desc = PersistentLayerDesc::new_img(
+            self.tenant_shard_id,
+            self.timeline_id,
+            self.key_range.clone(),
+            self.lsn,
+            metadata.len(),
+        );

        // Note: Because we open the file in write-only mode, we cannot
        // reuse the same VirtualFile for reading later. That's why we don't
@@ -572,7 +599,7 @@ impl ImageLayerWriterInner {
        // FIXME: why not carry the virtualfile here, it supports renaming?
        let layer = Layer::finish_creating(self.conf, timeline, desc, &self.path)?;

-        trace!("created image layer {}", self.path);
+        trace!("created image layer {}", layer.local_path());

        Ok(layer)
    }
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -3,15 +3,13 @@ use camino::{Utf8Path, Utf8PathBuf};
 use pageserver_api::models::{
    HistoricLayerInfo, LayerAccessKind, LayerResidenceEventReason, LayerResidenceStatus,
 };
-use pageserver_api::shard::{ShardIndex, TenantShardId};
+use pageserver_api::shard::ShardIndex;
 use std::ops::Range;
 use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering};
 use std::sync::{Arc, Weak};
 use std::time::SystemTime;
 use tracing::Instrument;
-use utils::id::TimelineId;
 use utils::lsn::Lsn;
-use utils::sync::gate::GateError;
 use utils::sync::heavier_once_cell;

 use crate::config::PageServerConf;
@@ -83,7 +81,12 @@ impl Layer {
        file_name: LayerFileName,
        metadata: LayerFileMetadata,
    ) -> Self {
-        let desc = PersistentLayerDesc::from_filename(file_name, metadata.file_size());
+        let desc = PersistentLayerDesc::from_filename(
+            timeline.tenant_shard_id,
+            timeline.timeline_id,
+            file_name,
+            metadata.file_size(),
+        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Evicted);

@@ -97,7 +100,7 @@ impl Layer {
            metadata.shard,
        )));

-        debug_assert!(owner.0.needs_download_blocking(timeline).unwrap().is_some());
+        debug_assert!(owner.0.needs_download_blocking().unwrap().is_some());

        owner
    }
@@ -109,7 +112,12 @@ impl Layer {
        file_name: LayerFileName,
        metadata: LayerFileMetadata,
    ) -> ResidentLayer {
-        let desc = PersistentLayerDesc::from_filename(file_name, metadata.file_size());
+        let desc = PersistentLayerDesc::from_filename(
+            timeline.tenant_shard_id,
+            timeline.timeline_id,
+            file_name,
+            metadata.file_size(),
+        );

        let access_stats = LayerAccessStats::for_loading_layer(LayerResidenceStatus::Resident);

@@ -136,7 +144,7 @@ impl Layer {

        let downloaded = resident.expect("just initialized");

-        debug_assert!(owner.0.needs_download_blocking(timeline).unwrap().is_none());
+        debug_assert!(owner.0.needs_download_blocking().unwrap().is_none());

        timeline
            .metrics
@@ -181,7 +189,7 @@ impl Layer {
        let downloaded = resident.expect("just initialized");

        // if the rename works, the path is as expected
-        std::fs::rename(temp_path, owner.local_path(timeline))
+        std::fs::rename(temp_path, owner.local_path())
            .with_context(|| format!("rename temporary file as correct path for {owner}"))?;

        Ok(ResidentLayer { downloaded, owner })
@@ -214,8 +222,8 @@ impl Layer {
    ///
    /// [gc]: [`RemoteTimelineClient::schedule_gc_update`]
    /// [compaction]: [`RemoteTimelineClient::schedule_compaction_update`]
-    pub(crate) fn delete_on_drop(&self) {
-        self.0.delete_on_drop();
+    pub(crate) fn garbage_collect_on_drop(&self) {
+        self.0.garbage_collect_on_drop();
    }

    /// Return data needed to reconstruct given page at LSN.
@@ -301,12 +309,8 @@ impl Layer {
        &self.0.access_stats
    }

-    fn local_path(&self, timeline: &Timeline) -> Utf8PathBuf {
-        self.0.local_path(timeline)
-    }
-
-    pub(crate) fn filename(&self) -> LayerFileName {
-        self.0.desc.filename()
+    pub(crate) fn local_path(&self) -> &Utf8Path {
+        &self.0.path
    }

    pub(crate) fn metadata(&self) -> LayerFileMetadata {
@@ -327,10 +331,10 @@ impl Layer {
        Ok(())
    }

-    /// Waits until this layer has been dropped (and if needed, local file deletion and remote
+    /// Waits until this layer has been dropped (and if needed, local garbage collection and remote
    /// deletion scheduling has completed).
    ///
-    /// Does not start local deletion, use [`Self::delete_on_drop`] for that
+    /// Does not start garbage collection, use [`Self::garbage_collect_on_drop`] for that
    /// separatedly.
    #[cfg(feature = "testing")]
    pub(crate) fn wait_drop(&self) -> impl std::future::Future<Output = ()> + 'static {
@@ -398,9 +402,13 @@ impl ResidentOrWantedEvicted {
 }

 struct LayerInner {
-    /// Only needed to check ondemand_download_behavior_treat_error_as_warn and in [`Self::local_path_from_id`]
+    /// Only needed to check ondemand_download_behavior_treat_error_as_warn and to construct
+    /// [`Self::path`].
    conf: &'static PageServerConf,

+    /// Full path to the file; unclear if this should exist anymore.
+    path: Utf8PathBuf,
+
    desc: PersistentLayerDesc,

    /// Timeline access is needed for remote timeline client and metrics.
@@ -415,8 +423,8 @@ struct LayerInner {
    /// Initialization and deinitialization are done while holding a permit.
    inner: heavier_once_cell::OnceCell<ResidentOrWantedEvicted>,

-    /// Do we want to delete locally and remotely this when `LayerInner` is dropped
-    wanted_deleted: AtomicBool,
+    /// Do we want to garbage collect this when `LayerInner` is dropped
+    wanted_garbage_collected: AtomicBool,

    /// Do we want to evict this layer as soon as possible? After being set to `true`, all accesses
    /// will try to downgrade [`ResidentOrWantedEvicted`], which will eventually trigger
@@ -430,6 +438,10 @@ struct LayerInner {
    version: AtomicUsize,

    /// Allow subscribing to when the layer actually gets evicted.
+    ///
+    /// If in the future we need to implement "wait until layer instances are gone and done",
+    /// carrying this over to the gc spawn_blocking from LayerInner::drop will do the trick, along
+    /// with adding a method for "wait_gc" which waits for this to be closed.
    status: tokio::sync::broadcast::Sender<Status>,

    /// Counter for exponential backoff with the download
@@ -471,39 +483,19 @@ enum Status {

 impl Drop for LayerInner {
    fn drop(&mut self) {
-        if !*self.wanted_deleted.get_mut() {
+        if !*self.wanted_garbage_collected.get_mut() {
            // should we try to evict if the last wish was for eviction?
            // feels like there's some hazard of overcrowding near shutdown near by, but we don't
            // run drops during shutdown (yet)
            return;
        }

-        // We will only do I/O on drop if our Timeline still exists.  Otherwise, we may safely
-        // leave garbage layers behind to be cleaned up the next time this Timeline is instantiated.
-        let Some(timeline) = self.timeline.upgrade() else {
-            // no need to nag that timeline is gone: under normal situation on
-            // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
-            LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
-            return;
-        };
-
-        // We will only do I/O during drop if our Timeline's layer_gate is open: this avoids
-        // the risk that we would race with Timeline::shutdown and end up doing I/O to a timeline
-        // path for which the Timeline object has been torn down already.
-        let _gate_guard = match timeline.layer_gate.enter() {
-            Ok(g) => g,
-            Err(GateError::GateClosed) => {
-                LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::TimelineGone);
-                return;
-            }
-        };
-
-        // If timeline is alive, we can construct a span with IDs for this function.
-        let span = tracing::info_span!(parent: None, "layer_delete", tenant_id = %timeline.tenant_shard_id.tenant_id, shard_id=%timeline.tenant_shard_id.shard_slug(), timeline_id = %timeline.timeline_id);
-        let path = self.local_path(&timeline);
+        let span = tracing::info_span!(parent: None, "layer_gc", tenant_id = %self.layer_desc().tenant_shard_id.tenant_id, shard_id=%self.layer_desc().tenant_shard_id.shard_slug(), timeline_id = %self.layer_desc().timeline_id);

+        let path = std::mem::take(&mut self.path);
        let file_name = self.layer_desc().filename();
        let file_size = self.layer_desc().file_size;
+        let timeline = self.timeline.clone();
        let meta = self.metadata();
        let status = self.status.clone();

@@ -525,32 +517,38 @@ impl Drop for LayerInner {
                    false
                }
                Err(e) => {
-                    tracing::error!("failed to remove wanted deleted layer: {e}");
-                    LAYER_IMPL_METRICS.inc_delete_removes_failed();
+                    tracing::error!("failed to remove garbage collected layer: {e}");
+                    LAYER_IMPL_METRICS.inc_gc_removes_failed();
                    false
                }
            };

-            if removed {
-                timeline.metrics.resident_physical_size_sub(file_size);
-            }
-            if let Some(remote_client) = timeline.remote_client.as_ref() {
-                let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
-
-                if let Err(e) = res {
-                    // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
-                    // demonstrating this deadlock (without spawn_blocking): stop will drop
-                    // queued items, which will have ResidentLayer's, and those drops would try
-                    // to re-entrantly lock the RemoteTimelineClient inner state.
-                    if !timeline.is_active() {
-                        tracing::info!("scheduling deletion on drop failed: {e:#}");
-                    } else {
-                        tracing::warn!("scheduling deletion on drop failed: {e:#}");
-                    }
-                    LAYER_IMPL_METRICS.inc_deletes_failed(DeleteFailed::DeleteSchedulingFailed);
-                } else {
-                    LAYER_IMPL_METRICS.inc_completed_deletes();
+            if let Some(timeline) = timeline.upgrade() {
+                if removed {
+                    timeline.metrics.resident_physical_size_sub(file_size);
                }
+                if let Some(remote_client) = timeline.remote_client.as_ref() {
+                    let res = remote_client.schedule_deletion_of_unlinked(vec![(file_name, meta)]);
+
+                    if let Err(e) = res {
+                        // test_timeline_deletion_with_files_stuck_in_upload_queue is good at
+                        // demonstrating this deadlock (without spawn_blocking): stop will drop
+                        // queued items, which will have ResidentLayer's, and those drops would try
+                        // to re-entrantly lock the RemoteTimelineClient inner state.
+                        if !timeline.is_active() {
+                            tracing::info!("scheduling deletion on drop failed: {e:#}");
+                        } else {
+                            tracing::warn!("scheduling deletion on drop failed: {e:#}");
+                        }
+                        LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::DeleteSchedulingFailed);
+                    } else {
+                        LAYER_IMPL_METRICS.inc_completed_gcs();
+                    }
+                }
+            } else {
+                // no need to nag that timeline is gone: under normal situation on
+                // task_mgr::remove_tenant_from_memory the timeline is gone before we get dropped.
+                LAYER_IMPL_METRICS.inc_gcs_failed(GcFailed::TimelineGone);
            }
        });
    }
@@ -566,6 +564,10 @@ impl LayerInner {
        generation: Generation,
        shard: ShardIndex,
    ) -> Self {
+        let path = conf
+            .timeline_path(&timeline.tenant_shard_id, &timeline.timeline_id)
+            .join(desc.filename().to_string());
+
        let (inner, version) = if let Some(inner) = downloaded {
            let version = inner.version;
            let resident = ResidentOrWantedEvicted::Resident(inner);
@@ -576,11 +578,12 @@ impl LayerInner {

        LayerInner {
            conf,
+            path,
            desc,
            timeline: Arc::downgrade(timeline),
            have_remote_client: timeline.remote_client.is_some(),
            access_stats,
-            wanted_deleted: AtomicBool::new(false),
+            wanted_garbage_collected: AtomicBool::new(false),
            wanted_evicted: AtomicBool::new(false),
            inner,
            version: AtomicUsize::new(version),
@@ -591,32 +594,16 @@ impl LayerInner {
        }
    }

-    /// All call sites that need this function should already have a Timeline (e.g. from
-    /// upgrading the Self::timeline weak pointer) -- it doesn't make sense to try and
-    /// do anything with the local file if the Timeline isn't still alive.
-    fn local_path(&self, timeline: &Timeline) -> Utf8PathBuf {
-        self.local_path_from_id(&timeline.tenant_shard_id, &timeline.timeline_id)
-    }
-
-    /// Use this instead of `local_path` if you don't have a Timeline but do have its ID: this
-    /// is used by external callers such as [`crate::tenant::RemoteTimelineClient`]
-    pub(crate) fn local_path_from_id(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-    ) -> Utf8PathBuf {
-        self.conf
-            .timeline_path(tenant_shard_id, timeline_id)
-            .join(self.desc.filename().to_string())
-    }
-
-    fn delete_on_drop(&self) {
-        let res =
-            self.wanted_deleted
-                .compare_exchange(false, true, Ordering::Release, Ordering::Relaxed);
+    fn garbage_collect_on_drop(&self) {
+        let res = self.wanted_garbage_collected.compare_exchange(
+            false,
+            true,
+            Ordering::Release,
+            Ordering::Relaxed,
+        );

        if res.is_ok() {
-            LAYER_IMPL_METRICS.inc_started_deletes();
+            LAYER_IMPL_METRICS.inc_started_gcs();
        }
    }

@@ -684,10 +671,6 @@ impl LayerInner {
                // disable any scheduled but not yet running eviction deletions for this
                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);

-                // count cancellations, which currently remain largely unexpected
-                let init_cancelled =
-                    scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
-
                // no need to make the evict_and_wait wait for the actual download to complete
                drop(self.status.send(Status::Downloaded));

@@ -696,14 +679,12 @@ impl LayerInner {
                    .upgrade()
                    .ok_or_else(|| DownloadError::TimelineShutdown)?;

-                // FIXME: grab a gate
-
                let can_ever_evict = timeline.remote_client.as_ref().is_some();

                // check if we really need to be downloaded; could have been already downloaded by a
                // cancelled previous attempt.
                let needs_download = self
-                    .needs_download(&timeline)
+                    .needs_download()
                    .await
                    .map_err(DownloadError::PreStatFailed)?;

@@ -758,8 +739,6 @@ impl LayerInner {
                    tracing::info!(waiters, "completing the on-demand download for other tasks");
                }

-                scopeguard::ScopeGuard::into_inner(init_cancelled);
-
                Ok((ResidentOrWantedEvicted::Resident(res), permit))
            };

@@ -853,13 +832,12 @@ impl LayerInner {
        // block tenant::mgr::remove_tenant_from_memory.

        let this: Arc<Self> = self.clone();
-        let timeline_clone = timeline.clone();

        crate::task_mgr::spawn(
            &tokio::runtime::Handle::current(),
            crate::task_mgr::TaskKind::RemoteDownloadTask,
-            Some(timeline.tenant_shard_id.tenant_id),
-            Some(timeline.timeline_id),
+            Some(self.desc.tenant_shard_id.tenant_id),
+            Some(self.desc.timeline_id),
            &task_name,
            false,
            async move {
@@ -889,13 +867,14 @@ impl LayerInner {
                    match res {
                        (Ok(()), _) => {
                            // our caller is cancellation safe so this is fine; if someone
-                            // else requests the layer, they'll find it already downloaded.
+                            // else requests the layer, they'll find it already downloaded
+                            // or redownload.
                            //
-                            // See counter [`LayerImplMetrics::inc_init_needed_no_download`]
-                            //
-                            // FIXME(#6028): however, could be that we should consider marking the
-                            // layer for eviction? alas, cannot: because only DownloadedLayer will
-                            // handle that.
+                            // however, could be that we should consider marking the layer
+                            // for eviction? alas, cannot: because only DownloadedLayer
+                            // will handle that.
+                            tracing::info!("layer file download completed after requester had cancelled");
+                            LAYER_IMPL_METRICS.inc_download_completed_without_requester();
                        },
                        (Err(e), _) => {
                            // our caller is cancellation safe, but we might be racing with
@@ -915,7 +894,7 @@ impl LayerInner {
        match rx.await {
            Ok((Ok(()), permit)) => {
                if let Some(reason) = self
-                    .needs_download(&timeline_clone)
+                    .needs_download()
                    .await
                    .map_err(DownloadError::PostStatFailed)?
                {
@@ -950,26 +929,16 @@ impl LayerInner {
        }
    }

-    async fn needs_download(
-        &self,
-        timeline: &Timeline,
-    ) -> Result<Option<NeedsDownload>, std::io::Error> {
-        let path = self.local_path(timeline);
-
-        match tokio::fs::metadata(path).await {
+    async fn needs_download(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
+        match tokio::fs::metadata(&self.path).await {
            Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
            Err(e) => Err(e),
        }
    }

-    fn needs_download_blocking(
-        &self,
-        timeline: &Timeline,
-    ) -> Result<Option<NeedsDownload>, std::io::Error> {
-        let path = self.local_path(timeline);
-
-        match path.metadata() {
+    fn needs_download_blocking(&self) -> Result<Option<NeedsDownload>, std::io::Error> {
+        match self.path.metadata() {
            Ok(m) => Ok(self.is_file_present_and_good_size(&m).err()),
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => Ok(Some(NeedsDownload::NotFound)),
            Err(e) => Err(e),
@@ -1025,20 +994,14 @@ impl LayerInner {

    /// `DownloadedLayer` is being dropped, so it calls this method.
    fn on_downloaded_layer_drop(self: Arc<LayerInner>, version: usize) {
-        let delete = self.wanted_deleted.load(Ordering::Acquire);
+        let gc = self.wanted_garbage_collected.load(Ordering::Acquire);
        let evict = self.wanted_evicted.load(Ordering::Acquire);
        let can_evict = self.have_remote_client;

-        if delete {
-            // do nothing now, only in LayerInner::drop -- this was originally implemented because
-            // we could had already scheduled the deletion at the time.
-            //
-            // FIXME: this is not true anymore, we can safely evict wanted deleted files.
+        if gc {
+            // do nothing now, only in LayerInner::drop
        } else if can_evict && evict {
-            // If timeline is alive, we can construct a span with IDs for this function.
-            let span = self.timeline.upgrade().map(|timeline| {
-                tracing::info_span!(parent: None, "layer_evict", tenant_id = %timeline.tenant_shard_id.tenant_id, shard_id=%timeline.tenant_shard_id.shard_slug(), timeline_id = %timeline.timeline_id)
-            });
+            let span = tracing::info_span!(parent: None, "layer_evict", tenant_id = %self.desc.tenant_shard_id.tenant_id, shard_id = %self.desc.tenant_shard_id.shard_slug(), timeline_id = %self.desc.timeline_id, layer=%self, %version);

            // downgrade for queueing, in case there's a tear down already ongoing we should not
            // hold it alive.
@@ -1049,9 +1012,9 @@ impl LayerInner {
            // drop while the `self.inner` is being locked, leading to a deadlock.

            crate::task_mgr::BACKGROUND_RUNTIME.spawn_blocking(move || {
-                let _g = span.map(|s| s.entered());
+                let _g = span.entered();

-                // if LayerInner is already dropped here, do nothing because the delete on drop
+                // if LayerInner is already dropped here, do nothing because the garbage collection
                // has already ran while we were in queue
                let Some(this) = this.upgrade() else {
                    LAYER_IMPL_METRICS.inc_eviction_cancelled(EvictionCancelled::LayerGone);
@@ -1109,9 +1072,7 @@ impl LayerInner {
            LayerResidenceEventReason::ResidenceChange,
        );

-        let local_path = self.local_path(&timeline);
-
-        let res = match capture_mtime_and_remove(&local_path) {
+        let res = match capture_mtime_and_remove(&self.path) {
            Ok(local_layer_mtime) => {
                let duration = SystemTime::now().duration_since(local_layer_mtime);
                match duration {
@@ -1263,11 +1224,6 @@ impl DownloadedLayer {
        owner: &Arc<LayerInner>,
        ctx: &RequestContext,
    ) -> anyhow::Result<&'a LayerKind> {
-        let timeline = owner
-            .timeline
-            .upgrade()
-            .ok_or(DownloadError::TimelineShutdown)?;
-
        let init = || async {
            assert_eq!(
                Weak::as_ptr(&self.owner),
@@ -1277,23 +1233,23 @@ impl DownloadedLayer {

            let res = if owner.desc.is_delta {
                let summary = Some(delta_layer::Summary::expected(
-                    timeline.tenant_shard_id.tenant_id,
-                    timeline.timeline_id,
+                    owner.desc.tenant_shard_id.tenant_id,
+                    owner.desc.timeline_id,
                    owner.desc.key_range.clone(),
                    owner.desc.lsn_range.clone(),
                ));
-                delta_layer::DeltaLayerInner::load(&owner.local_path(&timeline), summary, ctx)
+                delta_layer::DeltaLayerInner::load(&owner.path, summary, ctx)
                    .await
                    .map(|res| res.map(LayerKind::Delta))
            } else {
                let lsn = owner.desc.image_layer_lsn();
                let summary = Some(image_layer::Summary::expected(
-                    timeline.tenant_shard_id.tenant_id,
-                    timeline.timeline_id,
+                    owner.desc.tenant_shard_id.tenant_id,
+                    owner.desc.timeline_id,
                    owner.desc.key_range.clone(),
                    lsn,
                ));
-                image_layer::ImageLayerInner::load(&owner.local_path(&timeline), lsn, summary, ctx)
+                image_layer::ImageLayerInner::load(&owner.path, lsn, summary, ctx)
                    .await
                    .map(|res| res.map(LayerKind::Image))
            };
@@ -1417,14 +1373,8 @@ impl ResidentLayer {
        }
    }

-    pub(crate) fn local_path_from_id(
-        &self,
-        tenant_shard_id: &TenantShardId,
-        timeline_id: &TimelineId,
-    ) -> Utf8PathBuf {
-        self.owner
-            .0
-            .local_path_from_id(tenant_shard_id, timeline_id)
+    pub(crate) fn local_path(&self) -> &Utf8Path {
+        &self.owner.0.path
    }

    pub(crate) fn access_stats(&self) -> &LayerAccessStats {
@@ -1455,38 +1405,36 @@ impl From<ResidentLayer> for Layer {
    }
 }

-use metrics::IntCounter;
+use metrics::{IntCounter, IntCounterVec};

-pub(crate) struct LayerImplMetrics {
+struct LayerImplMetrics {
    started_evictions: IntCounter,
    completed_evictions: IntCounter,
-    cancelled_evictions: enum_map::EnumMap<EvictionCancelled, IntCounter>,
+    cancelled_evictions: IntCounterVec,

-    started_deletes: IntCounter,
-    completed_deletes: IntCounter,
-    failed_deletes: enum_map::EnumMap<DeleteFailed, IntCounter>,
+    started_gcs: IntCounter,
+    completed_gcs: IntCounter,
+    failed_gcs: IntCounterVec,

-    rare_counters: enum_map::EnumMap<RareEvent, IntCounter>,
-    inits_cancelled: metrics::core::GenericCounter<metrics::core::AtomicU64>,
+    rare_counters: IntCounterVec,
 }

 impl Default for LayerImplMetrics {
    fn default() -> Self {
-        use enum_map::Enum;
-
-        // reminder: these will be pageserver_layer_* with "_total" suffix
-
-        let started_evictions = metrics::register_int_counter!(
-            "pageserver_layer_started_evictions",
-            "Evictions started in the Layer implementation"
-        )
-        .unwrap();
-        let completed_evictions = metrics::register_int_counter!(
-            "pageserver_layer_completed_evictions",
-            "Evictions completed in the Layer implementation"
+        let evictions = metrics::register_int_counter_vec!(
+            "pageserver_layer_evictions_count",
+            "Evictions started and completed in the Layer implementation",
+            &["state"]
        )
        .unwrap();

+        let started_evictions = evictions
+            .get_metric_with_label_values(&["started"])
+            .unwrap();
+        let completed_evictions = evictions
+            .get_metric_with_label_values(&["completed"])
+            .unwrap();
+
        let cancelled_evictions = metrics::register_int_counter_vec!(
            "pageserver_layer_cancelled_evictions_count",
            "Different reasons for evictions to have been cancelled or failed",
@@ -1494,36 +1442,24 @@ impl Default for LayerImplMetrics {
        )
        .unwrap();

-        let cancelled_evictions = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-            let reason = EvictionCancelled::from_usize(i);
-            let s = reason.as_str();
-            cancelled_evictions.with_label_values(&[s])
-        }));
-
-        let started_deletes = metrics::register_int_counter!(
-            "pageserver_layer_started_deletes",
-            "Deletions on drop pending in the Layer implementation"
-        )
-        .unwrap();
-        let completed_deletes = metrics::register_int_counter!(
-            "pageserver_layer_completed_deletes",
-            "Deletions on drop completed in the Layer implementation"
+        // reminder: this will be pageserver_layer_gcs_count_total with "_total" suffix
+        let gcs = metrics::register_int_counter_vec!(
+            "pageserver_layer_gcs_count",
+            "Garbage collections started and completed in the Layer implementation",
+            &["state"]
        )
        .unwrap();

-        let failed_deletes = metrics::register_int_counter_vec!(
-            "pageserver_layer_failed_deletes_count",
-            "Different reasons for deletions on drop to have failed",
+        let started_gcs = gcs.get_metric_with_label_values(&["pending"]).unwrap();
+        let completed_gcs = gcs.get_metric_with_label_values(&["completed"]).unwrap();
+
+        let failed_gcs = metrics::register_int_counter_vec!(
+            "pageserver_layer_failed_gcs_count",
+            "Different reasons for garbage collections to have failed",
            &["reason"]
        )
        .unwrap();

-        let failed_deletes = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-            let reason = DeleteFailed::from_usize(i);
-            let s = reason.as_str();
-            failed_deletes.with_label_values(&[s])
-        }));
-
        let rare_counters = metrics::register_int_counter_vec!(
            "pageserver_layer_assumed_rare_count",
            "Times unexpected or assumed rare event happened",
@@ -1531,29 +1467,16 @@ impl Default for LayerImplMetrics {
        )
        .unwrap();

-        let rare_counters = enum_map::EnumMap::from_array(std::array::from_fn(|i| {
-            let event = RareEvent::from_usize(i);
-            let s = event.as_str();
-            rare_counters.with_label_values(&[s])
-        }));
-
-        let inits_cancelled = metrics::register_int_counter!(
-            "pageserver_layer_inits_cancelled_count",
-            "Times Layer initialization was cancelled",
-        )
-        .unwrap();
-
        Self {
            started_evictions,
            completed_evictions,
            cancelled_evictions,

-            started_deletes,
-            completed_deletes,
-            failed_deletes,
+            started_gcs,
+            completed_gcs,
+            failed_gcs,

            rare_counters,
-            inits_cancelled,
        }
    }
 }
@@ -1566,33 +1489,57 @@ impl LayerImplMetrics {
        self.completed_evictions.inc();
    }
    fn inc_eviction_cancelled(&self, reason: EvictionCancelled) {
-        self.cancelled_evictions[reason].inc()
+        self.cancelled_evictions
+            .get_metric_with_label_values(&[reason.as_str()])
+            .unwrap()
+            .inc()
    }

-    fn inc_started_deletes(&self) {
-        self.started_deletes.inc();
+    fn inc_started_gcs(&self) {
+        self.started_gcs.inc();
    }
-    fn inc_completed_deletes(&self) {
-        self.completed_deletes.inc();
+    fn inc_completed_gcs(&self) {
+        self.completed_gcs.inc();
    }
-    fn inc_deletes_failed(&self, reason: DeleteFailed) {
-        self.failed_deletes[reason].inc();
+    fn inc_gcs_failed(&self, reason: GcFailed) {
+        self.failed_gcs
+            .get_metric_with_label_values(&[reason.as_str()])
+            .unwrap()
+            .inc();
    }

-    /// Counted separatedly from failed layer deletes because we will complete the layer deletion
-    /// attempt regardless of failure to delete local file.
-    fn inc_delete_removes_failed(&self) {
-        self.rare_counters[RareEvent::RemoveOnDropFailed].inc();
+    /// Counted separately from failed gcs because we will complete the gc attempt regardless of
+    /// failure to delete local file.
+    fn inc_gc_removes_failed(&self) {
+        self.rare_counters
+            .get_metric_with_label_values(&["gc_remove_failed"])
+            .unwrap()
+            .inc();
    }

-    /// Expected rare because requires a race with `evict_blocking` and `get_or_maybe_download`.
+    /// Expected rare because requires a race with `evict_blocking` and
+    /// `get_or_maybe_download`.
    fn inc_retried_get_or_maybe_download(&self) {
-        self.rare_counters[RareEvent::RetriedGetOrMaybeDownload].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["retried_gomd"])
+            .unwrap()
+            .inc();
    }

-    /// Expected rare because cancellations are unexpected, and failures are unexpected
+    /// Expected rare because cancellations are unexpected
+    fn inc_download_completed_without_requester(&self) {
+        self.rare_counters
+            .get_metric_with_label_values(&["download_completed_without"])
+            .unwrap()
+            .inc();
+    }
+
+    /// Expected rare because cancellations are unexpected
    fn inc_download_failed_without_requester(&self) {
-        self.rare_counters[RareEvent::DownloadFailedWithoutRequester].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["download_failed_without"])
+            .unwrap()
+            .inc();
    }

    /// The Weak in ResidentOrWantedEvicted::WantedEvicted was successfully upgraded.
@@ -1600,30 +1547,37 @@ impl LayerImplMetrics {
    /// If this counter is always zero, we should replace ResidentOrWantedEvicted type with an
    /// Option.
    fn inc_raced_wanted_evicted_accesses(&self) {
-        self.rare_counters[RareEvent::UpgradedWantedEvicted].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["raced_wanted_evicted"])
+            .unwrap()
+            .inc();
    }

-    /// These are only expected for [`Self::inc_init_cancelled`] amount when
+    /// These are only expected for [`Self::inc_download_completed_without_requester`] amount when
    /// running with remote storage.
    fn inc_init_needed_no_download(&self) {
-        self.rare_counters[RareEvent::InitWithoutDownload].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["init_needed_no_download"])
+            .unwrap()
+            .inc();
    }

    /// Expected rare because all layer files should be readable and good
    fn inc_permanent_loading_failures(&self) {
-        self.rare_counters[RareEvent::PermanentLoadingFailure].inc();
+        self.rare_counters
+            .get_metric_with_label_values(&["permanent_loading_failure"])
+            .unwrap()
+            .inc();
    }

    fn inc_broadcast_lagged(&self) {
-        self.rare_counters[RareEvent::EvictAndWaitLagged].inc();
-    }
-
-    fn inc_init_cancelled(&self) {
-        self.inits_cancelled.inc()
+        self.rare_counters
+            .get_metric_with_label_values(&["broadcast_lagged"])
+            .unwrap()
+            .inc();
    }
 }

-#[derive(enum_map::Enum)]
 enum EvictionCancelled {
    LayerGone,
    TimelineGone,
@@ -1652,47 +1606,19 @@ impl EvictionCancelled {
    }
 }

-#[derive(enum_map::Enum)]
-enum DeleteFailed {
+enum GcFailed {
    TimelineGone,
    DeleteSchedulingFailed,
 }

-impl DeleteFailed {
+impl GcFailed {
    fn as_str(&self) -> &'static str {
        match self {
-            DeleteFailed::TimelineGone => "timeline_gone",
-            DeleteFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
+            GcFailed::TimelineGone => "timeline_gone",
+            GcFailed::DeleteSchedulingFailed => "delete_scheduling_failed",
        }
    }
 }

-#[derive(enum_map::Enum)]
-enum RareEvent {
-    RemoveOnDropFailed,
-    RetriedGetOrMaybeDownload,
-    DownloadFailedWithoutRequester,
-    UpgradedWantedEvicted,
-    InitWithoutDownload,
-    PermanentLoadingFailure,
-    EvictAndWaitLagged,
-}
-
-impl RareEvent {
-    fn as_str(&self) -> &'static str {
-        use RareEvent::*;
-
-        match self {
-            RemoveOnDropFailed => "remove_on_drop_failed",
-            RetriedGetOrMaybeDownload => "retried_gomd",
-            DownloadFailedWithoutRequester => "download_failed_without",
-            UpgradedWantedEvicted => "raced_wanted_evicted",
-            InitWithoutDownload => "init_needed_no_download",
-            PermanentLoadingFailure => "permanent_loading_failure",
-            EvictAndWaitLagged => "broadcast_lagged",
-        }
-    }
-}
-
-pub(crate) static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
+static LAYER_IMPL_METRICS: once_cell::sync::Lazy<LayerImplMetrics> =
    once_cell::sync::Lazy::new(LayerImplMetrics::default);
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,6 +1,7 @@
 use core::fmt::Display;
+use pageserver_api::shard::TenantShardId;
 use std::ops::Range;
-use utils::lsn::Lsn;
+use utils::{id::TimelineId, lsn::Lsn};

 use crate::repository::Key;

@@ -8,11 +9,16 @@ use super::{DeltaFileName, ImageFileName, LayerFileName};

 use serde::{Deserialize, Serialize};

+#[cfg(test)]
+use utils::id::TenantId;
+
 /// A unique identifier of a persistent layer. This is different from `LayerDescriptor`, which is only used in the
 /// benchmarks. This struct contains all necessary information to find the image / delta layer. It also provides
 /// a unified way to generate layer information like file name.
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize)]
 pub struct PersistentLayerDesc {
+    pub tenant_shard_id: TenantShardId,
+    pub timeline_id: TimelineId,
    /// Range of keys that this layer covers
    pub key_range: Range<Key>,
    /// Inclusive start, exclusive end of the LSN range that this layer holds.
@@ -51,6 +57,8 @@ impl PersistentLayerDesc {
    #[cfg(test)]
    pub fn new_test(key_range: Range<Key>) -> Self {
        Self {
+            tenant_shard_id: TenantShardId::unsharded(TenantId::generate()),
+            timeline_id: TimelineId::generate(),
            key_range,
            lsn_range: Lsn(0)..Lsn(1),
            is_delta: false,
@@ -58,8 +66,16 @@ impl PersistentLayerDesc {
        }
    }

-    pub fn new_img(key_range: Range<Key>, lsn: Lsn, file_size: u64) -> Self {
+    pub fn new_img(
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn: Lsn,
+        file_size: u64,
+    ) -> Self {
        Self {
+            tenant_shard_id,
+            timeline_id,
            key_range,
            lsn_range: Self::image_layer_lsn_range(lsn),
            is_delta: false,
@@ -67,8 +83,16 @@ impl PersistentLayerDesc {
        }
    }

-    pub fn new_delta(key_range: Range<Key>, lsn_range: Range<Lsn>, file_size: u64) -> Self {
+    pub fn new_delta(
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        key_range: Range<Key>,
+        lsn_range: Range<Lsn>,
+        file_size: u64,
+    ) -> Self {
        Self {
+            tenant_shard_id,
+            timeline_id,
            key_range,
            lsn_range,
            is_delta: true,
@@ -76,10 +100,23 @@ impl PersistentLayerDesc {
        }
    }

-    pub fn from_filename(filename: LayerFileName, file_size: u64) -> Self {
+    pub fn from_filename(
+        tenant_shard_id: TenantShardId,
+        timeline_id: TimelineId,
+        filename: LayerFileName,
+        file_size: u64,
+    ) -> Self {
        match filename {
-            LayerFileName::Image(i) => Self::new_img(i.key_range, i.lsn, file_size),
-            LayerFileName::Delta(d) => Self::new_delta(d.key_range, d.lsn_range, file_size),
+            LayerFileName::Image(i) => {
+                Self::new_img(tenant_shard_id, timeline_id, i.key_range, i.lsn, file_size)
+            }
+            LayerFileName::Delta(d) => Self::new_delta(
+                tenant_shard_id,
+                timeline_id,
+                d.key_range,
+                d.lsn_range,
+                file_size,
+            ),
        }
    }

@@ -136,6 +173,10 @@ impl PersistentLayerDesc {
        self.key_range.clone()
    }

+    pub fn get_timeline_id(&self) -> TimelineId {
+        self.timeline_id
+    }
+
    /// Does this layer only contain some data for the key-range (incremental),
    /// or does it contain a version of every page? This is important to know
    /// for garbage collecting old layers: an incremental layer depends on
@@ -151,7 +192,9 @@ impl PersistentLayerDesc {
    pub fn dump(&self) {
        if self.is_delta {
            println!(
-                "----- delta layer keys {}-{} lsn {}-{} is_incremental {} size {} ----",
+                "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} is_incremental {} size {} ----",
+                self.tenant_shard_id,
+                self.timeline_id,
                self.key_range.start,
                self.key_range.end,
                self.lsn_range.start,
@@ -161,7 +204,9 @@ impl PersistentLayerDesc {
            );
        } else {
            println!(
-                "----- image layer key {}-{} at {} is_incremental {} size {} ----",
+                "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+                self.tenant_shard_id,
+                self.timeline_id,
                self.key_range.start,
                self.key_range.end,
                self.image_layer_lsn(),
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -313,10 +313,6 @@ pub struct Timeline {
    /// Gate to prevent shutdown completing while I/O is still happening to this timeline's data
    pub(crate) gate: Gate,

-    /// Gate to prevent shutdown completing until all Layers for this Timeline have finished
-    /// doing any background I/O such as deleting files on drop.
-    pub(crate) layer_gate: Gate,
-
    /// Cancellation token scoped to this timeline: anything doing long-running work relating
    /// to the timeline should drop out when this token fires.
    pub(crate) cancel: CancellationToken,
@@ -482,7 +478,7 @@ impl Timeline {
            .map(|ancestor| ancestor.timeline_id)
    }

-    /// Lock and get timeline's GC cutoff
+    /// Lock and get timeline's GC cutoff
    pub fn get_latest_gc_cutoff_lsn(&self) -> RcuReadGuard<Lsn> {
        self.latest_gc_cutoff_lsn.read()
    }
@@ -1006,15 +1002,8 @@ impl Timeline {
        )
        .await;

-        // Wait until any normal gate-holders such as page_service requests are complete
+        // Finally wait until any gate-holders are complete
        self.gate.close().await;
-
-        // Drop our references to layers: this should permit all layers to be dropped, and any I/O
-        // in their drop() method to complete.
-        self.layers.write().await.clear();
-
-        // Wait until any Layer gate holders such as LayerInner::drop are complete
-        self.layer_gate.close().await;
    }

    pub fn set_state(&self, new_state: TimelineState) {
@@ -1456,7 +1445,6 @@ impl Timeline {

                cancel,
                gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),
-                layer_gate: Gate::new(format!("TimelineLayers<{tenant_shard_id}/{timeline_id}>")),

                compaction_lock: tokio::sync::Mutex::default(),
                gc_lock: tokio::sync::Mutex::default(),
@@ -2188,7 +2176,7 @@ trait TraversalLayerExt {

 impl TraversalLayerExt for Layer {
    fn traversal_id(&self) -> TraversalId {
-        self.filename().to_string()
+        self.local_path().to_string()
    }
 }

@@ -2902,8 +2890,7 @@ impl Timeline {
                let _g = span.entered();
                let new_delta =
                    Handle::current().block_on(frozen_layer.write_to_disk(&self_clone, &ctx))?;
-                let new_delta_path = new_delta
-                    .local_path_from_id(&self_clone.tenant_shard_id, &self_clone.timeline_id);
+                let new_delta_path = new_delta.local_path().to_owned();

                // Sync it to disk.
                //
@@ -3147,7 +3134,7 @@ impl Timeline {
        // and fsync them all in parallel.
        let all_paths = image_layers
            .iter()
-            .map(|layer| layer.local_path_from_id(&self.tenant_shard_id, &self.timeline_id))
+            .map(|layer| layer.local_path().to_owned())
            .collect::<Vec<_>>();

        par_fsync::par_fsync_async(&all_paths)
@@ -3696,7 +3683,7 @@ impl Timeline {
            // FIXME: the writer already fsyncs all data, only rename needs to be fsynced here
            let layer_paths: Vec<Utf8PathBuf> = new_layers
                .iter()
-                .map(|l| l.local_path_from_id(&self.tenant_shard_id, &self.timeline_id))
+                .map(|l| l.local_path().to_owned())
                .collect();

            // Fsync all the layer files and directory using multiple threads to
@@ -3984,7 +3971,7 @@ impl Timeline {
        // for details. This will block until the old value is no longer in use.
        //
        // The GC cutoff should only ever move forwards.
-        let waitlist = {
+        {
            let write_guard = self.latest_gc_cutoff_lsn.lock_for_write();
            ensure!(
                *write_guard <= new_gc_cutoff,
@@ -3992,9 +3979,8 @@ impl Timeline {
                *write_guard,
                new_gc_cutoff
            );
-            write_guard.store_and_unlock(new_gc_cutoff)
-        };
-        waitlist.wait().await;
+            write_guard.store_and_unlock(new_gc_cutoff).wait();
+        }

        info!("GC starting");

--- a/pageserver/src/tenant/timeline/layer_manager.rs
+++ b/pageserver/src/tenant/timeline/layer_manager.rs
@@ -33,11 +33,6 @@ impl LayerManager {
        }
    }

-    pub(crate) fn clear(&mut self) {
-        self.layer_map = LayerMap::default();
-        self.layer_fmgr.clear();
-    }
-
    pub(crate) fn get_from_desc(&self, desc: &PersistentLayerDesc) -> Layer {
        self.layer_fmgr.get_from_desc(desc)
    }
@@ -248,7 +243,7 @@ impl LayerManager {
        //      map index without actually rebuilding the index.
        updates.remove_historic(desc);
        mapping.remove(layer);
-        layer.delete_on_drop();
+        layer.garbage_collect_on_drop();
    }

    pub(crate) fn contains(&self, layer: &Layer) -> bool {
@@ -276,10 +271,6 @@ impl<T: AsLayerDesc + Clone> LayerFileManager<T> {
        }
    }

-    pub(crate) fn clear(&mut self) {
-        self.0.clear();
-    }
-
    pub(crate) fn contains(&self, layer: &T) -> bool {
        self.0.contains_key(&layer.layer_desc().key())
    }
--- a/pageserver/src/virtual_file.rs
+++ b/pageserver/src/virtual_file.rs
@@ -610,11 +610,9 @@ impl Drop for VirtualFile {
            slot.recently_used.store(false, Ordering::Relaxed);
            // there is also operation "close-by-replace" for closes done on eviction for
            // comparison.
-            if let Some(fd) = slot_guard.file.take() {
-                STORAGE_IO_TIME_METRIC
-                    .get(StorageIoOperation::Close)
-                    .observe_closure_duration(|| drop(fd));
-            }
+            STORAGE_IO_TIME_METRIC
+                .get(StorageIoOperation::Close)
+                .observe_closure_duration(|| drop(slot_guard.file.take()));
        }
    }
 }
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -21,6 +21,7 @@
 //! redo Postgres process, but some records it can handle directly with
 //! bespoken Rust code.

+use pageserver_api::shard::ShardIdentity;
 use postgres_ffi::v14::nonrelfile_utils::clogpage_precedes;
 use postgres_ffi::v14::nonrelfile_utils::slru_may_delete_clogsegment;
 use postgres_ffi::{fsm_logical_to_physical, page_is_new, page_set_lsn};
@@ -30,6 +31,7 @@ use bytes::{Buf, Bytes, BytesMut};
 use tracing::*;

 use crate::context::RequestContext;
+use crate::metrics::WAL_INGEST;
 use crate::pgdatadir_mapping::*;
 use crate::tenant::PageReconstructError;
 use crate::tenant::Timeline;
@@ -46,6 +48,7 @@ use postgres_ffi::BLCKSZ;
 use utils::lsn::Lsn;

 pub struct WalIngest<'a> {
+    shard: ShardIdentity,
    timeline: &'a Timeline,

    checkpoint: CheckPoint,
@@ -65,6 +68,7 @@ impl<'a> WalIngest<'a> {
        trace!("CheckPoint.nextXid = {}", checkpoint.nextXid.value);

        Ok(WalIngest {
+            shard: *timeline.get_shard_identity(),
            timeline,
            checkpoint,
            checkpoint_modified: false,
@@ -87,6 +91,8 @@ impl<'a> WalIngest<'a> {
        decoded: &mut DecodedWALRecord,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
+        WAL_INGEST.records_received.inc();
+
        modification.lsn = lsn;
        decode_wal_record(recdata, decoded, self.timeline.pg_version)?;

@@ -355,6 +361,32 @@ impl<'a> WalIngest<'a> {
        // Iterate through all the blocks that the record modifies, and
        // "put" a separate copy of the record for each block.
        for blk in decoded.blocks.iter() {
+            let rel = RelTag {
+                spcnode: blk.rnode_spcnode,
+                dbnode: blk.rnode_dbnode,
+                relnode: blk.rnode_relnode,
+                forknum: blk.forknum,
+            };
+
+            let key = rel_block_to_key(rel, blk.blkno);
+            let key_is_local = self.shard.is_key_local(&key);
+
+            tracing::debug!(
+                "ingest: shard decision {} (checkpoint={}) for key {}",
+                if !key_is_local { "drop" } else { "keep" },
+                self.checkpoint_modified,
+                key
+            );
+
+            if !key_is_local {
+                if self.shard.is_zero() {
+                    // Shard 0 tracks relation sizes.  Although we will not store this block, we will observe
+                    // its blkno in case it implicitly extends a relation.
+                    self.observe_decoded_block(modification, blk, ctx).await?;
+                }
+
+                continue;
+            }
            self.ingest_decoded_block(modification, lsn, decoded, blk, ctx)
                .await?;
        }
@@ -367,13 +399,37 @@ impl<'a> WalIngest<'a> {
            self.checkpoint_modified = false;
        }

+        if modification.is_empty() {
+            tracing::debug!("ingest: filtered out record @ LSN {lsn}");
+            WAL_INGEST.records_filtered.inc();
+            return Ok(());
+        }
+
        // Now that this record has been fully handled, including updating the
        // checkpoint data, let the repository know that it is up-to-date to this LSN
+        WAL_INGEST.records_committed.inc();
        modification.commit(ctx).await?;

        Ok(())
    }

+    /// Do not store this block, but observe it for the purposes of updating our relation size state.
+    async fn observe_decoded_block(
+        &mut self,
+        modification: &mut DatadirModification<'_>,
+        blk: &DecodedBkpBlock,
+        ctx: &RequestContext,
+    ) -> Result<(), PageReconstructError> {
+        let rel = RelTag {
+            spcnode: blk.rnode_spcnode,
+            dbnode: blk.rnode_dbnode,
+            relnode: blk.rnode_relnode,
+            forknum: blk.forknum,
+        };
+        self.handle_rel_extend(modification, rel, blk.blkno, ctx)
+            .await
+    }
+
    async fn ingest_decoded_block(
        &mut self,
        modification: &mut DatadirModification<'_>,
@@ -1465,8 +1521,15 @@ impl<'a> WalIngest<'a> {
            //info!("extending {} {} to {}", rel, old_nblocks, new_nblocks);
            modification.put_rel_extend(rel, new_nblocks, ctx).await?;

+            let mut key = rel_block_to_key(rel, blknum);
            // fill the gap with zeros
            for gap_blknum in old_nblocks..blknum {
+                key.field6 = gap_blknum;
+
+                if self.shard.get_shard_number(&key) != self.shard.number {
+                    continue;
+                }
+
                modification.put_rel_page_image(rel, gap_blknum, ZERO_PAGE.clone())?;
            }
        }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -18,6 +18,7 @@
 #include "fmgr.h"
 #include "access/xlog.h"
 #include "access/xlogutils.h"
+#include "common/hashfn.h"
 #include "storage/buf_internals.h"
 #include "storage/lwlock.h"
 #include "storage/ipc.h"
@@ -36,22 +37,12 @@
 #include "neon.h"
 #include "walproposer.h"
 #include "neon_utils.h"
+#include "control_plane_connector.h"

 #define PageStoreTrace DEBUG5

 #define RECONNECT_INTERVAL_USEC 1000000

-bool		connected = false;
-PGconn	   *pageserver_conn = NULL;
-
-/*
- * WaitEventSet containing:
- * - WL_SOCKET_READABLE on pageserver_conn,
- * - WL_LATCH_SET on MyLatch, and
- * - WL_EXIT_ON_PM_DEATH.
- */
-WaitEventSet *pageserver_conn_wes = NULL;
-
 /* GUCs */
 char	   *neon_timeline;
 char	   *neon_tenant;
@@ -64,87 +55,175 @@ int			flush_every_n_requests = 8;

 int			n_reconnect_attempts = 0;
 int			max_reconnect_attempts = 60;
-
-#define MAX_PAGESERVER_CONNSTRING_SIZE 256
-
-typedef struct
-{
-    LWLockId lock;
-    pg_atomic_uint64 update_counter;
-    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
-} PagestoreShmemState;
-
-#if PG_VERSION_NUM >= 150000
-static shmem_request_hook_type prev_shmem_request_hook = NULL;
-static void walproposer_shmem_request(void);
-#endif
-static shmem_startup_hook_type prev_shmem_startup_hook;
-static PagestoreShmemState *pagestore_shared;
-static uint64 pagestore_local_counter = 0;
-static char local_pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
+int			stripe_size;

 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

-static bool pageserver_flush(void);
-static void pageserver_disconnect(void);
+static bool pageserver_flush(shardno_t shard_no);
+static void pageserver_disconnect(shardno_t shard_no);
+static void AssignPageserverConnstring(const char *newval, void *extra);

-static bool
-PagestoreShmemIsValid()
-{
-    return pagestore_shared && UsedShmemSegAddr;
-}
+static shmem_startup_hook_type prev_shmem_startup_hook;
+#if PG_VERSION_NUM>=150000
+static shmem_request_hook_type prev_shmem_request_hook;
+#endif

-static bool
-CheckPageserverConnstring(char **newval, void **extra, GucSource source)
+typedef struct
 {
-    return strlen(*newval) < MAX_PAGESERVER_CONNSTRING_SIZE;
+	size_t n_shards;
+	pg_atomic_uint64 begin_update_counter;
+	pg_atomic_uint64 end_update_counter;
+	char   shard_connstr[MAX_SHARDS][MAX_PS_CONNSTR_LEN];
+} ShardMap;
+
+
+static ShardMap* shard_map;
+static uint64    shard_map_update_counter;
+
+typedef struct
+{
+	/*
+	 * Connection for each shard
+	 */
+	PGconn	   *conn;
+    /*
+	 * WaitEventSet containing:
+	 * - WL_SOCKET_READABLE on pageserver_conn,
+	 * - WL_LATCH_SET on MyLatch, and
+	 * - WL_EXIT_ON_PM_DEATH.
+	 */
+	WaitEventSet    *wes;
+} PageServer;
+
+static PageServer page_servers[MAX_SHARDS];
+static shardno_t  max_attached_shard_no;
+
+static void
+psm_shmem_startup(void)
+{
+	bool found;
+	if (prev_shmem_startup_hook)
+	{
+		prev_shmem_startup_hook();
+	}
+
+	LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
+
+	shard_map = (ShardMap*)ShmemInitStruct("shard_map", sizeof(ShardMap), &found);
+	if (!found)
+	{
+		shard_map->n_shards = 0;
+		pg_atomic_init_u64(&shard_map->begin_update_counter, 0);
+		pg_atomic_init_u64(&shard_map->end_update_counter, 0);
+		AssignPageserverConnstring(page_server_connstring, NULL);
+	}
+	LWLockRelease(AddinShmemInitLock);
 }

 static void
-AssignPageserverConnstring(const char *newval, void *extra)
+psm_shmem_request(void)
 {
-    if(!PagestoreShmemIsValid())
-        return;
-    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
-    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
-    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-    LWLockRelease(pagestore_shared->lock);
-}
+#if PG_VERSION_NUM>=150000
+	if (prev_shmem_request_hook)
+		prev_shmem_request_hook();
+#endif

-static bool
-CheckConnstringUpdated()
-{
-    if(!PagestoreShmemIsValid())
-        return false;
-    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
+	RequestAddinShmemSpace(sizeof(ShardMap));
 }

 static void
-ReloadConnstring()
+psm_init(void)
 {
-    if(!PagestoreShmemIsValid())
-        return;
-    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
-    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
-    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-    LWLockRelease(pagestore_shared->lock);
+	prev_shmem_startup_hook = shmem_startup_hook;
+	shmem_startup_hook = psm_shmem_startup;
+#if PG_VERSION_NUM>=150000
+	prev_shmem_request_hook = shmem_request_hook;
+	shmem_request_hook = psm_shmem_request;
+#else
+	psm_shmem_request();
+#endif
+}
+
+/*
+ * Reload shard map if needed and return number of shards and connection string for the specified shard
+ */
+static shardno_t
+load_shard_map(shardno_t shard_no, char* connstr)
+{
+	shardno_t n_shards;
+	uint64 begin_update_counter;
+	uint64 end_update_counter;
+
+	/*
+	 * There is a race condition here between backends and the postmaster, which can update the shard map.
+	 * We recheck the update counter after copying the connection string to check that the configuration was not changed.
+	 */
+	do
+	{
+		begin_update_counter = pg_atomic_read_u64(&shard_map->begin_update_counter);
+		end_update_counter = pg_atomic_read_u64(&shard_map->end_update_counter);
+
+		n_shards = shard_map->n_shards;
+		if (shard_no >= n_shards)
+			elog(ERROR, "Shard %d is greater or equal than number of shards %d", shard_no, n_shards);
+
+		if (connstr)
+			strncpy(connstr, shard_map->shard_connstr[shard_no], MAX_PS_CONNSTR_LEN);
+
+	}
+	while (begin_update_counter != end_update_counter
+		   || begin_update_counter != pg_atomic_read_u64(&shard_map->begin_update_counter)
+		   || end_update_counter != pg_atomic_read_u64(&shard_map->end_update_counter));
+
+	if (shard_map_update_counter != end_update_counter)
+ 	{
+		/* Reset all connections if connection strings are changed */
+ 		for (shardno_t i = 0; i < max_attached_shard_no; i++)
+ 		{
+ 			if (page_servers[i].conn)
+ 				pageserver_disconnect(i);
+ 		}
+		max_attached_shard_no = 0;
+		shard_map_update_counter = end_update_counter;
+    }
+
+	return n_shards;
+}
+
+#define MB (1024*1024)
+
+shardno_t
+get_shard_number(BufferTag* tag)
+{
+	shardno_t n_shards = load_shard_map(0, NULL);
+	uint32	  hash;
+
+#if PG_MAJORVERSION_NUM < 16
+	hash = murmurhash32(tag->rnode.relNode);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum/(MB/BLCKSZ)/stripe_size));
+#else
+	hash = murmurhash32(tag->relNumber);
+	hash = hash_combine(hash, murmurhash32(tag->blockNum/(MB/BLCKSZ)/stripe_size));
+#endif
+
+	return hash % n_shards;
 }

 static bool
-pageserver_connect(int elevel)
+pageserver_connect(shardno_t shard_no, int elevel)
 {
 	char	   *query;
 	int			ret;
 	const char *keywords[3];
 	const char *values[3];
 	int			n;
+	PGconn*		conn;
+	WaitEventSet *wes;
+	char        connstr[MAX_PS_CONNSTR_LEN];

-	Assert(!connected);
+	Assert(page_servers[shard_no].conn == NULL);

-        if(CheckConnstringUpdated())
-        {
-            ReloadConnstring();
-        }
+	(void)load_shard_map(shard_no, connstr); /* refresh page map if needed */

 	/*
 	 * Connect using the connection string we got from the
@@ -165,19 +244,18 @@ pageserver_connect(int elevel)
 		n++;
 	}
 	keywords[n] = "dbname";
-	values[n] = local_pageserver_connstring;
+	values[n] = connstr;
 	n++;
 	keywords[n] = NULL;
 	values[n] = NULL;
 	n++;
-	pageserver_conn = PQconnectdbParams(keywords, values, 1);
+	conn = PQconnectdbParams(keywords, values, 1);

-	if (PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (PQstatus(conn) == CONNECTION_BAD)
 	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+		char	   *msg = pchomp(PQerrorMessage(conn));

-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
+		PQfinish(conn);

 		ereport(elevel,
 				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
@@ -185,30 +263,28 @@ pageserver_connect(int elevel)
 				 errdetail_internal("%s", msg)));
 		return false;
 	}
-
 	query = psprintf("pagestream %s %s", neon_tenant, neon_timeline);
-	ret = PQsendQuery(pageserver_conn, query);
+	ret = PQsendQuery(conn, query);
 	if (ret != 1)
 	{
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
+		PQfinish(conn);
 		neon_log(elevel, "could not send pagestream command to pageserver");
 		return false;
 	}

-	pageserver_conn_wes = CreateWaitEventSet(TopMemoryContext, 3);
-	AddWaitEventToSet(pageserver_conn_wes, WL_LATCH_SET, PGINVALID_SOCKET,
+	wes = CreateWaitEventSet(TopMemoryContext, 3);
+	AddWaitEventToSet(wes, WL_LATCH_SET, PGINVALID_SOCKET,
 			  MyLatch, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
+	AddWaitEventToSet(wes, WL_EXIT_ON_PM_DEATH, PGINVALID_SOCKET,
 			  NULL, NULL);
-	AddWaitEventToSet(pageserver_conn_wes, WL_SOCKET_READABLE, PQsocket(pageserver_conn), NULL, NULL);
+	AddWaitEventToSet(wes, WL_SOCKET_READABLE, PQsocket(conn), NULL, NULL);

-	while (PQisBusy(pageserver_conn))
+	while (PQisBusy(conn))
 	{
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -216,14 +292,12 @@ pageserver_connect(int elevel)
 		/* Data available in socket? */
 		if (event.events & WL_SOCKET_READABLE)
 		{
-			if (!PQconsumeInput(pageserver_conn))
+			if (!PQconsumeInput(conn))
 			{
-				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+				char	   *msg = pchomp(PQerrorMessage(conn));

-				PQfinish(pageserver_conn);
-				pageserver_conn = NULL;
-				FreeWaitEventSet(pageserver_conn_wes);
-				pageserver_conn_wes = NULL;
+				PQfinish(conn);
+				FreeWaitEventSet(wes);

 				neon_log(elevel, "could not complete handshake with pageserver: %s",
 						 msg);
@@ -232,9 +306,11 @@ pageserver_connect(int elevel)
 		}
 	}

-	neon_log(LOG, "libpagestore: connected to '%s'", page_server_connstring);
+	neon_log(LOG, "libpagestore: connected to '%s'", connstr);
+	page_servers[shard_no].conn = conn;
+	page_servers[shard_no].wes = wes;
+	max_attached_shard_no = Max(shard_no+1, max_attached_shard_no);

-	connected = true;
 	return true;
 }

@@ -242,10 +318,10 @@ pageserver_connect(int elevel)
 * A wrapper around PQgetCopyData that checks for interrupts while sleeping.
 */
 static int
-call_PQgetCopyData(char **buffer)
+call_PQgetCopyData(shardno_t shard_no, char **buffer)
 {
 	int			ret;
-
+	PGconn*     pageserver_conn = page_servers[shard_no].conn;
 retry:
 	ret = PQgetCopyData(pageserver_conn, buffer, 1 /* async */ );

@@ -254,7 +330,7 @@ retry:
 		WaitEvent	event;

 		/* Sleep until there's something to do */
-		(void) WaitEventSetWait(pageserver_conn_wes, -1L, &event, 1, PG_WAIT_EXTENSION);
+		(void) WaitEventSetWait(page_servers[shard_no].wes, -1L, &event, 1, PG_WAIT_EXTENSION);
 		ResetLatch(MyLatch);

 		CHECK_FOR_INTERRUPTS();
@@ -279,7 +355,7 @@ retry:


 static void
-pageserver_disconnect(void)
+pageserver_disconnect(shardno_t shard_no)
 {
 	/*
 	 * If anything goes wrong while we were sending a request, it's not clear
@@ -288,38 +364,32 @@ pageserver_disconnect(void)
 	 * time later after we have already sent a new unrelated request. Close
 	 * the connection to avoid getting confused.
 	 */
-	if (connected)
+	if (page_servers[shard_no].conn)
 	{
 		neon_log(LOG, "dropping connection to page server due to error");
-		PQfinish(pageserver_conn);
-		pageserver_conn = NULL;
-		connected = false;
+		PQfinish(page_servers[shard_no].conn);
+		page_servers[shard_no].conn = NULL;

 		prefetch_on_ps_disconnect();
 	}
-	if (pageserver_conn_wes != NULL)
+	if (page_servers[shard_no].wes != NULL)
 	{
-		FreeWaitEventSet(pageserver_conn_wes);
-		pageserver_conn_wes = NULL;
+		FreeWaitEventSet(page_servers[shard_no].wes);
+		page_servers[shard_no].wes = NULL;
 	}
 }

 static bool
-pageserver_send(NeonRequest * request)
+pageserver_send(shardno_t shard_no, NeonRequest * request)
 {
 	StringInfoData req_buff;
-
-        if(CheckConnstringUpdated())
-        {
-            pageserver_disconnect();
-            ReloadConnstring();
-        }
+	PGconn* pageserver_conn = page_servers[shard_no].conn;

 	/* If the connection was lost for some reason, reconnect */
-	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
+	if (pageserver_conn && PQstatus(pageserver_conn) == CONNECTION_BAD)
 	{
 		neon_log(LOG, "pageserver_send disconnect bad connection");
-		pageserver_disconnect();
+		pageserver_disconnect(shard_no);
 	}

 	req_buff = nm_pack_request(request);
@@ -331,9 +401,9 @@ pageserver_send(NeonRequest * request)
 	 * See https://github.com/neondatabase/neon/issues/1138
 	 * So try to reestablish connection in case of failure.
 	 */
-	if (!connected)
+	if (!page_servers[shard_no].conn)
 	{
-		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		while (!pageserver_connect(shard_no, n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
 		{
 			HandleMainLoopInterrupts();
 			n_reconnect_attempts += 1;
@@ -342,7 +412,9 @@ pageserver_send(NeonRequest * request)
 		n_reconnect_attempts = 0;
 	}

-	/*
+	pageserver_conn = page_servers[shard_no].conn;
+
+    /*
 	 * Send request.
 	 *
 	 * In principle, this could block if the output buffer is full, and we
@@ -353,7 +425,7 @@ pageserver_send(NeonRequest * request)
 	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
 	{
 		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-		pageserver_disconnect();
+		pageserver_disconnect(shard_no);
 		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
 		pfree(msg);
 		pfree(req_buff.data);
@@ -373,12 +445,12 @@ pageserver_send(NeonRequest * request)
 }

 static NeonResponse *
-pageserver_receive(void)
+pageserver_receive(shardno_t shard_no)
 {
 	StringInfoData resp_buff;
 	NeonResponse *resp;
-
-	if (!connected)
+	PGconn* pageserver_conn = page_servers[shard_no].conn;
+	if (!pageserver_conn)
 		return NULL;

 	PG_TRY();
@@ -386,7 +458,7 @@ pageserver_receive(void)
 		/* read response */
 		int			rc;

-		rc = call_PQgetCopyData(&resp_buff.data);
+		rc = call_PQgetCopyData(shard_no, &resp_buff.data);
 		if (rc >= 0)
 		{
 			resp_buff.len = rc;
@@ -405,25 +477,25 @@ pageserver_receive(void)
 		else if (rc == -1)
 		{
 			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
-			pageserver_disconnect();
+			pageserver_disconnect(shard_no);
 			resp = NULL;
 		}
 		else if (rc == -2)
 		{
 			char* msg = pchomp(PQerrorMessage(pageserver_conn));
-			pageserver_disconnect();
+			pageserver_disconnect(shard_no);
 			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
 		}
 		else
 		{
-			pageserver_disconnect();
+			pageserver_disconnect(shard_no);
 			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
 		}
 	}
 	PG_CATCH();
 	{
 		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
-		pageserver_disconnect();
+		pageserver_disconnect(shard_no);
 		PG_RE_THROW();
 	}
 	PG_END_TRY();
@@ -433,9 +505,10 @@ pageserver_receive(void)


 static bool
-pageserver_flush(void)
+pageserver_flush(shardno_t shard_no)
 {
-	if (!connected)
+	PGconn* pageserver_conn = page_servers[shard_no].conn;
+	if (!pageserver_conn)
 	{
 		neon_log(WARNING, "Tried to flush while disconnected");
 	}
@@ -444,7 +517,7 @@ pageserver_flush(void)
 		if (PQflush(pageserver_conn))
 		{
 			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-			pageserver_disconnect();
+			pageserver_disconnect(shard_no);
 			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
 			pfree(msg);
 			return false;
@@ -468,62 +541,61 @@ check_neon_id(char **newval, void **extra, GucSource source)
 	return **newval == '\0' || HexDecodeString(id, *newval, 16);
 }

-static Size
-PagestoreShmemSize(void)
-{
-    return sizeof(PagestoreShmemState);
-}
-
-static bool
-PagestoreShmemInit(void)
-{
-    bool found;
-    LWLockAcquire(AddinShmemInitLock, LW_EXCLUSIVE);
-    pagestore_shared = ShmemInitStruct("libpagestore shared state",
-                                       PagestoreShmemSize(),
-                                       &found);
-    if(!found)
-    {
-        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
-        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
-        AssignPageserverConnstring(page_server_connstring, NULL);
-    }
-    LWLockRelease(AddinShmemInitLock);
-    return found;
-}
-
 static void
-pagestore_shmem_startup_hook(void)
+AssignPageserverConnstring(const char *newval, void *extra)
 {
-    if(prev_shmem_startup_hook)
-        prev_shmem_startup_hook();
+	/*
+	 * Load the shard map only in the postmaster (or while the map is still empty).
+	 * If the old page server is unavailable, backends can be stuck in the reconnect loop, and they do not reload the config inside that loop.
+	 */
+	if (shard_map != NULL && (MyProcPid == PostmasterPid || shard_map->n_shards == 0))
+ 	{
+		char const* shard_connstr = newval;
+		char const* sep;
+		size_t connstr_len;
+		int i = 0;
+		bool shard_map_changed = false;
+		do
+		{
+			sep = strchr(shard_connstr, ',');
+			connstr_len = sep != NULL ? sep - shard_connstr : strlen(shard_connstr);
+			if (connstr_len == 0)
+				break; /* trailing comma */
+			if (i >= MAX_SHARDS)
+			{
+				elog(LOG, "Too many shards");
+				return;
+			}
+			if (connstr_len >= MAX_PS_CONNSTR_LEN)
+			{
+				elog(LOG, "Connection  string too long");
+				return;
+			}
+			if (i >= shard_map->n_shards ||
+				strcmp(shard_map->shard_connstr[i], shard_connstr) != 0)
+			{
+				if (!shard_map_changed)
+				{
+					pg_atomic_add_fetch_u64(&shard_map->begin_update_counter, 1);
+					shard_map_changed = true;
+				}
+				memcpy(shard_map->shard_connstr[i], shard_connstr, connstr_len+1);
+			}
+			shard_connstr = sep + 1;
+			i += 1;
+		} while (sep != NULL);

-    PagestoreShmemInit();
-}
-
-static void
-pagestore_shmem_request(void)
-{
-#if PG_VERSION_NUM >= 150000
-    if(prev_shmem_request_hook)
-        prev_shmem_request_hook();
-#endif
-
-    RequestAddinShmemSpace(PagestoreShmemSize());
-    RequestNamedLWLockTranche("neon_libpagestore", 1);
-}
-
-static void
-pagestore_prepare_shmem(void)
-{
-#if PG_VERSION_NUM >= 150000
-	prev_shmem_request_hook = shmem_request_hook;
-	shmem_request_hook = pagestore_shmem_request;
-#else
-        pagestore_shmem_request();
-#endif
-	prev_shmem_startup_hook = shmem_startup_hook;
-	shmem_startup_hook = pagestore_shmem_startup_hook;
+		if (i == 0)
+		{
+			elog(LOG, "No shards were specified");
+			return;
+		}
+		if (shard_map_changed)
+		{
+			shard_map->n_shards = i;
+			pg_atomic_add_fetch_u64(&shard_map->end_update_counter, 1);
+		}
+	}
 }

 /*
@@ -532,8 +604,6 @@ pagestore_prepare_shmem(void)
 void
 pg_init_libpagestore(void)
 {
-        pagestore_prepare_shmem();
-
 	DefineCustomStringVariable("neon.pageserver_connstring",
 							   "connection string to the page server",
 							   NULL,
@@ -541,7 +611,7 @@ pg_init_libpagestore(void)
 							   "",
 							   PGC_SIGHUP,
 							   0,	/* no flags required */
-							   CheckPageserverConnstring, AssignPageserverConnstring, NULL);
+							   NULL, AssignPageserverConnstring, NULL);

 	DefineCustomStringVariable("neon.timeline_id",
 							   "Neon timeline_id the server is running on",
@@ -561,6 +631,15 @@ pg_init_libpagestore(void)
 							   0,	/* no flags required */
 							   check_neon_id, NULL, NULL);

+	DefineCustomIntVariable("neon.stripe_size",
+							"sharding sripe size",
+							NULL,
+							&stripe_size,
+							256, 1, INT_MAX,
+							PGC_SIGHUP,
+							GUC_UNIT_MB,
+							NULL, NULL, NULL);
+
 	DefineCustomIntVariable("neon.max_cluster_size",
 							"cluster size limit",
 							NULL,
@@ -623,4 +702,5 @@ pg_init_libpagestore(void)
 	}

 	lfc_init();
+	psm_init();
 }
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -20,12 +20,16 @@
 #include RELFILEINFO_HDR
 #include "storage/block.h"
 #include "storage/smgr.h"
+#include "storage/buf_internals.h"
 #include "lib/stringinfo.h"
 #include "libpq/pqformat.h"
 #include "utils/memutils.h"

 #include "pg_config.h"

+#define MAX_SHARDS 128
+#define MAX_PS_CONNSTR_LEN 128
+
 typedef enum
 {
 	/* pagestore_client -> pagestore */
@@ -144,11 +148,13 @@ extern char *nm_to_string(NeonMessage * msg);
 * API
 */

+typedef unsigned shardno_t;
+
 typedef struct
 {
-	bool		(*send) (NeonRequest * request);
-	NeonResponse *(*receive) (void);
-	bool		(*flush) (void);
+	bool		(*send) (shardno_t  shard_no, NeonRequest * request);
+	NeonResponse *(*receive) (shardno_t shard_no);
+	bool		(*flush) (shardno_t shard_no);
 }			page_server_api;

 extern void prefetch_on_ps_disconnect(void);
@@ -165,6 +171,8 @@ extern char *neon_tenant;
 extern bool wal_redo;
 extern int32 max_cluster_size;

+extern shardno_t get_shard_number(BufferTag* tag);
+
 extern const f_smgr *smgr_neon(BackendId backend, NRelFileInfo rinfo);
 extern void smgr_init_neon(void);
 extern void readahead_buffer_resize(int newsize, void *extra);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -59,7 +59,6 @@
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
-#include "storage/fsm_internals.h"
 #include "storage/smgr.h"
 #include "storage/md.h"
 #include "pgstat.h"
@@ -165,6 +164,7 @@ typedef struct PrefetchRequest {
 	XLogRecPtr	actual_request_lsn;
 	NeonResponse *response; /* may be null */
 	PrefetchStatus status;
+	shardno_t   shard_no;
 	uint64		my_ring_index;
 } PrefetchRequest;

@@ -226,6 +226,8 @@ typedef struct PrefetchState {

 	/* the buffers */
 	prfh_hash *prf_hash;
+	int     max_shard_no;
+	uint8   shard_bitmap[(MAX_SHARDS + 7)/8];
 	PrefetchRequest prf_buffer[]; /* prefetch buffers */
 } PrefetchState;

@@ -314,6 +316,7 @@ compact_prefetch_buffers(void)
 		Assert(target_slot->status == PRFS_UNUSED);

 		target_slot->buftag = source_slot->buftag;
+		target_slot->shard_no = source_slot->shard_no;
 		target_slot->status = source_slot->status;
 		target_slot->response = source_slot->response;
 		target_slot->effective_request_lsn = source_slot->effective_request_lsn;
@@ -478,6 +481,23 @@ prefetch_cleanup_trailing_unused(void)
 	}
 }

+
+static bool
+prefetch_flush_requests(void)
+{
+	for (shardno_t shard_no = 0; shard_no < MyPState->max_shard_no; shard_no++)
+	{
+		if (MyPState->shard_bitmap[shard_no >> 3] & (1 << (shard_no & 7)))
+		{
+			if (!page_server->flush(shard_no))
+				return false;
+			MyPState->shard_bitmap[shard_no >> 3] &= ~(1 << (shard_no & 7));
+		}
+	}
+	MyPState->max_shard_no = 0;
+	return true;
+}
+
 /*
 * Wait for slot of ring_index to have received its response.
 * The caller is responsible for making sure the request buffer is flushed.
@@ -493,7 +513,7 @@ prefetch_wait_for(uint64 ring_index)
 	if (MyPState->ring_flush <= ring_index &&
 		MyPState->ring_unused > MyPState->ring_flush)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 			return false;
 		MyPState->ring_flush = MyPState->ring_unused;
 	}
@@ -531,7 +551,7 @@ prefetch_read(PrefetchRequest *slot)
 	Assert(slot->my_ring_index == MyPState->ring_receive);

 	old = MemoryContextSwitchTo(MyPState->errctx);
-	response = (NeonResponse *) page_server->receive();
+	response = (NeonResponse *) page_server->receive(slot->shard_no);
 	MemoryContextSwitchTo(old);
 	if (response)
 	{
@@ -683,12 +703,14 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);

-	while (!page_server->send((NeonRequest *) &request));
+	while (!page_server->send(slot->shard_no, (NeonRequest *) &request));

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
 	MyPState->n_unused -= 1;
 	MyPState->ring_unused += 1;
+	MyPState->shard_bitmap[slot->shard_no >> 3] |= 1 << (slot->shard_no & 7);
+	MyPState->max_shard_no = Max(slot->shard_no+1, MyPState->max_shard_no);

 	/* update slot state */
 	slot->status = PRFS_REQUESTED;
@@ -848,6 +870,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	 * function reads the buffer tag from the slot.
 	 */
 	slot->buftag = tag;
+	slot->shard_no = get_shard_number(&tag);
 	slot->my_ring_index = ring_index;

 	prefetch_do_request(slot, force_latest, force_lsn);
@@ -858,7 +881,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 	if (flush_every_n_requests > 0 &&
 		MyPState->ring_unused - MyPState->ring_flush >= flush_every_n_requests)
 	{
-		if (!page_server->flush())
+		if (!prefetch_flush_requests())
 		{
 			/* Prefetch set is reset in case of error, so we should try to register our request once again */
 			goto Retry;
@@ -873,11 +896,42 @@ static NeonResponse *
 page_server_request(void const *req)
 {
 	NeonResponse* resp;
+	BufferTag tag = {0};
+	shardno_t shard_no;
+
+	switch (((NeonRequest *) req)->tag)
+	{
+		case T_NeonExistsRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonExistsRequest *) req)->rinfo);
+			break;
+		case T_NeonNblocksRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonNblocksRequest *) req)->rinfo);
+			break;
+		case T_NeonDbSizeRequest:
+			NInfoGetDbOid(BufTagGetNRelFileInfo(tag)) = ((NeonDbSizeRequest *) req)->dbNode;
+			break;
+		case T_NeonGetPageRequest:
+			CopyNRelFileInfoToBufTag(tag, ((NeonGetPageRequest *) req)->rinfo);
+			tag.blockNum = ((NeonGetPageRequest *) req)->blkno;
+			break;
+		default:
+			elog(ERROR, "Unexpected request tag: %d", ((NeonRequest *) req)->tag);
+	}
+	shard_no = get_shard_number(&tag);
+
+	/*
+	 * TODO: temporary workaround - we stream all WAL only to shard 0, so metadata and forks other than main
+	 * should be requested from shard 0. We still need to call get_shard_number() to check if shard map is up-to-date
+	 */
+	if (((NeonRequest *) req)->tag != T_NeonGetPageRequest || ((NeonGetPageRequest *) req)->forknum != MAIN_FORKNUM)
+	{
+		shard_no = 0;
+	}
+
 	do {
-		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
-		MyPState->ring_flush = MyPState->ring_unused;
+		while (!page_server->send(shard_no, (NeonRequest *) req) || !page_server->flush(shard_no));
 		consume_prefetch_responses();
-		resp = page_server->receive();
+		resp = page_server->receive(shard_no);
 	} while (resp == NULL);
 	return resp;

@@ -2723,86 +2777,6 @@ smgr_init_neon(void)
 }


-static void
-neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
-{
-	BlockNumber relsize;
-	/* Extend the relation if we know its size */
-	if (get_cached_relsize(rinfo, forknum, &relsize))
-	{
-		if (relsize < blkno + 1)
-		{
-			update_cached_relsize(rinfo, forknum, blkno + 1);
-			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-		}
-	}
-	else
-	{
-		/*
-		 * Size was not cached. We populate the cache now, with the size of the
-		 * relation measured after this WAL record is applied.
-		 *
-		 * This length is later reused when we open the smgr to read the block,
-		 * which is fine and expected.
-		 */
-
-		NeonResponse *response;
-		NeonNblocksResponse *nbresponse;
-		NeonNblocksRequest request = {
-			.req = (NeonRequest) {
-				.lsn = end_recptr,
-				.latest = false,
-				.tag = T_NeonNblocksRequest,
-			},
-			.rinfo = rinfo,
-			.forknum = forknum,
-		};
-
-		response = page_server_request(&request);
-
-		Assert(response->tag == T_NeonNblocksResponse);
-		nbresponse = (NeonNblocksResponse *) response;
-
-		relsize = Max(nbresponse->n_blocks, blkno+1);
-
-		set_cached_relsize(rinfo, forknum, relsize);
-		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-
-		elog(SmgrTrace, "Set length to %d", relsize);
-	}
-}
-
-#define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)
-
-/*
- * TODO: May be it is better to make correspondent fgunctio from freespace.c public?
- */
-static BlockNumber
-get_fsm_physical_block(BlockNumber heapblk)
-{
-	BlockNumber pages;
-	int			leafno;
-	int			l;
-
-	/*
-	 * Calculate the logical page number of the first leaf page below the
-	 * given page.
-	 */
-	leafno = heapblk / SlotsPerFSMPage;
-
-	/* Count upper level nodes required to address the leaf page */
-	pages = 0;
-	for (l = 0; l < FSM_TREE_DEPTH; l++)
-	{
-		pages += leafno + 1;
-		leafno /= SlotsPerFSMPage;
-	}
-
-	/* Turn the page count into 0-based block number */
-	return pages - 1;
-}
-
-
 /*
 * Return whether we can skip the redo for this block.
 * 
@@ -2850,6 +2824,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	LWLock	   *partitionLock;
 	Buffer		buffer;
 	bool		no_redo_needed;
+	BlockNumber relsize;

 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
 		return true;
@@ -2899,10 +2874,49 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 	LWLockRelease(partitionLock);

-	neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
-	if (forknum == MAIN_FORKNUM)
+	/* Extend the relation if we know its size */
+	if (get_cached_relsize(rinfo, forknum, &relsize))
 	{
-		neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
+		if (relsize < blkno + 1)
+		{
+			update_cached_relsize(rinfo, forknum, blkno + 1);
+			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+		}
 	}
+	else
+	{
+		/*
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
+		 *
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
+		 */
+
+		NeonResponse *response;
+		NeonNblocksResponse *nbresponse;
+		NeonNblocksRequest request = {
+			.req = (NeonRequest) {
+				.lsn = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
+			},
+			.rinfo = rinfo,
+			.forknum = forknum,
+		};
+
+		response = page_server_request(&request);
+
+		Assert(response->tag == T_NeonNblocksResponse);
+		nbresponse = (NeonNblocksResponse *) response;
+
+		Assert(nbresponse->n_blocks > blkno);
+
+		set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
+		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+
+		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
+	}
+
 	return no_redo_needed;
 }
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -456,6 +456,7 @@ class NeonEnvBuilder:
        self.initial_tenant = initial_tenant or TenantId.generate()
        self.initial_timeline = initial_timeline or TimelineId.generate()
        self.enable_generations = False
+        self.initial_shard_count = 1
        self.scrub_on_exit = False
        self.test_output_dir = test_output_dir

@@ -497,7 +498,10 @@ class NeonEnvBuilder:
            f"Services started, creating initial tenant {env.initial_tenant} and its initial timeline"
        )
        initial_tenant, initial_timeline = env.neon_cli.create_tenant(
-            tenant_id=env.initial_tenant, conf=initial_tenant_conf, timeline_id=env.initial_timeline
+            tenant_id=env.initial_tenant,
+            conf=initial_tenant_conf,
+            timeline_id=env.initial_timeline,
+            shard_count=self.initial_shard_count,
        )
        assert env.initial_tenant == initial_tenant
        assert env.initial_timeline == initial_timeline
@@ -1144,6 +1148,7 @@ class NeonCli(AbstractNeonCli):
        timeline_id: Optional[TimelineId] = None,
        conf: Optional[Dict[str, str]] = None,
        set_default: bool = False,
+        shard_count: int = 1,
    ) -> Tuple[TenantId, TimelineId]:
        """
        Creates a new tenant, returns its id and its initial timeline's id.
@@ -1160,6 +1165,8 @@ class NeonCli(AbstractNeonCli):
            str(timeline_id),
            "--pg-version",
            self.env.pg_version,
+            "--shard-count",
+            str(shard_count),
        ]
        if conf is not None:
            args.extend(
@@ -1382,7 +1389,7 @@ class NeonCli(AbstractNeonCli):
        tenant_id: Optional[TenantId] = None,
        hot_standby: bool = False,
        lsn: Optional[Lsn] = None,
-        pageserver_id: Optional[int] = None,
+        pageserver_ids: Optional[list[int]] = None,
    ) -> "subprocess.CompletedProcess[str]":
        args = [
            "endpoint",
@@ -1404,8 +1411,10 @@ class NeonCli(AbstractNeonCli):
            args.append(endpoint_id)
        if hot_standby:
            args.extend(["--hot-standby", "true"])
-        if pageserver_id is not None:
-            args.extend(["--pageserver-id", str(pageserver_id)])
+        if pageserver_ids is not None:
+            args.extend(["--pageserver-id", ",".join(str(i) for i in pageserver_ids)])
+
+        log.info(f"endpoint_create: {args}")

        res = self.raw_cli(args)
        res.check_returncode()
@@ -2451,7 +2460,7 @@ class Endpoint(PgProtocol):
        hot_standby: bool = False,
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
-        pageserver_id: Optional[int] = None,
+        pageserver_ids: Optional[list[int]] = None,
    ) -> "Endpoint":
        """
        Create a new Postgres endpoint.
@@ -2473,7 +2482,7 @@ class Endpoint(PgProtocol):
            hot_standby=hot_standby,
            pg_port=self.pg_port,
            http_port=self.http_port,
-            pageserver_id=pageserver_id,
+            pageserver_ids=pageserver_ids,
        )
        path = Path("endpoints") / self.endpoint_id / "pgdata"
        self.pgdata_dir = os.path.join(self.env.repo_dir, path)
@@ -2609,7 +2618,7 @@ class Endpoint(PgProtocol):
        lsn: Optional[Lsn] = None,
        config_lines: Optional[List[str]] = None,
        remote_ext_config: Optional[str] = None,
-        pageserver_id: Optional[int] = None,
+        pageserver_ids: Optional[list[int]] = None,
    ) -> "Endpoint":
        """
        Create an endpoint, apply config, and start Postgres.
@@ -2624,7 +2633,7 @@ class Endpoint(PgProtocol):
            config_lines=config_lines,
            hot_standby=hot_standby,
            lsn=lsn,
-            pageserver_id=pageserver_id,
+            pageserver_ids=pageserver_ids,
        ).start(remote_ext_config=remote_ext_config)

        log.info(f"Postgres startup took {time.time() - started_at} seconds")
@@ -2660,7 +2669,7 @@ class EndpointFactory:
        hot_standby: bool = False,
        config_lines: Optional[List[str]] = None,
        remote_ext_config: Optional[str] = None,
-        pageserver_id: Optional[int] = None,
+        pageserver_ids: Optional[list[int]] = None,
    ) -> Endpoint:
        ep = Endpoint(
            self.env,
@@ -2678,7 +2687,7 @@ class EndpointFactory:
            config_lines=config_lines,
            lsn=lsn,
            remote_ext_config=remote_ext_config,
-            pageserver_id=pageserver_id,
+            pageserver_ids=pageserver_ids,
        )

    def create(
@@ -3162,7 +3171,15 @@ def check_restored_datadir_content(
            cur.execute("CHECKPOINT")

    # wait for pageserver to catch up
-    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
+    if len(env.pageservers) > 1:
+        # FIXME: wait_for_last_flush_lsn needs teaching about sharding: it tries to query
+        # LSNs for shard-naive TenantId
+        return
+    for pageserver in env.pageservers:
+        wait_for_last_flush_lsn(
+            env, endpoint, endpoint.tenant_id, timeline_id, pageserver_id=pageserver.id
+        )
+
    # stop postgres to ensure that files won't change
    endpoint.stop()

--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -260,14 +260,6 @@ class PageserverHttpClient(requests.Session):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
        self.verbose_error(res)

-    def tenant_reset(self, tenant_id: TenantId, drop_cache: bool):
-        params = {}
-        if drop_cache:
-            params["drop_cache"] = "true"
-
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
-        self.verbose_error(res)
-
    def tenant_delete(self, tenant_id: TenantId):
        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
        self.verbose_error(res)
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -3,24 +3,38 @@
 #
 from pathlib import Path

-from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+import pytest
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    check_restored_datadir_content,
+)


 # Run the main PostgreSQL regression tests, in src/test/regress.
 #
+@pytest.mark.parametrize("shard_count", [2])
 def test_pg_regress(
-    neon_simple_env: NeonEnv,
+    neon_env_builder: NeonEnvBuilder,
    test_output_dir: Path,
    pg_bin,
    capsys,
    base_dir: Path,
    pg_distrib_dir: Path,
+    shard_count: int,
 ):
-    env = neon_simple_env
+    neon_env_builder.enable_generations = True
+    neon_env_builder.initial_shard_count = shard_count
+    neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start()
+
+    for pageserver in env.pageservers:
+        # FIXME: attachment_service is not yet sharding aware, so generation validation is broken.
+        pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+
+    endpoint = env.endpoints.create_start("main", pageserver_ids=[p.id for p in env.pageservers])

-    env.neon_cli.create_branch("test_pg_regress", "empty")
    # Connect to postgres and create a database called "regression".
-    endpoint = env.endpoints.create_start("test_pg_regress")
    endpoint.safe_psql("CREATE DATABASE regression")

    # Create some local directories for pg_regress to run in.
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -1,29 +0,0 @@
-import random
-import time
-
-from fixtures.neon_fixtures import NeonEnv
-
-
-def test_physical_replication(neon_simple_env: NeonEnv):
-    env = neon_simple_env
-    n_records = 100000
-    with env.endpoints.create_start(
-        branch_name="main",
-        endpoint_id="primary",
-    ) as primary:
-        with primary.connect() as p_con:
-            with p_con.cursor() as p_cur:
-                p_cur.execute(
-                    "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))"
-                )
-        time.sleep(1)
-        with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
-            with primary.connect() as p_con:
-                with p_con.cursor() as p_cur:
-                    with secondary.connect() as s_con:
-                        with s_con.cursor() as s_cur:
-                            for pk in range(n_records):
-                                p_cur.execute("insert into t (pk) values (%s)", (pk,))
-                                s_cur.execute(
-                                    "select * from t where pk=%s", (random.randrange(1, n_records),)
-                                )
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -840,7 +840,7 @@ def test_compaction_waits_for_upload(
    ), "there should be one L1 after L0 => L1 compaction (without #5863 being fixed)"

    def layer_deletes_completed():
-        m = client.get_metric_value("pageserver_layer_completed_deletes_total")
+        m = client.get_metric_value("pageserver_layer_gcs_count_total", {"state": "completed"})
        if m is None:
            return 0
        return int(m)
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -0,0 +1,23 @@
+from fixtures.neon_fixtures import NeonEnvBuilder
+import pytest
+
+@pytest.mark.parametrize("shard_count", [2])
+@pytest.mark.timeout(1000)
+def test_sharding(neon_env_builder: NeonEnvBuilder, shard_count: int):
+    neon_env_builder.enable_generations = True
+    neon_env_builder.initial_shard_count = shard_count
+    neon_env_builder.num_pageservers = shard_count
+    env = neon_env_builder.init_start()
+
+    for pageserver in env.pageservers:
+        # FIXME: attachment_service is not yet sharding aware, so generation validation is broken.
+        pageserver.allowed_errors.append(".*Dropped remote consistent LSN updates.*")
+        pageserver.allowed_errors.append(".*Dropping stale deletions for tenant.*")
+
+    endpoint = env.endpoints.create_start("main", pageserver_ids=[p.id for p in env.pageservers])
+    with endpoint.cursor() as cur:
+        cur.execute("SET statement_timeout=0") # disable statement timeout
+        cur.execute("create table t(t bigint, payload text default repeat('?',200))")
+        cur.execute("insert into t values(generate_series(1,10000000))")
+        cur.execute("select count(*) from t")
+        assert cur.fetchone()[0] == 10000000
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,5 +1,4 @@
 import asyncio
-import enum
 import random
 import time
 from threading import Thread
@@ -52,20 +51,11 @@ def do_gc_target(
        log.info("gc http thread returning")


-class ReattachMode(str, enum.Enum):
-    REATTACH_EXPLICIT = "explicit"
-    REATTACH_RESET = "reset"
-    REATTACH_RESET_DROP = "reset"
-
-
 # Basic detach and re-attach test
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-@pytest.mark.parametrize(
-    "mode",
-    [ReattachMode.REATTACH_EXPLICIT, ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP],
-)
 def test_tenant_reattach(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, mode: str
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
 ):
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

@@ -110,15 +100,8 @@ def test_tenant_reattach(
        ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
    )

-    if mode == ReattachMode.REATTACH_EXPLICIT:
-        # Explicitly detach then attach the tenant as two separate API calls
-        pageserver_http.tenant_detach(tenant_id)
-        pageserver_http.tenant_attach(tenant_id)
-    elif mode in (ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP):
-        # Use the reset API to detach/attach in one shot
-        pageserver_http.tenant_reset(tenant_id, mode == ReattachMode.REATTACH_RESET_DROP)
-    else:
-        raise NotImplementedError(mode)
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)

    time.sleep(1)  # for metrics propagation
Author	SHA1	Message	Date
Konstantin Knizhnik	66fdb54d8d	Merge with #5837	2023-12-05 12:05:27 +00:00
Konstantin Knizhnik	21c3c55e6b	Fix flushing prefetch requests in page_server_request	2023-12-05 12:05:27 +00:00
John Spray	2c21c74ff4	DNM: script for sharding demo	2023-12-05 12:05:27 +00:00
John Spray	2d0930f622	neon_local: basic sharding support	2023-12-05 12:05:27 +00:00
John Spray	1518675bac	postgres: use modified hash	2023-12-05 12:05:27 +00:00
Konstantin Knizhnik	e3c6fc3e51	compute: Add support for PS shardoing in compute	2023-12-05 12:05:27 +00:00
Konstantin Knizhnik	039f96c8a6	Add simple test for sharding	2023-12-05 12:05:27 +00:00
John Spray	8d50593e17	tests: enable running test_pg_regress with sharding	2023-12-05 12:05:27 +00:00
John Spray	f7795ee2a5	pageserver: filter WAL by ShardIdentity	2023-12-05 12:03:33 +00:00