Do not round the LSN on explicit timeline creation in the safekeeper.

2026-05-25 09:00:37 +00:00 · 2023-11-30 21:33:10 +03:00
80 changed files with 709 additions and 1872 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3643,7 +3643,6 @@ dependencies = [
 "serde",
 "serde_json",
 "sha2",
- "smol_str",
 "socket2 0.5.3",
 "sync_wrapper",
 "task-local-extensions",
@@ -4710,15 +4709,6 @@ version = "1.11.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "62bb4feee49fdd9f707ef802e22365a35de4b7b299de4763d44bfea899442ff9"

-[[package]]
-name = "smol_str"
-version = "0.2.0"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "74212e6bbe9a4352329b2f68ba3130c15a3f26fe88ff22dbdc6cdd58fa85e99c"
-dependencies = [
- "serde",
-]
-
 [[package]]
 name = "socket2"
 version = "0.4.9"
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -132,7 +132,6 @@ serde_assert = "0.5.0"
 sha2 = "0.10.2"
 signal-hook = "0.3"
 smallvec = "1.11"
-smol_str = { version = "0.2.0", features = ["serde"] }
 socket2 = "0.5"
 strum = "0.24"
 strum_macros = "0.24"
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -387,10 +387,18 @@ COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 ARG PG_VERSION
 ENV PATH "/usr/local/pgsql/bin:$PATH"

-RUN apt-get update && \
+RUN case "${PG_VERSION}" in \
+      "v14" | "v15") \
+        export TIMESCALEDB_VERSION=2.10.1 \
+        export TIMESCALEDB_CHECKSUM=6fca72a6ed0f6d32d2b3523951ede73dc5f9b0077b38450a029a5f411fdb8c73 \
+        ;; \
+      *) \
+        echo "TimescaleDB not supported on this PostgreSQL version. See https://github.com/timescale/timescaledb/issues/5752" && exit 0;; \
+    esac && \
+    apt-get update && \
    apt-get install -y cmake && \
-    wget https://github.com/timescale/timescaledb/archive/refs/tags/2.13.0.tar.gz -O timescaledb.tar.gz && \
-    echo "584a351c7775f0e067eaa0e7277ea88cab9077cc4c455cbbf09a5d9723dce95d timescaledb.tar.gz" | sha256sum --check && \
+    wget https://github.com/timescale/timescaledb/archive/refs/tags/${TIMESCALEDB_VERSION}.tar.gz -O timescaledb.tar.gz && \
+    echo "${TIMESCALEDB_CHECKSUM} timescaledb.tar.gz" | sha256sum --check && \
    mkdir timescaledb-src && cd timescaledb-src && tar xvzf ../timescaledb.tar.gz --strip-components=1 -C . && \
    ./bootstrap -DSEND_TELEMETRY_DEFAULT:BOOL=OFF -DUSE_TELEMETRY:BOOL=OFF -DAPACHE_ONLY:BOOL=ON -DCMAKE_BUILD_TYPE=Release && \
    cd build && \
@@ -721,7 +729,8 @@ RUN wget https://github.com/eulerto/wal2json/archive/refs/tags/wal2json_2_5.tar.
    echo "b516653575541cf221b99cf3f8be9b6821f6dbcfc125675c85f35090f824f00e wal2json_2_5.tar.gz" | sha256sum --check && \
    mkdir wal2json-src && cd wal2json-src && tar xvzf ../wal2json_2_5.tar.gz --strip-components=1 -C . && \
    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install
+    make -j $(getconf _NPROCESSORS_ONLN) install && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/wal2json.control

 #########################################################################################
 #
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -274,13 +274,7 @@ fn main() -> Result<()> {
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
-            // Notify others that Postgres failed to start. In case of configuring the
-            // empty compute, it's likely that API handler is still waiting for compute
-            // state change. With this we will notify it that compute is in Failed state,
-            // so control plane will know about it earlier and record proper error instead
-            // of timeout.
-            compute.state_changed.notify_all();
-            drop(state); // unlock
+            drop(state);
            delay_exit = true;
            None
        }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -22,7 +22,7 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

 use remote_storage::{DownloadError, RemotePath};
@@ -277,17 +277,6 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }

 impl ComputeNode {
-    /// Check that compute node has corresponding feature enabled.
-    pub fn has_feature(&self, feature: ComputeFeature) -> bool {
-        let state = self.state.lock().unwrap();
-
-        if let Some(s) = state.pspec.as_ref() {
-            s.spec.features.contains(&feature)
-        } else {
-            false
-        }
-    }
-
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
        state.status = status;
@@ -739,12 +728,7 @@ impl ComputeNode {

        // Write new config
        let pgdata_path = Path::new(&self.pgdata);
-        let postgresql_conf_path = pgdata_path.join("postgresql.conf");
-        config::write_postgres_conf(&postgresql_conf_path, &spec, None)?;
-        // temporarily reset max_cluster_size in config
-        // to avoid the possibility of hitting the limit, while we are reconfiguring:
-        // creating new extensions, roles, etc...
-        config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
+        config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), &spec, None)?;
        self.pg_reload_conf()?;

        let mut client = Client::connect(self.connstr.as_str(), NoTls)?;
@@ -765,10 +749,6 @@ impl ComputeNode {
        // 'Close' connection
        drop(client);

-        // reset max_cluster_size in config back to original value and reload config
-        config::compute_ctl_temp_override_remove(pgdata_path)?;
-        self.pg_reload_conf()?;
-
        let unknown_op = "unknown".to_string();
        let op_id = spec.operation_uuid.as_ref().unwrap_or(&unknown_op);
        info!(
@@ -829,17 +809,7 @@ impl ComputeNode {

        let config_time = Utc::now();
        if pspec.spec.mode == ComputeMode::Primary && !pspec.spec.skip_pg_catalog_updates {
-            let pgdata_path = Path::new(&self.pgdata);
-            // temporarily reset max_cluster_size in config
-            // to avoid the possibility of hitting the limit, while we are applying config:
-            // creating new extensions, roles, etc...
-            config::compute_ctl_temp_override_create(pgdata_path, "neon.max_cluster_size=-1")?;
-            self.pg_reload_conf()?;
-
            self.apply_config(&compute_state)?;
-
-            config::compute_ctl_temp_override_remove(pgdata_path)?;
-            self.pg_reload_conf()?;
        }

        let startup_end_time = Utc::now();
--- a/compute_tools/src/config.rs
+++ b/compute_tools/src/config.rs
@@ -93,25 +93,5 @@ pub fn write_postgres_conf(
        writeln!(file, "neon.extension_server_port={}", port)?;
    }

-    // This is essential to keep this line at the end of the file,
-    // because it is intended to override any settings above.
-    writeln!(file, "include_if_exists = 'compute_ctl_temp_override.conf'")?;
-
-    Ok(())
-}
-
-/// create file compute_ctl_temp_override.conf in pgdata_dir
-/// add provided options to this file
-pub fn compute_ctl_temp_override_create(pgdata_path: &Path, options: &str) -> Result<()> {
-    let path = pgdata_path.join("compute_ctl_temp_override.conf");
-    let mut file = File::create(path)?;
-    write!(file, "{}", options)?;
-    Ok(())
-}
-
-/// remove file compute_ctl_temp_override.conf in pgdata_dir
-pub fn compute_ctl_temp_override_remove(pgdata_path: &Path) -> Result<()> {
-    let path = pgdata_path.join("compute_ctl_temp_override.conf");
-    std::fs::remove_file(path)?;
    Ok(())
 }
--- a/compute_tools/src/http/api.rs
+++ b/compute_tools/src/http/api.rs
@@ -227,7 +227,7 @@ async fn handle_configure_request(

        let parsed_spec = match ParsedSpec::try_from(spec) {
            Ok(ps) => ps,
-            Err(msg) => return Err((msg, StatusCode::BAD_REQUEST)),
+            Err(msg) => return Err((msg, StatusCode::PRECONDITION_FAILED)),
        };

        // XXX: wrap state update under lock in code blocks. Otherwise,
--- a/compute_tools/src/http/openapi_spec.yaml
+++ b/compute_tools/src/http/openapi_spec.yaml
@@ -156,17 +156,17 @@ paths:
                description: Error text or 'OK' if download succeeded.
                example: "OK"
        400:
-          description: Request is invalid.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
+          description: Request is invalid.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"
        500:
-          description: Extension download request failed.
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/GenericError"
+          description: Extension download request failed.
+          content:
+            application/json:
+              schema:
+                $ref: "#/components/schemas/GenericError"

 components:
  securitySchemes:
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -118,6 +118,19 @@ pub fn get_spec_from_control_plane(
    spec
 }

+/// Applies `spec` to the data directory at `pgdata_path`, propagating any error:
+/// - writes the full config to `postgresql.conf`, completely replacing the file;
+/// - runs `update_pg_hba` so that `pg_hba.conf` allows external connections.
+pub fn handle_configuration(spec: &ComputeSpec, pgdata_path: &Path) -> Result<()> {
+    // `postgresql.conf` is no longer shipped inside the basebackup, so the
+    // whole file is always written anew here instead of being patched.
+    config::write_postgres_conf(&pgdata_path.join("postgresql.conf"), spec, None)?;
+
+    update_pg_hba(pgdata_path)?;
+
+    Ok(())
+}
+
 /// Check `pg_hba.conf` and update if needed to allow external connections.
 pub fn update_pg_hba(pgdata_path: &Path) -> Result<()> {
    // XXX: consider making it a part of spec.json
--- a/control_plane/src/bin/neon_local.rs
+++ b/control_plane/src/bin/neon_local.rs
@@ -415,7 +415,6 @@ fn handle_tenant(tenant_match: &ArgMatches, env: &mut local_env::LocalEnv) -> an
                None,
                None,
                Some(pg_version),
-                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;
            let last_record_lsn = timeline_info.last_record_lsn;
@@ -496,7 +495,6 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                None,
                None,
                Some(pg_version),
-                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;

@@ -584,7 +582,6 @@ fn handle_timeline(timeline_match: &ArgMatches, env: &mut local_env::LocalEnv) -
                start_lsn,
                Some(ancestor_timeline_id),
                None,
-                None,
            )?;
            let new_timeline_id = timeline_info.timeline_id;

--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -519,7 +519,6 @@ impl Endpoint {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
-            features: vec![],
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
--- a/control_plane/src/pageserver.rs
+++ b/control_plane/src/pageserver.rs
@@ -565,7 +565,6 @@ impl PageServerNode {
        ancestor_start_lsn: Option<Lsn>,
        ancestor_timeline_id: Option<TimelineId>,
        pg_version: Option<u32>,
-        existing_initdb_timeline_id: Option<TimelineId>,
    ) -> anyhow::Result<TimelineInfo> {
        // If timeline ID was not specified, generate one
        let new_timeline_id = new_timeline_id.unwrap_or(TimelineId::generate());
@@ -579,7 +578,6 @@ impl PageServerNode {
            ancestor_start_lsn,
            ancestor_timeline_id,
            pg_version,
-            existing_initdb_timeline_id,
        })
        .send()?
        .error_from_body()?
--- a/docs/rfcs/027-pageserver-wal-disaster-recovery.md
+++ b/docs/rfcs/027-pageserver-wal-disaster-recovery.md
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -26,13 +26,6 @@ pub struct ComputeSpec {
    // but we don't use it for anything. Serde will ignore missing fields when
    // deserializing it.
    pub operation_uuid: Option<String>,
-
-    /// Compute features to enable. These feature flags are provided, when we
-    /// know all the details about client's compute, so they cannot be used
-    /// to change `Empty` compute behavior.
-    #[serde(default)]
-    pub features: Vec<ComputeFeature>,
-
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
@@ -75,19 +68,6 @@ pub struct ComputeSpec {
    pub remote_extensions: Option<RemoteExtSpec>,
 }

-/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
-#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "snake_case")]
-pub enum ComputeFeature {
-    // XXX: Add more feature flags here.
-
-    // This is a special feature flag that is used to represent unknown feature flags.
-    // Basically all unknown to enum flags are represented as this one. See unit test
-    // `parse_unknown_features()` for more details.
-    #[serde(other)]
-    UnknownFeature,
-}
-
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -249,10 +229,7 @@ mod tests {
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
-
-        // Features list defaults to empty vector.
-        assert!(spec.features.is_empty());
+        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
    }

    #[test]
@@ -264,22 +241,4 @@ mod tests {
        ob.insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
    }
-
-    #[test]
-    fn parse_unknown_features() {
-        // Test that unknown feature flags do not cause any errors.
-        let file = File::open("tests/cluster_spec.json").unwrap();
-        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
-        let ob = json.as_object_mut().unwrap();
-
-        // Add unknown feature flags.
-        let features = vec!["foo_bar_feature", "baz_feature"];
-        ob.insert("features".into(), features.into());
-
-        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
-
-        assert!(spec.features.len() == 2);
-        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
-        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
-    }
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -140,7 +140,3 @@ impl Key {
        })
    }
 }
-
-pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
-}
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -179,8 +179,6 @@ pub struct TimelineCreateRequest {
    #[serde(default)]
    pub ancestor_timeline_id: Option<TimelineId>,
    #[serde(default)]
-    pub existing_initdb_timeline_id: Option<TimelineId>,
-    #[serde(default)]
    pub ancestor_start_lsn: Option<Lsn>,
    pub pg_version: Option<u32>,
 }
@@ -384,9 +382,7 @@ pub struct TimelineInfo {
    /// The LSN that we are advertizing to safekeepers
    pub remote_consistent_lsn_visible: Lsn,

-    pub current_logical_size: u64,
-    pub current_logical_size_is_accurate: bool,
-
+    pub current_logical_size: Option<u64>, // is None when timeline is Unloaded
    /// Sum of the size of all layer files.
    /// If a layer is present in both local FS and S3, it counts only once.
    pub current_physical_size: Option<u64>, // is None when timeline is Unloaded
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,6 +1,5 @@
 use std::{ops::RangeInclusive, str::FromStr};

-use crate::key::{is_rel_block_key, Key};
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
 use thiserror;
@@ -303,8 +302,6 @@ pub struct ShardStripeSize(pub u32);
 pub struct ShardLayout(u8);

 const LAYOUT_V1: ShardLayout = ShardLayout(1);
-/// ShardIdentity uses a magic layout value to indicate if it is unusable
-const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);

 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
@@ -313,10 +310,10 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 /// to resolve a key to a shard, and then check whether that shard is ==self.
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardIdentity {
+    pub layout: ShardLayout,
    pub number: ShardNumber,
    pub count: ShardCount,
-    stripe_size: ShardStripeSize,
-    layout: ShardLayout,
+    pub stripe_size: ShardStripeSize,
 }

 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
@@ -342,22 +339,6 @@ impl ShardIdentity {
        }
    }

-    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
-    /// which are constructed in code paths that don't have access to proper configuration.
-    ///
-    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
-    /// Enforcement is via assertions, to avoid making our interface fallible for this
-    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
-    /// state, and by extension to avoid trying to do any page->shard resolution.
-    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
-        Self {
-            number,
-            count,
-            layout: LAYOUT_BROKEN,
-            stripe_size: DEFAULT_STRIPE_SIZE,
-        }
-    }
-
    pub fn is_unsharded(&self) -> bool {
        self.number == ShardNumber(0) && self.count == ShardCount(0)
    }
@@ -384,33 +365,6 @@ impl ShardIdentity {
            })
        }
    }
-
-    fn is_broken(&self) -> bool {
-        self.layout == LAYOUT_BROKEN
-    }
-
-    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
-        assert!(!self.is_broken());
-        key_to_shard_number(self.count, self.stripe_size, key)
-    }
-
-    /// Return true if the key should be ingested by this shard
-    pub fn is_key_local(&self, key: &Key) -> bool {
-        assert!(!self.is_broken());
-        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
-            true
-        } else {
-            key_to_shard_number(self.count, self.stripe_size, key) == self.number
-        }
-    }
-
-    pub fn shard_slug(&self) -> String {
-        if self.count > ShardCount(0) {
-            format!("-{:02x}{:02x}", self.number.0, self.count.0)
-        } else {
-            String::new()
-        }
-    }
 }

 impl Serialize for ShardIndex {
@@ -484,65 +438,6 @@ impl<'de> Deserialize<'de> for ShardIndex {
    }
 }

-/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
-/// in order to be able to serve basebackup requests without peer communication).
-fn key_is_shard0(key: &Key) -> bool {
-    // To decide what to shard out to shards >0, we apply a simple rule that only
-    // relation pages are distributed to shards other than shard zero. Everything else gets
-    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
-    // requests, and any request other than those for particular blocks in relations.
-    //
-    // In this condition:
-    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
-    // all metadata.
-    // - field6 is set to -1 for relation size pages.
-    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
-}
-
-/// Provide the same result as the function in postgres `hashfn.h` with the same name
-fn murmurhash32(mut h: u32) -> u32 {
-    h ^= h >> 16;
-    h = h.wrapping_mul(0x85ebca6b);
-    h ^= h >> 13;
-    h = h.wrapping_mul(0xc2b2ae35);
-    h ^= h >> 16;
-    h
-}
-
-/// Provide the same result as the function in postgres `hashfn.h` with the same name
-fn hash_combine(mut a: u32, mut b: u32) -> u32 {
-    b = b.wrapping_add(0x9e3779b9);
-    b = b.wrapping_add(a << 6);
-    b = b.wrapping_add(a >> 2);
-
-    a ^= b;
-    a
-}
-
-/// Where a Key is to be distributed across shards, select the shard.  This function
-/// does not account for keys that should be broadcast across shards.
-///
-/// The hashing in this function must exactly match what we do in postgres smgr
-/// code.  The resulting distribution of pages is intended to preserve locality within
-/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
-/// distributing data pseudo-randomly.
-///
-/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
-/// and will be handled at higher levels when shards are split.
-fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
-    // Fast path for un-sharded tenants or broadcast keys
-    if count < ShardCount(2) || key_is_shard0(key) {
-        return ShardNumber(0);
-    }
-
-    // relNode
-    let mut hash = murmurhash32(key.field4);
-    // blockNum/stripe size
-    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
-
-    ShardNumber((hash % count.0 as u32) as u8)
-}
-
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
@@ -714,29 +609,4 @@ mod tests {

        Ok(())
    }
-
-    // These are only smoke tests to spot check that our implementation doesn't
-    // deviate from a few examples values: not aiming to validate the overall
-    // hashing algorithm.
-    #[test]
-    fn murmur_hash() {
-        assert_eq!(murmurhash32(0), 0);
-
-        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
-    }
-
-    #[test]
-    fn shard_mapping() {
-        let key = Key {
-            field1: 0x00,
-            field2: 0x67f,
-            field3: 0x5,
-            field4: 0x400c,
-            field5: 0x00,
-            field6: 0x7d06,
-        };
-
-        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
-        assert_eq!(shard, ShardNumber(8));
-    }
 }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -289,10 +289,10 @@ impl FeStartupPacket {
        // We shouldn't advance `buf` as probably full message is not there yet,
        // so can't directly use Bytes::get_u32 etc.
        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-        // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+        // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
        // which is less readable
        #[allow(clippy::manual_range_contains)]
-        if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
+        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
            return Err(ProtocolError::Protocol(format!(
                "invalid startup packet message length {}",
                len
@@ -975,10 +975,4 @@ mod tests {
        let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
-
-    #[test]
-    fn parse_fe_startup_packet_regression() {
-        let data = [0, 0, 0, 7, 0, 0, 0, 0];
-        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
-    }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -378,7 +378,7 @@ impl RemoteStorage for S3Bucket {
            let empty = Vec::new();
            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);

-            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());

            for object in keys {
                let object_path = object.key().expect("response does not contain a key");
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -152,16 +152,3 @@ impl Debug for Generation {
        }
    }
 }
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn generation_gt() {
-        // Important that a None generation compares less than a valid one, during upgrades from
-        // pre-generation systems.
-        assert!(Generation::none() < Generation::new(0));
-        assert!(Generation::none() < Generation::new(1));
-    }
-}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -402,11 +402,15 @@ fn start_pageserver(
    let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
    let (init_done_tx, init_done_rx) = utils::completion::channel();

+    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
+
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

    let order = pageserver::InitializationOrder {
        initial_tenant_load_remote: Some(init_done_tx),
        initial_tenant_load: Some(init_remote_done_tx),
+        initial_logical_size_can_start: init_done_rx.clone(),
+        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

@@ -460,7 +464,7 @@ fn start_pageserver(
            });

            let WaitForPhaseResult {
-                timeout_remaining: _timeout,
+                timeout_remaining: timeout,
                skipped: init_load_skipped,
            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

@@ -468,6 +472,26 @@ fn start_pageserver(

            scopeguard::ScopeGuard::into_inner(guard);

+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial logical sizes completed")
+            });
+
+            let logical_sizes_done = std::pin::pin!(async {
+                init_logical_size_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_logical_sizes",
+                    "Initial logical sizes completed",
+                );
+            });
+
+            let WaitForPhaseResult {
+                timeout_remaining: _,
+                skipped: logical_sizes_skipped,
+            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
+
+            scopeguard::ScopeGuard::into_inner(guard);
+
            // allow background jobs to start: we either completed prior stages, or they reached timeout
            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
            // because things like consumption metrics for billing are blocked by this barrier.
@@ -490,6 +514,9 @@ fn start_pageserver(
            if let Some(f) = init_load_skipped {
                f.await;
            }
+            if let Some(f) = logical_sizes_skipped {
+                f.await;
+            }
            scopeguard::ScopeGuard::into_inner(guard);

            startup_checkpoint(started_startup_at, "complete", "Startup complete");
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -855,8 +855,7 @@ impl PageServerConf {

    #[cfg(test)]
    pub fn test_repo_dir(test_name: &str) -> Utf8PathBuf {
-        let test_output_dir = std::env::var("TEST_OUTPUT").unwrap_or("../tmp_check".into());
-        Utf8PathBuf::from(format!("{test_output_dir}/test_{test_name}"))
+        Utf8PathBuf::from(format!("../tmp_check/test_{test_name}"))
    }

    pub fn dummy_conf(repo_dir: Utf8PathBuf) -> Self {
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -1,8 +1,8 @@
-use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogicalSize};
+use crate::context::RequestContext;
+use anyhow::Context;
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
-use pageserver_api::shard::ShardNumber;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
    id::{TenantId, TimelineId},
@@ -229,11 +229,6 @@ where
    while let Some((tenant_id, tenant)) = tenants.next().await {
        let mut tenant_resident_size = 0;

-        // Sharded tenants report all consumption metrics from shard zero
-        if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
-            continue;
-        }
-
        for timeline in tenant.list_timelines() {
            let timeline_id = timeline.timeline_id;

@@ -357,16 +352,13 @@ impl TimelineSnapshot {

            let current_exact_logical_size = {
                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
-                let size = span.in_scope(|| {
-                    t.get_current_logical_size(
-                        crate::tenant::timeline::GetLogicalSizePriority::Background,
-                        ctx,
-                    )
-                });
-                match size {
+                let res = span
+                    .in_scope(|| t.get_current_logical_size(ctx))
+                    .context("get_current_logical_size");
+                match res? {
                    // Only send timeline logical size when it is fully calculated.
-                    CurrentLogicalSize::Exact(ref size) => Some(size.into()),
-                    CurrentLogicalSize::Approximate(_) => None,
+                    (size, is_exact) if is_exact => Some(size),
+                    (_, _) => None,
                }
            };

--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -1028,9 +1028,6 @@ paths:
                  format: hex
                pg_version:
                  type: integer
-                existing_initdb_timeline_id:
-                  type: string
-                  format: hex
      responses:
        "201":
          description: TimelineInfo
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -338,8 +338,13 @@ async fn build_timeline_info_common(
        Lsn(0) => None,
        lsn @ Lsn(_) => Some(lsn),
    };
-    let current_logical_size =
-        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
+    let current_logical_size = match timeline.get_current_logical_size(ctx) {
+        Ok((size, _)) => Some(size),
+        Err(err) => {
+            error!("Timeline info creation failed to get current logical size: {err:?}");
+            None
+        }
+    };
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn_projected = timeline
@@ -363,11 +368,7 @@ async fn build_timeline_info_common(
        last_record_lsn,
        prev_record_lsn: Some(timeline.get_prev_record_lsn()),
        latest_gc_cutoff_lsn: *timeline.get_latest_gc_cutoff_lsn(),
-        current_logical_size: current_logical_size.size_dont_care_about_accuracy(),
-        current_logical_size_is_accurate: match current_logical_size.accuracy() {
-            tenant::timeline::logical_size::Accuracy::Approximate => false,
-            tenant::timeline::logical_size::Accuracy::Exact => true,
-        },
+        current_logical_size,
        current_physical_size,
        current_logical_size_non_incremental: None,
        timeline_dir_layer_file_size_sum: None,
@@ -440,7 +441,6 @@ async fn timeline_create_handler(
            request_data.ancestor_timeline_id.map(TimelineId::from),
            request_data.ancestor_start_lsn,
            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-            request_data.existing_initdb_timeline_id,
            state.broker_client.clone(),
            &ctx,
        )
@@ -709,26 +709,6 @@ async fn tenant_detach_handler(
    json_response(StatusCode::OK, ())
 }

-async fn tenant_reset_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-    let state = get_state(&request);
-    state
-        .tenant_manager
-        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn tenant_load_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -1848,9 +1828,6 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/detach", |r| {
            api_handler(r, tenant_detach_handler)
        })
-        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
-            api_handler(r, tenant_reset_handler)
-        })
        .post("/v1/tenant/:tenant_id/load", |r| {
            api_handler(r, tenant_load_handler)
        })
--- a/pageserver/src/import_datadir.rs
+++ b/pageserver/src/import_datadir.rs
@@ -7,13 +7,12 @@ use std::pin::Pin;
 use std::task::{self, Poll};

 use anyhow::{bail, ensure, Context, Result};
-use async_compression::tokio::bufread::ZstdDecoder;
 use async_compression::{tokio::write::ZstdEncoder, zstd::CParameter, Level};
 use bytes::Bytes;
 use camino::Utf8Path;
 use futures::StreamExt;
 use nix::NixPath;
-use tokio::io::{AsyncBufRead, AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
+use tokio::io::{AsyncRead, AsyncReadExt, AsyncWrite, AsyncWriteExt};
 use tokio_tar::Archive;
 use tokio_tar::Builder;
 use tokio_tar::HeaderMode;
@@ -733,13 +732,3 @@ pub async fn create_tar_zst(pgdata_path: &Utf8Path) -> Result<Vec<u8>> {
    }
    Ok(compressed.buf)
 }
-
-pub async fn extract_tar_zst(
-    pgdata_path: &Utf8Path,
-    tar_zst: impl AsyncBufRead + Unpin,
-) -> Result<()> {
-    let tar = Box::pin(ZstdDecoder::new(tar_zst));
-    let mut archive = Archive::new(tar);
-    archive.unpack(pgdata_path).await?;
-    Ok(())
-}
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -186,6 +186,13 @@ pub struct InitializationOrder {
    /// Each initial tenant load task carries this until completion.
    pub initial_tenant_load: Option<utils::completion::Completion>,

+    /// Barrier for when we can start initial logical size calculations.
+    pub initial_logical_size_can_start: utils::completion::Barrier,
+
+    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
+    /// attempt. It is important to drop this once the attempt has completed.
+    pub initial_logical_size_attempt: Option<utils::completion::Completion>,
+
    /// Barrier for when we can start any background jobs.
    ///
    /// This can be broken up later on, but right now there is just one class of a background job.
@@ -205,7 +212,7 @@ async fn timed<Fut: std::future::Future>(
    match tokio::time::timeout(warn_at, &mut fut).await {
        Ok(ret) => {
            tracing::info!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed"
            );
@@ -213,7 +220,7 @@ async fn timed<Fut: std::future::Future>(
        }
        Err(_) => {
            tracing::info!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "still waiting, taking longer than expected..."
            );
@@ -222,7 +229,7 @@ async fn timed<Fut: std::future::Future>(

            // this has a global allowed_errors
            tracing::warn!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed, took longer than expected"
            );
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -403,129 +403,6 @@ static CURRENT_LOGICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
    .expect("failed to define current logical size metric")
 });

-pub(crate) mod initial_logical_size {
-    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
-    use once_cell::sync::Lazy;
-
-    pub(crate) struct StartCalculation(IntCounterVec);
-    pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
-        StartCalculation(
-            register_int_counter_vec!(
-                "pageserver_initial_logical_size_start_calculation",
-                "Incremented each time we start an initial logical size calculation attempt. \
-                 The `circumstances` label provides some additional details.",
-                &["attempt", "circumstances"]
-            )
-            .unwrap(),
-        )
-    });
-
-    struct DropCalculation {
-        first: IntCounter,
-        retry: IntCounter,
-    }
-
-    static DROP_CALCULATION: Lazy<DropCalculation> = Lazy::new(|| {
-        let vec = register_int_counter_vec!(
-            "pageserver_initial_logical_size_drop_calculation",
-            "Incremented each time we abort a started size calculation attmpt.",
-            &["attempt"]
-        )
-        .unwrap();
-        DropCalculation {
-            first: vec.with_label_values(&["first"]),
-            retry: vec.with_label_values(&["retry"]),
-        }
-    });
-
-    pub(crate) struct Calculated {
-        pub(crate) births: IntCounter,
-        pub(crate) deaths: IntCounter,
-    }
-
-    pub(crate) static CALCULATED: Lazy<Calculated> = Lazy::new(|| Calculated {
-        births: register_int_counter!(
-            "pageserver_initial_logical_size_finish_calculation",
-            "Incremented every time we finish calculation of initial logical size.\
-             If everything is working well, this should happen at most once per Timeline object."
-        )
-        .unwrap(),
-        deaths: register_int_counter!(
-            "pageserver_initial_logical_size_drop_finished_calculation",
-            "Incremented when we drop a finished initial logical size calculation result.\
-             Mainly useful to turn pageserver_initial_logical_size_finish_calculation into a gauge."
-        )
-        .unwrap(),
-    });
-
-    pub(crate) struct OngoingCalculationGuard {
-        inc_drop_calculation: Option<IntCounter>,
-    }
-
-    #[derive(strum_macros::IntoStaticStr)]
-    pub(crate) enum StartCircumstances {
-        EmptyInitial,
-        SkippedConcurrencyLimiter,
-        AfterBackgroundTasksRateLimit,
-    }
-
-    impl StartCalculation {
-        pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
-            let circumstances_label: &'static str = circumstances.into();
-            self.0.with_label_values(&["first", circumstances_label]);
-            OngoingCalculationGuard {
-                inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
-            }
-        }
-        pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
-            let circumstances_label: &'static str = circumstances.into();
-            self.0.with_label_values(&["retry", circumstances_label]);
-            OngoingCalculationGuard {
-                inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
-            }
-        }
-    }
-
-    impl Drop for OngoingCalculationGuard {
-        fn drop(&mut self) {
-            if let Some(counter) = self.inc_drop_calculation.take() {
-                counter.inc();
-            }
-        }
-    }
-
-    impl OngoingCalculationGuard {
-        pub(crate) fn calculation_result_saved(mut self) -> FinishedCalculationGuard {
-            drop(self.inc_drop_calculation.take());
-            CALCULATED.births.inc();
-            FinishedCalculationGuard {
-                inc_on_drop: CALCULATED.deaths.clone(),
-            }
-        }
-    }
-
-    pub(crate) struct FinishedCalculationGuard {
-        inc_on_drop: IntCounter,
-    }
-
-    impl Drop for FinishedCalculationGuard {
-        fn drop(&mut self) {
-            self.inc_on_drop.inc();
-        }
-    }
-
-    // context: https://github.com/neondatabase/neon/issues/5963
-    pub(crate) static TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE: Lazy<IntCounter> =
-        Lazy::new(|| {
-            register_int_counter!(
-                "pageserver_initial_logical_size_timelines_where_walreceiver_got_approximate_size",
-                "Counter for the following event: walreceiver calls\
-                 Timeline::get_current_logical_size() and it returns `Approximate` for the first time."
-            )
-            .unwrap()
-        });
-}
-
 pub(crate) static TENANT_STATE_METRIC: Lazy<UIntGaugeVec> = Lazy::new(|| {
    register_uint_gauge_vec!(
        "pageserver_tenant_states_count",
@@ -1388,8 +1265,6 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =
 pub(crate) struct WalRedoProcessCounters {
    pub(crate) started: IntCounter,
    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
-    pub(crate) active_stderr_logger_tasks_started: IntCounter,
-    pub(crate) active_stderr_logger_tasks_finished: IntCounter,
 }

 #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
@@ -1413,19 +1288,6 @@ impl Default for WalRedoProcessCounters {
            &["cause"],
        )
        .unwrap();
-
-        let active_stderr_logger_tasks_started = register_int_counter!(
-            "pageserver_walredo_stderr_logger_tasks_started_total",
-            "Number of active walredo stderr logger tasks that have started",
-        )
-        .unwrap();
-
-        let active_stderr_logger_tasks_finished = register_int_counter!(
-            "pageserver_walredo_stderr_logger_tasks_finished_total",
-            "Number of active walredo stderr logger tasks that have finished",
-        )
-        .unwrap();
-
        Self {
            started,
            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
@@ -1433,8 +1295,6 @@ impl Default for WalRedoProcessCounters {
                let cause_str: &'static str = cause.into();
                killed.with_label_values(&[cause_str])
            })),
-            active_stderr_logger_tasks_started,
-            active_stderr_logger_tasks_finished,
        }
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -53,14 +53,12 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
-use crate::tenant::mgr::ShardSelector;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

@@ -401,19 +399,16 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Note that since one connection may contain getpage requests that target different
-        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
-        // that we look up here may not be the one that serves all the actual requests: we will double
-        // check the mapping of key->shard later before calling into Timeline for getpage requests.
+        // TODO(sharding): enumerate local tenant shards for this tenant, and select the one
+        // that should serve this request.
+
+        // Make request tracer if needed
        let tenant = mgr::get_active_tenant_with_timeout(
            tenant_id,
-            ShardSelector::First,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
        .await?;
-
-        // Make request tracer if needed
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path =
@@ -571,7 +566,6 @@ impl PageServerHandler {
        info!("creating new timeline");
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
-            ShardSelector::Zero,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -634,7 +628,7 @@ impl PageServerHandler {
        debug_assert_current_span_has_tenant_and_timeline_id();

        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+            .get_active_tenant_timeline(tenant_id, timeline_id)
            .await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
@@ -813,49 +807,9 @@ impl PageServerHandler {
        }
        */

-        let key = rel_block_to_key(req.rel, req.blkno);
-        let page = if timeline.get_shard_identity().is_key_local(&key) {
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
-        } else {
-            // The Tenant shard we looked up at connection start does not hold this particular
-            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
-            // has multiple shards for the same tenant.
-            //
-            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
-            let timeline = match self
-                .get_active_tenant_timeline(
-                    timeline.tenant_shard_id.tenant_id,
-                    timeline.timeline_id,
-                    ShardSelector::Page(key),
-                )
-                .await
-            {
-                Ok(t) => t,
-                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
-                    // We already know this tenant exists in general, because we resolved it at
-                    // start of connection.  Getting a NotFound here indicates that the shard containing
-                    // the requested page is not present on this node.
-
-                    // TODO: this should be some kind of structured error that the client will understand,
-                    // so that it can block until its config is updated: this error is expected in the case
-                    // that the Tenant's shards' placements are being updated and the client hasn't been
-                    // informed yet.
-                    //
-                    // https://github.com/neondatabase/neon/issues/6038
-                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
-                }
-                Err(e) => return Err(e.into()),
-            };
-
-            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
-            // the GateGuard was already held over the whole connection.
-            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
-        };
+        let page = timeline
+            .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+            .await?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
@@ -884,7 +838,7 @@ impl PageServerHandler {

        // check that the timeline exists
        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+            .get_active_tenant_timeline(tenant_id, timeline_id)
            .await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
@@ -990,11 +944,9 @@ impl PageServerHandler {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        selector: ShardSelector,
    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
-            selector,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -1168,7 +1120,7 @@ where

            self.check_permission(Some(tenant_id))?;
            let timeline = self
-                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+                .get_active_tenant_timeline(tenant_id, timeline_id)
                .await?;

            let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1355,7 +1307,6 @@ where

            let tenant = get_active_tenant_with_timeout(
                tenant_id,
-                ShardSelector::Zero,
                ACTIVE_TENANT_TIMEOUT,
                &task_mgr::shutdown_token(),
            )
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,7 +13,6 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Buf, Bytes};
-use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -283,10 +282,6 @@ impl Timeline {
    }

    /// Get a list of all existing relations in given tablespace and database.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn list_rels(
        &self,
        spcnode: Oid,
@@ -635,10 +630,6 @@ impl Timeline {
    ///
    /// Only relation blocks are counted currently. That excludes metadata,
    /// SLRUs, twophase files etc.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
@@ -1323,7 +1314,7 @@ impl<'a> DatadirModification<'a> {
        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::new();
        for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(&key) || is_slru_block_key(key) {
+            if is_rel_block_key(key) || is_slru_block_key(key) {
                // This bails out on first error without modifying pending_updates.
                // That's Ok, cf this function's doc comment.
                writer.put(key, self.lsn, &value, ctx).await?;
@@ -1579,7 +1570,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    }
 }

-pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
@@ -1778,6 +1769,10 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    })
 }

+fn is_rel_block_key(key: Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
+
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -19,13 +19,11 @@ use futures::stream::FuturesUnordered;
 use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models::TimelineState;
-use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use std::fmt;
 use storage_broker::BrokerClientChannel;
-use tokio::io::BufReader;
 use tokio::runtime::Handle;
 use tokio::sync::watch;
 use tokio::task::JoinSet;
@@ -237,9 +235,6 @@ pub struct Tenant {

    tenant_shard_id: TenantShardId,

-    // The detailed sharding information, beyond the number/count in tenant_shard_id
-    shard_identity: ShardIdentity,
-
    /// The remote storage generation, used to protect S3 objects from split-brain.
    /// Does not change over the lifetime of the [`Tenant`] object.
    ///
@@ -316,9 +311,6 @@ impl WalRedoManager {
        }
    }

-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn request_redo(
        &self,
        key: crate::repository::Key,
@@ -476,6 +468,7 @@ impl Tenant {
        index_part: Option<IndexPart>,
        metadata: TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
+        init_order: Option<&InitializationOrder>,
        _ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let tenant_id = self.tenant_shard_id;
@@ -485,6 +478,7 @@ impl Tenant {
            &metadata,
            ancestor.clone(),
            resources,
+            init_order,
            CreateTimelineCause::Load,
        )?;
        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
@@ -572,7 +566,6 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        resources: TenantSharedResources,
        attached_conf: AttachedTenantConf,
-        shard_identity: ShardIdentity,
        init_order: Option<InitializationOrder>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        mode: SpawnMode,
@@ -594,7 +587,6 @@ impl Tenant {
            TenantState::Attaching,
            conf,
            attached_conf,
-            shard_identity,
            wal_redo_manager,
            tenant_shard_id,
            remote_storage.clone(),
@@ -687,6 +679,10 @@ impl Tenant {
                    // as we are no longer loading, signal completion by dropping
                    // the completion while we resume deletion
                    drop(_completion);
+                    // do not hold on to initial_logical_size_attempt, as that would prevent loading from proceeding without a timeout
+                    let _ = init_order
+                        .as_mut()
+                        .and_then(|x| x.initial_logical_size_attempt.take());
                    let background_jobs_can_start =
                        init_order.as_ref().map(|x| &x.background_jobs_can_start);
                    if let Some(background) = background_jobs_can_start {
@@ -700,6 +696,7 @@ impl Tenant {
                        &tenant_clone,
                        preload,
                        tenants,
+                        init_order,
                        &ctx,
                    )
                    .await
@@ -712,7 +709,7 @@ impl Tenant {
                    }
                }

-                match tenant_clone.attach(preload, &ctx).await {
+                match tenant_clone.attach(init_order, preload, &ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
                        tenant_clone.activate(broker_client, None, &ctx);
@@ -775,6 +772,7 @@ impl Tenant {
    ///
    async fn attach(
        self: &Arc<Tenant>,
+        init_order: Option<InitializationOrder>,
        preload: Option<TenantPreload>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -787,7 +785,7 @@ impl Tenant {
            None => {
                // Deprecated dev mode: load from local disk state instead of remote storage
                // https://github.com/neondatabase/neon/issues/5624
-                return self.load_local(ctx).await;
+                return self.load_local(init_order, ctx).await;
            }
        };

@@ -797,31 +795,20 @@ impl Tenant {
        let mut timeline_ancestors = HashMap::new();
        let mut existent_timelines = HashSet::new();
        for (timeline_id, preload) in preload.timelines {
+            // In this context a timeline "exists" if it has any content in remote storage: this will
+            // be our cue to not delete any corresponding local directory
+            existent_timelines.insert(timeline_id);
+
            let index_part = match preload.index_part {
                Ok(i) => {
                    debug!("remote index part exists for timeline {timeline_id}");
-                    // We found index_part on the remote, this is the standard case.
-                    existent_timelines.insert(timeline_id);
                    i
                }
-                Err(DownloadError::NotFound) => {
-                    // There is no index_part on the remote. We only get here
-                    // if there is some prefix for the timeline in the remote storage.
-                    // This can e.g. be the initdb.tar.zst archive, maybe a
-                    // remnant from a prior incomplete creation or deletion attempt.
-                    // Delete the local directory as the deciding criterion for a
-                    // timeline's existence is presence of index_part.
-                    info!(%timeline_id, "index_part not found on remote");
-                    continue;
-                }
                Err(e) => {
-                    // Some (possibly ephemeral) error happened during index_part download.
-                    // Pretend the timeline exists to not delete the timeline directory,
-                    // as it might be a temporary issue and we don't want to re-download
-                    // everything after it resolves.
+                    // Timeline creation is not atomic: we might upload a layer but no index_part.  We expect
+                    // that the creation will be retried by the control plane and eventually result in
+                    // a valid loadable state.
                    warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})");
-
-                    existent_timelines.insert(timeline_id);
                    continue;
                }
            };
@@ -882,6 +869,7 @@ impl Tenant {
                &index_part.metadata,
                Some(remote_timeline_client),
                self.deletion_queue_client.clone(),
+                None,
            )
            .await
            .context("resume_deletion")
@@ -1006,6 +994,10 @@ impl Tenant {
            None
        };

+        // we can load remote timelines during init, but they are assumed to be so rare that
+        // the initialization order is not passed in here.
+        let init_order = None;
+
        // timeline loading after attach expects to find metadata file for each metadata
        save_metadata(
            self.conf,
@@ -1023,6 +1015,7 @@ impl Tenant {
            Some(index_part),
            remote_metadata,
            ancestor,
+            init_order,
            ctx,
        )
        .await
@@ -1046,9 +1039,6 @@ impl Tenant {
            },
            conf,
            AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
-            // Shard identity isn't meaningful for a broken tenant: it's just a placeholder
-            // to occupy the slot for this TenantShardId.
-            ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
            wal_redo_manager,
            tenant_shard_id,
            None,
@@ -1267,7 +1257,11 @@ impl Tenant {
    /// files on disk. Used at pageserver startup.
    ///
    /// No background tasks are started as part of this routine.
-    async fn load_local(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
+    async fn load_local(
+        self: &Arc<Tenant>,
+        init_order: Option<InitializationOrder>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        debug!("loading tenant task");
@@ -1293,7 +1287,7 @@ impl Tenant {
        // Process loadable timelines first
        for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
            if let Err(e) = self
-                .load_local_timeline(timeline_id, local_metadata, ctx, false)
+                .load_local_timeline(timeline_id, local_metadata, init_order.as_ref(), ctx, false)
                .await
            {
                match e {
@@ -1327,7 +1321,13 @@ impl Tenant {
                }
                Some(local_metadata) => {
                    if let Err(e) = self
-                        .load_local_timeline(timeline_id, local_metadata, ctx, true)
+                        .load_local_timeline(
+                            timeline_id,
+                            local_metadata,
+                            init_order.as_ref(),
+                            ctx,
+                            true,
+                        )
                        .await
                    {
                        match e {
@@ -1355,11 +1355,12 @@ impl Tenant {
    /// Subroutine of `load_tenant`, to load an individual timeline
    ///
    /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, local_metadata, ctx))]
+    #[instrument(skip(self, local_metadata, init_order, ctx))]
    async fn load_local_timeline(
        self: &Arc<Self>,
        timeline_id: TimelineId,
        local_metadata: TimelineMetadata,
+        init_order: Option<&InitializationOrder>,
        ctx: &RequestContext,
        found_delete_mark: bool,
    ) -> Result<(), LoadLocalTimelineError> {
@@ -1376,6 +1377,7 @@ impl Tenant {
                &local_metadata,
                None,
                self.deletion_queue_client.clone(),
+                init_order,
            )
            .await
            .context("resume deletion")
@@ -1392,9 +1394,17 @@ impl Tenant {
            None
        };

-        self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx)
-            .await
-            .map_err(LoadLocalTimelineError::Load)
+        self.timeline_init_and_sync(
+            timeline_id,
+            resources,
+            None,
+            local_metadata,
+            ancestor,
+            init_order,
+            ctx,
+        )
+        .await
+        .map_err(LoadLocalTimelineError::Load)
    }

    pub(crate) fn tenant_id(&self) -> TenantId {
@@ -1548,14 +1558,12 @@ impl Tenant {
    ///
    /// If the caller specified the timeline ID to use (`new_timeline_id`), and timeline with
    /// the same timeline ID already exists, returns CreateTimelineError::AlreadyExists.
-    #[allow(clippy::too_many_arguments)]
    pub async fn create_timeline(
        &self,
        new_timeline_id: TimelineId,
        ancestor_timeline_id: Option<TimelineId>,
        mut ancestor_start_lsn: Option<Lsn>,
        pg_version: u32,
-        load_existing_initdb: Option<TimelineId>,
        broker_client: storage_broker::BrokerClientChannel,
        ctx: &RequestContext,
    ) -> Result<Arc<Timeline>, CreateTimelineError> {
@@ -1630,7 +1638,7 @@ impl Tenant {
                    .await?
            }
            None => {
-                self.bootstrap_timeline(new_timeline_id, pg_version, load_existing_initdb, ctx)
+                self.bootstrap_timeline(new_timeline_id, pg_version, ctx)
                    .await?
            }
        };
@@ -2289,6 +2297,7 @@ impl Tenant {
        new_metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        resources: TimelineResources,
+        init_order: Option<&InitializationOrder>,
        cause: CreateTimelineCause,
    ) -> anyhow::Result<Arc<Timeline>> {
        let state = match cause {
@@ -2303,6 +2312,9 @@ impl Tenant {
            CreateTimelineCause::Delete => TimelineState::Stopping,
        };

+        let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
+        let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);
+
        let pg_version = new_metadata.pg_version();

        let timeline = Timeline::new(
@@ -2313,10 +2325,11 @@ impl Tenant {
            new_timeline_id,
            self.tenant_shard_id,
            self.generation,
-            self.shard_identity,
            Arc::clone(&self.walredo_mgr),
            resources,
            pg_version,
+            initial_logical_size_can_start.cloned(),
+            initial_logical_size_attempt.cloned().flatten(),
            state,
            self.cancel.child_token(),
        );
@@ -2331,7 +2344,6 @@ impl Tenant {
        state: TenantState,
        conf: &'static PageServerConf,
        attached_conf: AttachedTenantConf,
-        shard_identity: ShardIdentity,
        walredo_mgr: Arc<WalRedoManager>,
        tenant_shard_id: TenantShardId,
        remote_storage: Option<GenericRemoteStorage>,
@@ -2393,7 +2405,6 @@ impl Tenant {

        Tenant {
            tenant_shard_id,
-            shard_identity,
            generation: attached_conf.location.generation,
            conf,
            // using now here is good enough approximation to catch tenants with really long
@@ -2515,7 +2526,7 @@ impl Tenant {
            }
        }

-        debug!("persisting tenantconf to {config_path}");
+        info!("persisting tenantconf to {config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
@@ -2550,7 +2561,7 @@ impl Tenant {
        target_config_path: &Utf8Path,
        tenant_conf: &TenantConfOpt,
    ) -> anyhow::Result<()> {
-        debug!("persisting tenantconf to {target_config_path}");
+        info!("persisting tenantconf to {target_config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
@@ -2940,7 +2951,6 @@ impl Tenant {
        &self,
        timeline_id: TimelineId,
        pg_version: u32,
-        load_existing_initdb: Option<TimelineId>,
        ctx: &RequestContext,
    ) -> anyhow::Result<Arc<Timeline>> {
        let timeline_uninit_mark = {
@@ -2963,6 +2973,8 @@ impl Tenant {
                format!("Failed to remove already existing initdb directory: {pgdata_path}")
            })?;
        }
+        // Initialize a temporary repo to get bootstrap data; this creates a directory at `pgdata_path`
+        run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
        // this new directory is very temporary, set to remove it immediately after bootstrap, we don't need it
        scopeguard::defer! {
            if let Err(e) = fs::remove_dir_all(&pgdata_path) {
@@ -2970,59 +2982,32 @@ impl Tenant {
                error!("Failed to remove temporary initdb directory '{pgdata_path}': {e}");
            }
        }
-        if let Some(existing_initdb_timeline_id) = load_existing_initdb {
-            let Some(storage) = &self.remote_storage else {
-                bail!("no storage configured but load_existing_initdb set to {existing_initdb_timeline_id}");
-            };
-            let (initdb_tar_zst_path, initdb_tar_zst) =
-                self::remote_timeline_client::download_initdb_tar_zst(
-                    self.conf,
-                    storage,
-                    &self.tenant_shard_id,
-                    &existing_initdb_timeline_id,
-                )
-                .await
-                .context("download initdb tar")?;
-            let buf_read = Box::pin(BufReader::new(initdb_tar_zst));
-            import_datadir::extract_tar_zst(&pgdata_path, buf_read)
-                .await
-                .context("extract initdb tar")?;
-
-            if initdb_tar_zst_path.exists() {
-                tokio::fs::remove_file(&initdb_tar_zst_path)
-                    .await
-                    .context("tempfile removal")?;
-            }
-        } else {
-            // Init temporarily repo to get bootstrap data, this creates a directory in the `initdb_path` path
-            run_initdb(self.conf, &pgdata_path, pg_version, &self.cancel).await?;
-
-            // Upload the created data dir to S3
-            if let Some(storage) = &self.remote_storage {
-                let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
-                let pgdata_zstd = Bytes::from(pgdata_zstd);
-                backoff::retry(
-                    || async {
-                        self::remote_timeline_client::upload_initdb_dir(
-                            storage,
-                            &self.tenant_shard_id.tenant_id,
-                            &timeline_id,
-                            pgdata_zstd.clone(),
-                        )
-                        .await
-                    },
-                    |_| false,
-                    3,
-                    u32::MAX,
-                    "persist_initdb_tar_zst",
-                    // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
-                    backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
-                )
-                .await?;
-            }
-        }
        let pgdata_lsn = import_datadir::get_lsn_from_controlfile(&pgdata_path)?.align();

+        // Upload the created data dir to S3
+        if let Some(storage) = &self.remote_storage {
+            let pgdata_zstd = import_datadir::create_tar_zst(&pgdata_path).await?;
+            let pgdata_zstd = Bytes::from(pgdata_zstd);
+            backoff::retry(
+                || async {
+                    self::remote_timeline_client::upload_initdb_dir(
+                        storage,
+                        &self.tenant_shard_id.tenant_id,
+                        &timeline_id,
+                        pgdata_zstd.clone(),
+                    )
+                    .await
+                },
+                |_| false,
+                3,
+                u32::MAX,
+                "persist_initdb_tar_zst",
+                // TODO: use a cancellation token (https://github.com/neondatabase/neon/issues/5066)
+                backoff::Cancel::new(CancellationToken::new(), || unreachable!()),
+            )
+            .await?;
+        }
+
        // Import the contents of the data directory at the initial checkpoint
        // LSN, and any WAL after that.
        // Initdb lsn will be equal to last_record_lsn which will be set after import.
@@ -3140,6 +3125,7 @@ impl Tenant {
                new_metadata,
                ancestor,
                resources,
+                None,
                CreateTimelineCause::Load,
            )
            .context("Failed to create timeline data structure")?;
@@ -3213,10 +3199,7 @@ impl Tenant {
        let uninit_mark_path = self
            .conf
            .timeline_uninit_mark_file_path(tenant_shard_id, timeline_id);
-        fs::OpenOptions::new()
-            .write(true)
-            .create_new(true)
-            .open(&uninit_mark_path)
+        fs::File::create(&uninit_mark_path)
            .context("Failed to create uninit mark file")
            .and_then(|_| {
                crashsafe::fsync_file_and_parent(&uninit_mark_path)
@@ -3805,8 +3788,6 @@ pub(crate) mod harness {
                    self.generation,
                ))
                .unwrap(),
-                // This is a legacy/test code path: sharding isn't supported here.
-                ShardIdentity::unsharded(),
                walredo_mgr,
                self.tenant_shard_id,
                Some(self.remote_storage.clone()),
@@ -3816,7 +3797,7 @@ pub(crate) mod harness {
            match mode {
                LoadMode::Local => {
                    tenant
-                        .load_local(ctx)
+                        .load_local(None, ctx)
                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                }
@@ -3826,7 +3807,7 @@ pub(crate) mod harness {
                        .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                    tenant
-                        .attach(Some(preload), ctx)
+                        .attach(None, Some(preload), ctx)
                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                }
@@ -3869,9 +3850,6 @@ pub(crate) mod harness {
    pub(crate) struct TestRedoManager;

    impl TestRedoManager {
-        /// # Cancel-Safety
-        ///
-        /// This method is cancellation-safe.
        pub async fn request_redo(
            &self,
            key: Key,
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -15,6 +15,7 @@ use crate::{
    context::RequestContext,
    task_mgr::{self, TaskKind},
    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
+    InitializationOrder,
 };

 use super::{
@@ -389,6 +390,7 @@ impl DeleteTenantFlow {
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
+        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -398,7 +400,10 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

-        tenant.attach(preload, ctx).await.context("attach")?;
+        tenant
+            .attach(init_order, preload, ctx)
+            .await
+            .context("attach")?;

        Self::background(
            guard,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,8 +2,7 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use pageserver_api::key::Key;
-use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
@@ -131,18 +130,6 @@ pub(crate) enum TenantsMapRemoveResult {
    InProgress(utils::completion::Barrier),
 }

-/// When resolving a TenantId to a shard, we may be looking for the 0th
-/// shard, or we might be looking for whichever shard holds a particular page.
-pub(crate) enum ShardSelector {
-    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
-    /// ignore it.
-    Zero,
-    /// Pick the first shard we find for the TenantId
-    First,
-    /// Pick the shard that holds this key
-    Page(Key),
-}
-
 impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -157,49 +144,6 @@ impl TenantsMap {
        }
    }

-    /// A page service client sends a TenantId, and to look up the correct Tenant we must
-    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_shard(
-        &self,
-        tenant_id: &TenantId,
-        selector: ShardSelector,
-    ) -> Option<TenantShardId> {
-        let mut want_shard = None;
-        match self {
-            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    match selector {
-                        ShardSelector::First => return Some(*slot.0),
-                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
-                            return Some(*slot.0)
-                        }
-                        ShardSelector::Page(key) => {
-                            if let Some(tenant) = slot.1.get_attached() {
-                                // First slot we see for this tenant, calculate the expected shard number
-                                // for the key: we will use this for checking if this and subsequent
-                                // slots contain the key, rather than recalculating the hash each time.
-                                if want_shard.is_none() {
-                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                                }
-
-                                if Some(tenant.shard_identity.number) == want_shard {
-                                    return Some(*slot.0);
-                                }
-                            } else {
-                                continue;
-                            }
-                        }
-                        _ => continue,
-                    }
-                }
-
-                // Fall through: we didn't find an acceptable shard
-                None
-            }
-        }
-    }
-
    /// Only for use from DeleteTenantFlow.  This method directly removes a TenantSlot from the map.
    ///
    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
@@ -270,6 +214,49 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

+/// Create a directory, including any missing parents.  This does no fsyncs and makes no
+/// guarantees about the persistence of the resulting metadata: only use it for dirs that
+/// serve as cache.  Returns an error if an existing non-directory is found in the path.
+async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
+    let mut dirs_to_create = Vec::new();
+    let mut path: &Utf8Path = path.as_ref();
+
+    // Walk up from `path`, collecting every component that does not exist yet (child first).
+    loop {
+        let meta = tokio::fs::metadata(path).await;
+        match meta {
+            Ok(metadata) if metadata.is_dir() => break, // existing directory: stop walking up
+            Ok(_) => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::AlreadyExists,
+                    format!("non-directory found in path: {path}"),
+                ));
+            }
+            Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {} // missing: create it
+            Err(e) => return Err(e),
+        }
+
+        dirs_to_create.push(path);
+
+        match path.parent() {
+            Some(parent) => path = parent,
+            None => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    format!("can't find parent of path '{path}'"),
+                ));
+            }
+        }
+    }
+
+    // Create the collected directories in reverse order, i.e. from parent to child.
+    for &path in dirs_to_create.iter().rev() {
+        tokio::fs::create_dir(path).await?;
+    }
+
+    Ok(())
+}
+
 /// The TenantManager is responsible for storing and mutating the collection of all tenants
 /// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
 /// lives inside the TenantManager.
@@ -528,14 +515,12 @@ pub async fn init_tenant_mgr(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

-        let shard_identity = location_conf.shard;
        match tenant_spawn(
            conf,
            tenant_shard_id,
            &tenant_dir_path,
            resources.clone(),
            AttachedTenantConf::try_from(location_conf)?,
-            shard_identity,
            Some(init_order.clone()),
            &TENANTS,
            SpawnMode::Normal,
@@ -576,7 +561,6 @@ pub(crate) fn tenant_spawn(
    tenant_path: &Utf8Path,
    resources: TenantSharedResources,
    location_conf: AttachedTenantConf,
-    shard_identity: ShardIdentity,
    init_order: Option<InitializationOrder>,
    tenants: &'static std::sync::RwLock<TenantsMap>,
    mode: SpawnMode,
@@ -603,19 +587,12 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    info!(
-        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
-        generation = ?location_conf.location.generation,
-        attach_mode = ?location_conf.location.attach_mode,
-        "Attaching tenant"
-    );
+    info!("Attaching tenant {tenant_shard_id}");
    let tenant = match Tenant::spawn(
        conf,
        tenant_shard_id,
        resources,
        location_conf,
-        shard_identity,
        init_order,
        tenants,
        mode,
@@ -785,14 +762,12 @@ pub(crate) async fn create_tenant(
        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
    let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;

-    let shard_identity = location_conf.shard;
    let created_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Create,
@@ -885,7 +860,6 @@ impl TenantManager {
        Ok(())
    }

-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
        tenant_shard_id: TenantShardId,
@@ -998,7 +972,7 @@ impl TenantManager {
            LocationMode::Secondary(_) => {
                // Directory doesn't need to be fsync'd because if we crash it can
                // safely be recreated next time this tenant location is configured.
-                tokio::fs::create_dir_all(&tenant_path)
+                unsafe_create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {tenant_path}"))?;

@@ -1014,7 +988,7 @@ impl TenantManager {
                // Directory doesn't need to be fsync'd because we do not depend on
                // it to exist after crashes: it may be recreated when tenant is
                // re-attached, see https://github.com/neondatabase/neon/issues/5550
-                tokio::fs::create_dir_all(&tenant_path)
+                unsafe_create_dir_all(&timelines_path)
                    .await
                    .with_context(|| format!("Creating {timelines_path}"))?;

@@ -1022,14 +996,12 @@ impl TenantManager {
                    .await
                    .map_err(SetNewTenantConfigError::Persist)?;

-                let shard_identity = new_location_config.shard;
                let tenant = tenant_spawn(
                    self.conf,
                    tenant_shard_id,
                    &tenant_path,
                    self.resources.clone(),
                    AttachedTenantConf::try_from(new_location_config)?,
-                    shard_identity,
                    None,
                    self.tenants,
                    SpawnMode::Normal,
@@ -1044,81 +1016,6 @@ impl TenantManager {

        Ok(())
    }
-
-    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
-    /// LocationConf that was last used to attach it.  Optionally, the local file cache may be
-    /// dropped before re-attaching.
-    ///
-    /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
-    /// where an issue is identified that would go away with a restart of the tenant.
-    ///
-    /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
-    /// to respect the cancellation tokens used in normal shutdown().
-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
-    pub(crate) async fn reset_tenant(
-        &self,
-        tenant_shard_id: TenantShardId,
-        drop_cache: bool,
-        ctx: RequestContext,
-    ) -> anyhow::Result<()> {
-        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        let Some(old_slot) = slot_guard.get_old_value() else {
-            anyhow::bail!("Tenant not found when trying to reset");
-        };
-
-        let Some(tenant) = old_slot.get_attached() else {
-            slot_guard.revert();
-            anyhow::bail!("Tenant is not in attached state");
-        };
-
-        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, false).await {
-            Ok(()) => {
-                slot_guard.drop_old_value()?;
-            }
-            Err(_barrier) => {
-                slot_guard.revert();
-                anyhow::bail!("Cannot reset Tenant, already shutting down");
-            }
-        }
-
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-
-        if drop_cache {
-            tracing::info!("Dropping local file cache");
-
-            match tokio::fs::read_dir(&timelines_path).await {
-                Err(e) => {
-                    tracing::warn!("Failed to list timelines while dropping cache: {}", e);
-                }
-                Ok(mut entries) => {
-                    while let Some(entry) = entries.next_entry().await? {
-                        tokio::fs::remove_dir_all(entry.path()).await?;
-                    }
-                }
-            }
-        }
-
-        let shard_identity = config.shard;
-        let tenant = tenant_spawn(
-            self.conf,
-            tenant_shard_id,
-            &tenant_path,
-            self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
-            shard_identity,
-            None,
-            self.tenants,
-            SpawnMode::Normal,
-            &ctx,
-        )?;
-
-        slot_guard.upsert(TenantSlot::Attached(tenant))?;
-
-        Ok(())
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1203,7 +1100,6 @@ pub(crate) enum GetActiveTenantError {
 /// then wait for up to `timeout` (minus however long we waited for the slot).
 pub(crate) async fn get_active_tenant_with_timeout(
    tenant_id: TenantId,
-    shard_selector: ShardSelector,
    timeout: Duration,
    cancel: &CancellationToken,
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
@@ -1212,17 +1108,15 @@ pub(crate) async fn get_active_tenant_with_timeout(
        Tenant(Arc<Tenant>),
    }

+    // TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
+    // to decide which shard services the request)
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let wait_start = Instant::now();
    let deadline = wait_start + timeout;

-    let (wait_for, tenant_shard_id) = {
+    let wait_for = {
        let locked = TENANTS.read().unwrap();
-
-        // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
-            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
-        )?;
-
        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
        match peek_slot {
@@ -1232,7 +1126,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                        // Fast path: we don't need to do any async waiting.
                        return Ok(tenant.clone());
                    }
-                    _ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
+                    _ => WaitFor::Tenant(tenant.clone()),
                }
            }
            Some(TenantSlot::Secondary) => {
@@ -1240,9 +1134,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    tenant_id,
                )))
            }
-            Some(TenantSlot::InProgress(barrier)) => {
-                (WaitFor::Barrier(barrier.clone()), tenant_shard_id)
-            }
+            Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
            None => {
                return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
                    tenant_id,
@@ -1327,7 +1219,8 @@ pub(crate) async fn delete_tenant(
    // See https://github.com/neondatabase/neon/issues/5080

    // TODO(sharding): make delete API sharding-aware
-    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+    let mut slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

    // unwrap is safe because we used MustExist mode when acquiring
    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1484,14 +1377,12 @@ pub(crate) async fn load_tenant(

    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

-    let shard_identity = location_conf.shard;
    let new_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1581,14 +1472,12 @@ pub(crate) async fn attach_tenant(
    // TODO: tenant directory remains on disk if we bail out from here on.
    //       See https://github.com/neondatabase/neon/issues/4233

-    let shard_identity = location_conf.shard;
    let attached_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_dir,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1654,10 +1543,9 @@ pub enum TenantSlotUpsertError {
    MapState(#[from] TenantMapError),
 }

-#[derive(Debug, thiserror::Error)]
+#[derive(Debug)]
 enum TenantSlotDropError {
    /// It is only legal to drop a TenantSlot if its contents are fully shut down
-    #[error("Tenant was not shut down")]
    NotShutdown,
 }

@@ -1717,9 +1605,9 @@ impl SlotGuard {
        }
    }

-    /// Get any value that was present in the slot before we acquired ownership
+    /// Borrow any value that was present in the slot before we acquired ownership
    /// of it: in state transitions, this will be the old state.
-    fn get_old_value(&self) -> &Option<TenantSlot> {
+    fn get_old_value(&mut self) -> &Option<TenantSlot> {
        &self.old_value
    }

--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -188,7 +188,6 @@ use anyhow::Context;
 use camino::Utf8Path;
 use chrono::{NaiveDateTime, Utc};

-pub(crate) use download::download_initdb_tar_zst;
 use pageserver_api::shard::{ShardIndex, TenantShardId};
 use scopeguard::ScopeGuard;
 use tokio_util::sync::CancellationToken;
@@ -1078,17 +1077,7 @@ impl RemoteTimelineClient {

        let remaining_layers: Vec<RemotePath> = remaining
            .into_iter()
-            .filter(|p| {
-                if p == &latest_index {
-                    return false;
-                }
-                if let Some(name) = p.object_name() {
-                    if name == INITDB_PATH {
-                        return false;
-                    }
-                }
-                true
-            })
+            .filter(|p| p!= &latest_index)
            .inspect(|path| {
                if let Some(name) = path.object_name() {
                    info!(%name, "deleting a file not referenced from index_part.json");
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -8,12 +8,11 @@ use std::future::Future;
 use std::time::Duration;

 use anyhow::{anyhow, Context};
-use camino::{Utf8Path, Utf8PathBuf};
+use camino::Utf8Path;
 use pageserver_api::shard::TenantShardId;
-use tokio::fs::{self, File, OpenOptions};
-use tokio::io::{AsyncSeekExt, AsyncWriteExt};
+use tokio::fs;
+use tokio::io::AsyncWriteExt;
 use tokio_util::sync::CancellationToken;
-use tracing::warn;
 use utils::{backoff, crashsafe};

 use crate::config::PageServerConf;
@@ -21,15 +20,14 @@ use crate::tenant::remote_timeline_client::{remote_layer_path, remote_timelines_
 use crate::tenant::storage_layer::LayerFileName;
 use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::Generation;
-use crate::TEMP_FILE_SUFFIX;
 use remote_storage::{DownloadError, GenericRemoteStorage, ListingMode};
 use utils::crashsafe::path_with_suffix_extension;
 use utils::id::TimelineId;

 use super::index::{IndexPart, LayerFileMetadata};
 use super::{
-    parse_remote_index_path, remote_index_path, remote_initdb_archive_path,
-    FAILED_DOWNLOAD_WARN_THRESHOLD, FAILED_REMOTE_OP_RETRIES, INITDB_PATH,
+    parse_remote_index_path, remote_index_path, FAILED_DOWNLOAD_WARN_THRESHOLD,
+    FAILED_REMOTE_OP_RETRIES,
 };

 static MAX_DOWNLOAD_DURATION: Duration = Duration::from_secs(120);
@@ -363,7 +361,7 @@ pub(super) async fn download_index_part(
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
-            tracing::debug!("No index_part.json* found");
+            tracing::info!("No index_part.json* found");
            do_download_index_part(
                storage,
                tenant_shard_id,
@@ -376,69 +374,6 @@ pub(super) async fn download_index_part(
    }
 }

-pub(crate) async fn download_initdb_tar_zst(
-    conf: &'static PageServerConf,
-    storage: &GenericRemoteStorage,
-    tenant_shard_id: &TenantShardId,
-    timeline_id: &TimelineId,
-) -> Result<(Utf8PathBuf, File), DownloadError> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
-
-    let remote_path = remote_initdb_archive_path(&tenant_shard_id.tenant_id, timeline_id);
-
-    let timeline_path = conf.timelines_path(tenant_shard_id);
-
-    if !timeline_path.exists() {
-        tokio::fs::create_dir_all(&timeline_path)
-            .await
-            .with_context(|| format!("timeline dir creation {timeline_path}"))
-            .map_err(DownloadError::Other)?;
-    }
-    let temp_path = timeline_path.join(format!("{INITDB_PATH}-{timeline_id}.{TEMP_FILE_SUFFIX}"));
-
-    let file = download_retry(
-        || async {
-            let mut file = OpenOptions::new()
-                .create(true)
-                .truncate(true)
-                .read(true)
-                .write(true)
-                .open(&temp_path)
-                .await
-                .with_context(|| format!("tempfile creation {temp_path}"))
-                .map_err(DownloadError::Other)?;
-
-            let mut download = storage.download(&remote_path).await?;
-
-            tokio::io::copy(&mut download.download_stream, &mut file)
-                .await
-                .with_context(|| format!("download initdb.tar.zst at {remote_path:?}"))
-                .map_err(DownloadError::Other)?;
-
-            file.seek(std::io::SeekFrom::Start(0))
-                .await
-                .with_context(|| format!("rewinding initdb.tar.zst at: {remote_path:?}"))
-                .map_err(DownloadError::Other)?;
-
-            Ok(file)
-        },
-        &format!("download {remote_path}"),
-    )
-    .await
-    .map_err(|e| {
-        if temp_path.exists() {
-            // Do a best-effort attempt at deleting the temporary file upon encountering an error.
-            // We don't have async here nor do we want to pile on any extra errors.
-            if let Err(e) = std::fs::remove_file(&temp_path) {
-                warn!("error deleting temporary file {temp_path}: {e}");
-            }
-        }
-        e
-    })?;
-
-    Ok((temp_path, file))
-}
-
 /// Helper function to handle retries for a download operation.
 ///
 /// Remote operations can fail due to rate limits (IAM, S3), spurious network
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -230,10 +230,6 @@ impl Layer {
    ///
    /// It is up to the caller to collect more data from the previous layer and
    /// perform WAL redo, if necessary.
-    ///
-    /// # Cancellation-Safety
-    ///
-    /// This method is cancellation-safe.
    pub(crate) async fn get_value_reconstruct_data(
        &self,
        key: Key,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -44,7 +44,6 @@ pub(crate) enum BackgroundLoopKind {
    Eviction,
    ConsumptionMetricsCollectMetrics,
    ConsumptionMetricsSyntheticSizeWorker,
-    InitialLogicalSizeCalculation,
 }

 impl BackgroundLoopKind {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -2,7 +2,7 @@ pub mod delete;
 mod eviction_task;
 mod init;
 pub mod layer_manager;
-pub(crate) mod logical_size;
+mod logical_size;
 pub mod span;
 pub mod uninit;
 mod walreceiver;
@@ -18,29 +18,25 @@ use pageserver_api::{
        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
        TimelineState,
    },
-    shard::{ShardIdentity, TenantShardId},
+    shard::TenantShardId,
 };
-use rand::Rng;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
 use tokio::{
    runtime::Handle,
-    sync::{oneshot, watch},
+    sync::{oneshot, watch, TryAcquireError},
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{id::TenantTimelineId, sync::gate::Gate};

+use std::cmp::{max, min, Ordering};
 use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
-use std::{
-    cmp::{max, min, Ordering},
-    ops::ControlFlow,
-};

 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
@@ -167,10 +163,6 @@ pub struct Timeline {
    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
    pub(crate) generation: Generation,

-    /// The detailed sharding information from our parent Tenant.  This enables us to map keys
-    /// to shards, and is constant through the lifetime of this Timeline.
-    shard_identity: ShardIdentity,
-
    pub pg_version: u32,

    /// The tuple has two elements.
@@ -306,6 +298,13 @@ pub struct Timeline {

    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,

+    /// Barrier to wait before doing initial logical size calculation. Used only during startup.
+    initial_logical_size_can_start: Option<completion::Barrier>,
+
+    /// Completion shared between all timelines loaded during startup; used to delay heavier
+    /// background tasks until some logical sizes have been calculated.
+    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
+
    /// Load or creation time information about the disk_consistent_lsn and when the loading
    /// happened. Used for consumption metrics.
    pub(crate) loaded_at: (Lsn, SystemTime),
@@ -454,11 +453,6 @@ pub enum LogicalSizeCalculationCause {
    TenantSizeHandler,
 }

-pub enum GetLogicalSizePriority {
-    User,
-    Background,
-}
-
 #[derive(enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
    ForceRepartition,
@@ -495,9 +489,6 @@ impl Timeline {
    /// an ancestor branch, for example, or waste a lot of cycles chasing the
    /// non-existing key.
    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn get(
        &self,
        key: Key,
@@ -810,12 +801,7 @@ impl Timeline {
                    .access_stats_behavior(AccessStatsBehavior::Skip)
                    .build();

-                // 2. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(target_file_size, ctx).await?;
-                timer.stop_and_record();
-
-                // 3. Create new image layers for partitions that have been modified
+                // 2. Create new image layers for partitions that have been modified
                // "enough".
                let layers = self
                    .create_image_layers(&partitioning, lsn, false, &image_ctx)
@@ -827,6 +813,11 @@ impl Timeline {
                    }
                }

+                // 3. Compact
+                let timer = self.metrics.compact_time_histo.start_timer();
+                self.compact_level0(target_file_size, ctx).await?;
+                timer.stop_and_record();
+
                if let Some(remote_client) = &self.remote_client {
                    // should any new image layer been created, not uploading index_part will
                    // result in a mismatch between remote_physical_size and layermap calculated
@@ -858,6 +849,31 @@ impl Timeline {
        }
    }

+    /// Retrieve current logical size of the timeline.
+    ///
+    /// The size could be lagging behind the actual number, in case
+    /// the initial size calculation has not been run (gets triggered on the first size access).
+    ///
+    /// Returns the size and a boolean flag that shows whether the size is exact.
+    pub fn get_current_logical_size(
+        self: &Arc<Self>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<(u64, bool)> {
+        let current_size = self.current_logical_size.current_size()?;
+        debug!("Current size: {current_size:?}");
+
+        let mut is_exact = true;
+        let size = current_size.size();
+        if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
+            (current_size, self.current_logical_size.initial_part_end)
+        {
+            is_exact = false;
+            self.try_spawn_size_init_task(initial_part_end, ctx);
+        }
+
+        Ok((size, is_exact))
+    }
+
    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
    /// the in-memory layer, and initiate flushing it if so.
    ///
@@ -907,7 +923,6 @@ impl Timeline {
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
    ) {
-        self.spawn_initial_logical_size_computation_task(ctx);
        self.launch_wal_receiver(ctx, broker_client);
        self.set_state(TimelineState::Active);
        self.launch_eviction_task(background_jobs_can_start);
@@ -1021,6 +1036,17 @@ impl Timeline {
                error!("Not activating a Stopping timeline");
            }
            (_, new_state) => {
+                if matches!(
+                    new_state,
+                    TimelineState::Stopping | TimelineState::Broken { .. }
+                ) {
+                    // drop the completion guard, if any; it might be holding off the completion
+                    // forever needlessly
+                    self.initial_logical_size_attempt
+                        .lock()
+                        .unwrap_or_else(|e| e.into_inner())
+                        .take();
+                }
                self.state.send_replace(new_state);
            }
        }
@@ -1339,10 +1365,11 @@ impl Timeline {
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        generation: Generation,
-        shard_identity: ShardIdentity,
        walredo_mgr: Arc<super::WalRedoManager>,
        resources: TimelineResources,
        pg_version: u32,
+        initial_logical_size_can_start: Option<completion::Barrier>,
+        initial_logical_size_attempt: Option<completion::Completion>,
        state: TimelineState,
        cancel: CancellationToken,
    ) -> Arc<Self> {
@@ -1369,7 +1396,6 @@ impl Timeline {
                timeline_id,
                tenant_shard_id,
                generation,
-                shard_identity,
                pg_version,
                layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
                wanted_image_layers: Mutex::new(None),
@@ -1443,6 +1469,8 @@ impl Timeline {
                ),
                delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),

+                initial_logical_size_can_start,
+                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
                cancel,
                gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),

@@ -1754,91 +1782,38 @@ impl Timeline {
        Ok(())
    }

-    /// Retrieve current logical size of the timeline.
-    ///
-    /// The size could be lagging behind the actual number, in case
-    /// the initial size calculation has not been run (gets triggered on the first size access).
-    ///
-    /// return size and boolean flag that shows if the size is exact
-    pub(crate) fn get_current_logical_size(
-        self: &Arc<Self>,
-        priority: GetLogicalSizePriority,
-        ctx: &RequestContext,
-    ) -> logical_size::CurrentLogicalSize {
-        let current_size = self.current_logical_size.current_size();
-        debug!("Current size: {current_size:?}");
-
-        match (current_size.accuracy(), priority) {
-            (logical_size::Accuracy::Exact, _) => (), // nothing to do
-            (logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => {
-                // background task will eventually deliver an exact value, we're in no rush
-            }
-            (logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => {
-                // background task is not ready, but user is asking for it now;
-                // => make the background task skip the line
-                // (The alternative would be to calculate the size here, but,
-                //  it can actually take a long time if the user has a lot of rels.
-                //  And we'll inevitable need it again; So, let the background task do the work.)
-                match self
-                    .current_logical_size
-                    .cancel_wait_for_background_loop_concurrency_limit_semaphore
-                    .get()
-                {
-                    Some(cancel) => cancel.cancel(),
-                    None => {
-                        let state = self.current_state();
-                        if matches!(
-                            state,
-                            TimelineState::Broken { .. } | TimelineState::Stopping
-                        ) {
-
-                            // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
-                            // Don't make noise.
-                        } else {
-                            warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
-                        }
-                    }
-                };
-            }
-        }
-
-        if let CurrentLogicalSize::Approximate(_) = &current_size {
-            if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
-                let first = self
-                    .current_logical_size
-                    .did_return_approximate_to_walreceiver
-                    .compare_exchange(
-                        false,
-                        true,
-                        AtomicOrdering::Relaxed,
-                        AtomicOrdering::Relaxed,
-                    )
-                    .is_ok();
-                if first {
-                    crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
-                }
-            }
-        }
-
-        current_size
-    }
-
-    fn spawn_initial_logical_size_computation_task(self: &Arc<Self>, ctx: &RequestContext) {
-        let Some(initial_part_end) = self.current_logical_size.initial_part_end else {
-            // nothing to do for freshly created timelines;
-            assert_eq!(
-                self.current_logical_size.current_size().accuracy(),
-                logical_size::Accuracy::Exact,
-            );
+    fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
+        let state = self.current_state();
+        if matches!(
+            state,
+            TimelineState::Broken { .. } | TimelineState::Stopping
+        ) {
+            // Can happen when the timeline detail endpoint is used while deletion is ongoing (or the timeline is broken).
            return;
+        }
+
+        let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
+            .try_acquire_owned()
+        {
+            Ok(permit) => permit,
+            Err(TryAcquireError::NoPermits) => {
+                // computation already ongoing or finished with success
+                return;
+            }
+            Err(TryAcquireError::Closed) => unreachable!("we never call close"),
        };
+        debug_assert!(self
+            .current_logical_size
+            .initial_logical_size
+            .get()
+            .is_none());

-        let cancel_wait_for_background_loop_concurrency_limit_semaphore = CancellationToken::new();
-        let token = cancel_wait_for_background_loop_concurrency_limit_semaphore.clone();
-        self.current_logical_size
-            .cancel_wait_for_background_loop_concurrency_limit_semaphore.set(token)
-            .expect("initial logical size calculation task must be spawned exactly once per Timeline object");
-
+        info!(
+            "spawning logical size computation from context of task kind {:?}",
+            ctx.task_kind()
+        );
+        // We need to start the computation task.
+        // It gets a separate context since it will outlive the request that called this function.
        let self_clone = Arc::clone(self);
        let background_ctx = ctx.detached_child(
            TaskKind::InitialLogicalSizeCalculation,
@@ -1853,152 +1828,89 @@ impl Timeline {
            false,
            // NB: don't log errors here, task_mgr will do that.
            async move {
+
                let cancel = task_mgr::shutdown_token();
-                self_clone
-                    .initial_logical_size_calculation_task(
-                        initial_part_end,
-                        cancel_wait_for_background_loop_concurrency_limit_semaphore,
-                        cancel,
-                        background_ctx,
-                    )
-                    .await;
-                Ok(())
-            }
-            .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)),
-        );
-    }

-    async fn initial_logical_size_calculation_task(
-        self: Arc<Self>,
-        initial_part_end: Lsn,
-        skip_concurrency_limiter: CancellationToken,
-        cancel: CancellationToken,
-        background_ctx: RequestContext,
-    ) {
-        enum BackgroundCalculationError {
-            Cancelled,
-            Other(anyhow::Error),
-        }
-
-        let try_once = |attempt: usize| {
-            let background_ctx = &background_ctx;
-            let self_ref = &self;
-            let skip_concurrency_limiter = &skip_concurrency_limiter;
-            async move {
-                let cancel = task_mgr::shutdown_token();
-                let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit(
-                    BackgroundLoopKind::InitialLogicalSizeCalculation,
-                    background_ctx,
-                    &cancel,
-                );
-
-                use crate::metrics::initial_logical_size::StartCircumstances;
-                let (_maybe_permit, circumstances) = tokio::select! {
-                    res = wait_for_permit => {
-                        match res {
-                            Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
-                            Err(RateLimitError::Cancelled) => {
-                                return Err(BackgroundCalculationError::Cancelled);
-                            }
-                        }
-                    }
-                    () = skip_concurrency_limiter.cancelled() => {
-                        // Some action that is part of a end user interaction requested logical size
-                        // => break out of the rate limit
-                        // TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime;
-                        // but then again what happens if they cancel; also, we should just be using
-                        // one runtime across the entire process, so, let's leave this for now.
-                        (None, StartCircumstances::SkippedConcurrencyLimiter)
-                    }
+                // in case we were created during pageserver initialization, wait for
+                // initialization to complete before proceeding. startup time init runs on the same
+                // runtime.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); },
+                    _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
                };

-                let metrics_guard = if attempt == 1 {
-                    crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
-                } else {
-                    crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
-                };
+                // hold off background tasks from starting until every timeline has attempted the
+                // initial logical size calculation at least once; a retry will rarely be useful.
+                // holding off is done because heavier tasks execute blockingly on the same
+                // runtime.
+                //
+                // dropping this at every outcome is probably better than trying to cling on to it,
+                // delay will be terminated by a timeout regardless.
+                let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };

-                match self_ref
-                    .logical_size_calculation_task(
-                        initial_part_end,
-                        LogicalSizeCalculationCause::Initial,
-                        background_ctx,
-                    )
+                let calculated_size = match self_clone
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
                    .await
                {
-                    Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
+                    Ok(s) => s,
                    Err(CalculateLogicalSizeError::Cancelled) => {
-                        Err(BackgroundCalculationError::Cancelled)
+                        // Don't make noise, this is a common task.
+                        // In the unlikely case that there is another call to this function, we'll retry
+                        // because initial_logical_size is still None.
+                        info!("initial size calculation cancelled, likely timeline delete / tenant detach");
+                        return Ok(());
                    }
                    Err(CalculateLogicalSizeError::Other(err)) => {
-                        if let Some(PageReconstructError::AncestorStopping(_)) =
+                        if let Some(e @ PageReconstructError::AncestorStopping(_)) =
                            err.root_cause().downcast_ref()
                        {
-                            Err(BackgroundCalculationError::Cancelled)
-                        } else {
-                            Err(BackgroundCalculationError::Other(err))
+                            // This can happen if the timeline's parent timeline switches to
+                            // Stopping state while we're still calculating the initial
+                            // timeline size for the child, for example if the tenant is
+                            // being detached or the pageserver is shut down. Like with
+                            // CalculateLogicalSizeError::Cancelled, don't make noise.
+                            info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
+                            return Ok(());
                        }
+                        return Err(err.context("Failed to calculate logical size"));
+                    }
+                };
+
+                // we cannot query current_logical_size.current_size() to know the current
+                // *negative* value, only truncated to u64.
+                let added = self_clone
+                    .current_logical_size
+                    .size_added_after_initial
+                    .load(AtomicOrdering::Relaxed);
+
+                let sum = calculated_size.saturating_add_signed(added);
+
+                // set the gauge value before it can be set in `update_current_logical_size`.
+                self_clone.metrics.current_logical_size_gauge.set(sum);
+
+                match self_clone
+                    .current_logical_size
+                    .initial_logical_size
+                    .set(calculated_size)
+                {
+                    Ok(()) => (),
+                    Err(_what_we_just_attempted_to_set) => {
+                        let existing_size = self_clone
+                            .current_logical_size
+                            .initial_logical_size
+                            .get()
+                            .expect("once_cell set was lost, then get failed, impossible.");
+                        // This shouldn't happen because the semaphore is initialized with 1.
+                        // But if it happens, just complain & report success so there are no further retries.
+                        error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
                    }
                }
-            }
-        };
-
-        let retrying = async {
-            let mut attempt = 0;
-            loop {
-                attempt += 1;
-
-                match try_once(attempt).await {
-                    Ok(res) => return ControlFlow::Continue(res),
-                    Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
-                    Err(BackgroundCalculationError::Other(e)) => {
-                        warn!(attempt, "initial size calculation failed: {e:?}");
-                        // exponential back-off doesn't make sense at these long intervals;
-                        // use fixed retry interval with generous jitter instead
-                        let sleep_duration = Duration::from_secs(
-                            u64::try_from(
-                                // 1hour base
-                                (60_i64 * 60_i64)
-                                    // 10min jitter
-                                    + rand::thread_rng().gen_range(-10 * 60..10 * 60),
-                            )
-                            .expect("10min < 1hour"),
-                        );
-                        tokio::time::sleep(sleep_duration).await;
-                    }
-                }
-            }
-        };
-
-        let (calculated_size, metrics_guard) = tokio::select! {
-            res = retrying  => {
-                match res {
-                    ControlFlow::Continue(calculated_size) => calculated_size,
-                    ControlFlow::Break(()) => return,
-                }
-            }
-            _ = cancel.cancelled() => {
-                return;
-            }
-        };
-
-        // we cannot query current_logical_size.current_size() to know the current
-        // *negative* value, only truncated to u64.
-        let added = self
-            .current_logical_size
-            .size_added_after_initial
-            .load(AtomicOrdering::Relaxed);
-
-        let sum = calculated_size.saturating_add_signed(added);
-
-        // set the gauge value before it can be set in `update_current_logical_size`.
-        self.metrics.current_logical_size_gauge.set(sum);
-
-        self.current_logical_size
-            .initial_logical_size
-            .set((calculated_size, metrics_guard.calculation_result_saved()))
-            .ok()
-            .expect("only this task sets it");
+                // now that `initial_logical_size.is_some()`, reduce permit count to 0
+                // so that we prevent future callers from spawning this task
+                permit.forget();
+                Ok(())
+            }.in_current_span(),
+        );
    }

    pub fn spawn_ondemand_logical_size_calculation(
@@ -2036,9 +1948,6 @@ impl Timeline {
        receiver
    }

-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    #[instrument(skip_all)]
    async fn logical_size_calculation_task(
        self: &Arc<Self>,
@@ -2076,10 +1985,6 @@ impl Timeline {
    ///
    /// NOTE: counted incrementally, includes ancestors. This can be a slow operation,
    /// especially if we need to download remote layers.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn calculate_logical_size(
        &self,
        up_to_lsn: Lsn,
@@ -2144,14 +2049,16 @@ impl Timeline {
        // one value while current_logical_size is set to the
        // other.
        match logical_size.current_size() {
-            CurrentLogicalSize::Exact(ref new_current_size) => self
+            Ok(CurrentLogicalSize::Exact(new_current_size)) => self
                .metrics
                .current_logical_size_gauge
-                .set(new_current_size.into()),
-            CurrentLogicalSize::Approximate(_) => {
+                .set(new_current_size),
+            Ok(CurrentLogicalSize::Approximate(_)) => {
                // don't update the gauge yet, this allows us not to update the gauge back and
                // forth between the initial size calculation task.
            }
+            // this is overflow
+            Err(e) => error!("Failed to compute current logical size for metrics update: {e:?}"),
        }
    }

@@ -2195,10 +2102,6 @@ impl Timeline {
    ///
    /// This function takes the current timeline's locked LayerMap as an argument,
    /// so callers can avoid potential race conditions.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    async fn get_reconstruct_data(
        &self,
        key: Key,
@@ -2447,9 +2350,6 @@ impl Timeline {
        }
    }

-    /// # Cancel-safety
-    ///
-    /// This method is cancellation-safe.
    async fn lookup_cached_page(
        &self,
        key: &Key,
@@ -2484,10 +2384,6 @@ impl Timeline {
        Ok(Arc::clone(ancestor))
    }

-    pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
-        &self.shard_identity
-    }
-
    ///
    /// Get a handle to the latest layer for appending.
    ///
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -21,6 +21,7 @@ use crate::{
        },
        CreateTimelineCause, DeleteTimelineError, Tenant,
    },
+    InitializationOrder,
 };

 use super::{Timeline, TimelineResources};
@@ -406,6 +407,7 @@ impl DeleteTimelineFlow {
        local_metadata: &TimelineMetadata,
        remote_client: Option<RemoteTimelineClient>,
        deletion_queue_client: DeletionQueueClient,
+        init_order: Option<&InitializationOrder>,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
@@ -418,6 +420,7 @@ impl DeleteTimelineFlow {
                    remote_client,
                    deletion_queue_client,
                },
+                init_order,
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -1,10 +1,11 @@
 use anyhow::Context;
-
 use once_cell::sync::OnceCell;
-use tokio_util::sync::CancellationToken;
+
+use tokio::sync::Semaphore;
 use utils::lsn::Lsn;

-use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
+use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
+use std::sync::Arc;

 /// Internal structure to hold all data needed for logical size calculation.
 ///
@@ -22,17 +23,10 @@ pub(super) struct LogicalSize {
    ///
    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
    /// the initial size at a different LSN.
-    pub initial_logical_size: OnceCell<(
-        u64,
-        crate::metrics::initial_logical_size::FinishedCalculationGuard,
-    )>,
+    pub initial_logical_size: OnceCell<u64>,

-    /// Cancellation for the best-effort logical size calculation.
-    ///
-    /// The token is kept in a once-cell so that we can error out if a higher priority
-    /// request comes in *before* we have started the normal logical size calculation.
-    pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
-        OnceCell<CancellationToken>,
+    /// Semaphore to track ongoing calculation of `initial_logical_size`.
+    pub initial_size_computation: Arc<tokio::sync::Semaphore>,

    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
    pub initial_part_end: Option<Lsn>,
@@ -58,57 +52,25 @@ pub(super) struct LogicalSize {
    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
    /// to modify this, it will also keep the prometheus metric in sync.
    pub size_added_after_initial: AtomicI64,
-
-    /// For [`crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE`].
-    pub(super) did_return_approximate_to_walreceiver: AtomicBool,
 }

 /// Normalized current size, that the data in pageserver occupies.
 #[derive(Debug, Clone, Copy)]
-pub(crate) enum CurrentLogicalSize {
+pub(super) enum CurrentLogicalSize {
    /// The size is not yet calculated to the end, this is an intermediate result,
    /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
    /// yet total logical size cannot be below 0.
-    Approximate(Approximate),
+    Approximate(u64),
    // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
    // available for observation without any calculations.
-    Exact(Exact),
-}
-
-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
-pub(crate) enum Accuracy {
-    Approximate,
-    Exact,
-}
-
-#[derive(Debug, Clone, Copy)]
-pub(crate) struct Approximate(u64);
-#[derive(Debug, Clone, Copy)]
-pub(crate) struct Exact(u64);
-
-impl From<&Approximate> for u64 {
-    fn from(value: &Approximate) -> Self {
-        value.0
-    }
-}
-
-impl From<&Exact> for u64 {
-    fn from(val: &Exact) -> Self {
-        val.0
-    }
+    Exact(u64),
 }

 impl CurrentLogicalSize {
-    pub(crate) fn size_dont_care_about_accuracy(&self) -> u64 {
-        match self {
-            Self::Approximate(size) => size.into(),
-            Self::Exact(size) => size.into(),
-        }
-    }
-    pub(crate) fn accuracy(&self) -> Accuracy {
-        match self {
-            Self::Approximate(_) => Accuracy::Approximate,
-            Self::Exact(_) => Accuracy::Exact,
+    pub(super) fn size(&self) -> u64 {
+        *match self {
+            Self::Approximate(size) => size,
+            Self::Exact(size) => size,
        }
    }
 }
@@ -116,42 +78,36 @@ impl CurrentLogicalSize {
 impl LogicalSize {
    pub(super) fn empty_initial() -> Self {
        Self {
-            initial_logical_size: OnceCell::with_value((0, {
-                crate::metrics::initial_logical_size::START_CALCULATION
-                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
-                    .calculation_result_saved()
-            })),
-            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
+            initial_logical_size: OnceCell::with_value(0),
+            // initial_logical_size is already computed, so create the semaphore with zero permits: no calculation is ever admitted
+            initial_size_computation: Arc::new(Semaphore::new(0)),
            initial_part_end: None,
            size_added_after_initial: AtomicI64::new(0),
-            did_return_approximate_to_walreceiver: AtomicBool::new(false),
        }
    }

    pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
        Self {
            initial_logical_size: OnceCell::new(),
-            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
+            initial_size_computation: Arc::new(Semaphore::new(1)),
            initial_part_end: Some(compute_to),
            size_added_after_initial: AtomicI64::new(0),
-            did_return_approximate_to_walreceiver: AtomicBool::new(false),
        }
    }

-    pub(super) fn current_size(&self) -> CurrentLogicalSize {
+    pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
        //                  ^^^ keep this type explicit so that the casts in this function break if
        //                  we change the type.
        match self.initial_logical_size.get() {
-            Some((initial_size, _)) => {
-                CurrentLogicalSize::Exact(Exact(initial_size.checked_add_signed(size_increment)
+            Some(initial_size) => {
+                initial_size.checked_add_signed(size_increment)
                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .unwrap()))
+                    .map(CurrentLogicalSize::Exact)
            }
            None => {
-
                let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                CurrentLogicalSize::Approximate(Approximate(non_negative_size_increment))
+                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
            }
        }
    }
@@ -165,7 +121,7 @@ impl LogicalSize {
    /// available for re-use. This doesn't contain the incremental part.
    pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
        match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().map(|(s, _)| *s),
+            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
            _ => None,
        }
    }
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -396,15 +396,11 @@ pub(super) async fn handle_walreceiver_connection(

            // Send the replication feedback message.
            // Regular standby_status_update fields are put into this message.
-            let current_timeline_size = timeline
-                .get_current_logical_size(
-                    crate::tenant::timeline::GetLogicalSizePriority::User,
-                    &ctx,
-                )
-                // FIXME: https://github.com/neondatabase/neon/issues/5963
-                .size_dont_care_about_accuracy();
+            let (timeline_logical_size, _) = timeline
+                .get_current_logical_size(&ctx)
+                .context("Status update creation failed to get current logical size")?;
            let status_update = PageserverFeedback {
-                current_timeline_size,
+                current_timeline_size: timeline_logical_size,
                last_received_lsn,
                disk_consistent_lsn,
                remote_consistent_lsn,
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -1437,16 +1437,7 @@ impl<'a> WalIngest<'a> {
        // record.
        // TODO: would be nice if to be more explicit about it
        let last_lsn = modification.lsn;
-
-        // Get current size and put rel creation if rel doesn't exist
-        //
-        // NOTE: we check the cache first even though get_rel_exists and get_rel_size would
-        //       check the cache too. This is because eagerly checking the cache results in
-        //       less work overall and 10% better performance. It's more work on cache miss
-        //       but cache miss is rare.
-        let old_nblocks = if let Some(nblocks) = self.timeline.get_cached_rel_size(&rel, last_lsn) {
-            nblocks
-        } else if !self
+        let old_nblocks = if !self
            .timeline
            .get_rel_exists(rel, last_lsn, true, ctx)
            .await?
@@ -2124,7 +2115,7 @@ mod tests {
            .load()
            .await;
        let tline = tenant
-            .bootstrap_timeline(TIMELINE_ID, pg_version, None, &ctx)
+            .bootstrap_timeline(TIMELINE_ID, pg_version, &ctx)
            .await
            .unwrap();

--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -34,6 +34,7 @@ use std::process::{Child, ChildStdin, ChildStdout, Command};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

@@ -123,9 +124,7 @@ impl PostgresRedoManager {
    /// The WAL redo is handled by a separate thread, so this just sends a request
    /// to the thread and waits for response.
    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
+    /// CANCEL SAFETY: NOT CANCEL SAFE.
    pub async fn request_redo(
        &self,
        key: Key,
@@ -158,6 +157,7 @@ impl PostgresRedoManager {
                        self.conf.wal_redo_timeout,
                        pg_version,
                    )
+                    .await
                };
                img = Some(result?);

@@ -178,6 +178,7 @@ impl PostgresRedoManager {
                self.conf.wal_redo_timeout,
                pg_version,
            )
+            .await
        }
    }
 }
@@ -215,7 +216,7 @@ impl PostgresRedoManager {
    /// Process one request for WAL redo using wal-redo postgres
    ///
    #[allow(clippy::too_many_arguments)]
-    fn apply_batch_postgres(
+    async fn apply_batch_postgres(
        &self,
        key: Key,
        lsn: Lsn,
@@ -331,7 +332,12 @@ impl PostgresRedoManager {
                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
                // This probably needs revisiting at some later point.
+                let mut wait_done = proc.stderr_logger_task_done.clone();
                drop(proc);
+                wait_done
+                    .wait_for(|v| *v)
+                    .await
+                    .expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
            } else if n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
            }
@@ -643,6 +649,8 @@ struct WalRedoProcess {
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
    stdin: Mutex<ProcessInput>,
+    stderr_logger_cancel: CancellationToken,
+    stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
    /// Counter to separate same sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
@@ -691,8 +699,6 @@ impl WalRedoProcess {
        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
        macro_rules! set_nonblock_or_log_err {
            ($file:ident) => {{
                let res = set_nonblock($file.as_raw_fd());
@@ -704,45 +710,69 @@ impl WalRedoProcess {
        }
        set_nonblock_or_log_err!(stdin)?;
        set_nonblock_or_log_err!(stdout)?;
+        set_nonblock_or_log_err!(stderr)?;
+
+        let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;

        // all fallible operations post-spawn are complete, so get rid of the guard
        let child = scopeguard::ScopeGuard::into_inner(child);

-        tokio::spawn(
+        let stderr_logger_cancel = CancellationToken::new();
+        let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
+            tokio::sync::watch::channel(false);
+        tokio::spawn({
+            let stderr_logger_cancel = stderr_logger_cancel.clone();
            async move {
                scopeguard::defer! {
                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                    let _ = stderr_logger_task_done_tx.send(true);
                }
                debug!("wal-redo-postgres stderr_logger_task started");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-                use tokio::io::AsyncBufReadExt;
-                let mut stderr_lines = tokio::io::BufReader::new(stderr);
-                let mut buf = Vec::new();
-                let res = loop {
-                    buf.clear();
-                    // TODO we don't trust the process to cap its stderr length.
-                    // Currently it can do unbounded Vec allocation.
-                    match stderr_lines.read_until(b'\n', &mut buf).await {
-                        Ok(0) => break Ok(()), // eof
-                        Ok(num_bytes) => {
-                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
+                loop {
+                    // NB: we purposefully don't do a select! for the cancellation here.
+                    // The cancellation would likely cause us to miss stderr messages.
+                    // We can rely on this to return from .await because when we SIGKILL
+                    // the child, the writing end of the stderr pipe gets closed.
+                    match stderr.readable_mut().await {
+                        Ok(mut guard) => {
+                            let mut errbuf = [0; 16384];
+                            let res = guard.try_io(|fd| {
+                                use std::io::Read;
+                                fd.get_mut().read(&mut errbuf)
+                            });
+                            match res {
+                                Ok(Ok(0)) => {
+                                    // it closed the stderr pipe
+                                    break;
+                                }
+                                Ok(Ok(n)) => {
+                                    // The message might not be split correctly into lines here. But this is
+                                    // good enough, the important thing is to get the message to the log.
+                                    let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
+                                    error!(output, "received output");
+                                },
+                                Ok(Err(e)) => {
+                                    error!(error = ?e, "read() error, waiting for cancellation");
+                                    stderr_logger_cancel.cancelled().await;
+                                    error!(error = ?e, "read() error, cancellation complete");
+                                    break;
+                                }
+                                Err(e) => {
+                                    let _e: tokio::io::unix::TryIoError = e;
+                                    // the read() returned WouldBlock, that's expected
+                                }
+                            }
                        }
                        Err(e) => {
-                            break Err(e);
+                            error!(error = ?e, "read() error, waiting for cancellation");
+                            stderr_logger_cancel.cancelled().await;
+                            error!(error = ?e, "read() error, cancellation complete");
+                            break;
                        }
                    }
-                };
-                match res {
-                    Ok(()) => (),
-                    Err(e) => {
-                        error!(error=?e, "failed to read from walredo stderr");
-                    }
                }
            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
-        );
+        });

        Ok(Self {
            conf,
@@ -757,6 +787,8 @@ impl WalRedoProcess {
                pending_responses: VecDeque::new(),
                n_processed_responses: 0,
            }),
+            stderr_logger_cancel,
+            stderr_logger_task_done: stderr_logger_task_done_rx,
            #[cfg(feature = "testing")]
            dump_sequence: AtomicUsize::default(),
        })
@@ -997,6 +1029,7 @@ impl Drop for WalRedoProcess {
            .take()
            .expect("we only do this once")
            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        self.stderr_logger_cancel.cancel();
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
 }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -19,6 +19,7 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "storage/buf_internals.h"
+#include "storage/lwlock.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "c.h"
@@ -36,8 +37,6 @@
 #include "walproposer.h"
 #include "neon_utils.h"

-#include <pthread.h>
-
 #define PageStoreTrace DEBUG5

 #define RECONNECT_INTERVAL_USEC 1000000
@@ -70,7 +69,7 @@ int			max_reconnect_attempts = 60;

 typedef struct
 {
-    pthread_rwlock_t lock;
+    LWLockId lock;
    pg_atomic_uint64 update_counter;
    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 } PagestoreShmemState;
@@ -106,10 +105,10 @@ AssignPageserverConnstring(const char *newval, void *extra)
 {
    if(!PagestoreShmemIsValid())
        return;
-    pthread_rwlock_wrlock(&pagestore_shared->lock);
+    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-    pthread_rwlock_unlock(&pagestore_shared->lock);
+    LWLockRelease(pagestore_shared->lock);
 }

 static bool
@@ -120,24 +119,15 @@ CheckConnstringUpdated()
    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
 }

-/* Returns true if the connstring has changed and false if not */
-static bool
+static void
 ReloadConnstring()
 {
    if(!PagestoreShmemIsValid())
-        return false;
-    pthread_rwlock_rdlock(&pagestore_shared->lock);
-
-    if(strcmp(local_pageserver_connstring, pagestore_shared->pageserver_connstring) == 0)
-    {
-        pthread_rwlock_unlock(&pagestore_shared->lock);
-        return false;
-    }
-
+        return;
+    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-    pthread_rwlock_unlock(&pagestore_shared->lock);
-    return true;
+    LWLockRelease(pagestore_shared->lock);
 }

 static bool
@@ -300,6 +290,7 @@ pageserver_disconnect(void)
 	 */
 	if (connected)
 	{
+		neon_log(LOG, "dropping connection to page server due to error");
 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;
 		connected = false;
@@ -320,12 +311,8 @@ pageserver_send(NeonRequest * request)

        if(CheckConnstringUpdated())
        {
-            bool should_disconnect = ReloadConnstring();
-            if(should_disconnect)
-            {
-                neon_log(LOG, "pageserver_send disconnect because connstring changed");
-                pageserver_disconnect();
-            }
+            pageserver_disconnect();
+            ReloadConnstring();
        }

 	/* If the connection was lost for some reason, reconnect */
@@ -497,12 +484,7 @@ PagestoreShmemInit(void)
                                       &found);
    if(!found)
    {
-        pthread_rwlockattr_t attr;
-        pthread_rwlockattr_init(&attr);
-        pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); 
-        pthread_rwlock_init(&pagestore_shared->lock, &attr);
-        pthread_rwlockattr_destroy(&attr);
-
+        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
        AssignPageserverConnstring(page_server_connstring, NULL);
    }
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -59,7 +59,6 @@
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
-#include "storage/fsm_internals.h"
 #include "storage/smgr.h"
 #include "storage/md.h"
 #include "pgstat.h"
@@ -2723,86 +2722,6 @@ smgr_init_neon(void)
 }


-static void
-neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
-{
-	BlockNumber relsize;
-	/* Extend the relation if we know its size */
-	if (get_cached_relsize(rinfo, forknum, &relsize))
-	{
-		if (relsize < blkno + 1)
-		{
-			update_cached_relsize(rinfo, forknum, blkno + 1);
-			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-		}
-	}
-	else
-	{
-		/*
-		 * Size was not cached. We populate the cache now, with the size of the
-		 * relation measured after this WAL record is applied.
-		 *
-		 * This length is later reused when we open the smgr to read the block,
-		 * which is fine and expected.
-		 */
-
-		NeonResponse *response;
-		NeonNblocksResponse *nbresponse;
-		NeonNblocksRequest request = {
-			.req = (NeonRequest) {
-				.lsn = end_recptr,
-				.latest = false,
-				.tag = T_NeonNblocksRequest,
-			},
-			.rinfo = rinfo,
-			.forknum = forknum,
-		};
-
-		response = page_server_request(&request);
-
-		Assert(response->tag == T_NeonNblocksResponse);
-		nbresponse = (NeonNblocksResponse *) response;
-
-		relsize = Max(nbresponse->n_blocks, blkno+1);
-
-		set_cached_relsize(rinfo, forknum, relsize);
-		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-
-		elog(SmgrTrace, "Set length to %d", relsize);
-	}
-}
-
-#define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)
-
-/*
- * TODO: May be it is better to make correspondent fgunctio from freespace.c public?
- */
-static BlockNumber
-get_fsm_physical_block(BlockNumber heapblk)
-{
-	BlockNumber pages;
-	int			leafno;
-	int			l;
-
-	/*
-	 * Calculate the logical page number of the first leaf page below the
-	 * given page.
-	 */
-	leafno = heapblk / SlotsPerFSMPage;
-
-	/* Count upper level nodes required to address the leaf page */
-	pages = 0;
-	for (l = 0; l < FSM_TREE_DEPTH; l++)
-	{
-		pages += leafno + 1;
-		leafno /= SlotsPerFSMPage;
-	}
-
-	/* Turn the page count into 0-based block number */
-	return pages - 1;
-}
-
-
 /*
 * Return whether we can skip the redo for this block.
 * 
@@ -2850,6 +2769,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	LWLock	   *partitionLock;
 	Buffer		buffer;
 	bool		no_redo_needed;
+	BlockNumber relsize;

 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
 		return true;
@@ -2899,10 +2819,49 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 	LWLockRelease(partitionLock);

-	neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
-	if (forknum == MAIN_FORKNUM)
+	/* Extend the relation if we know its size */
+	if (get_cached_relsize(rinfo, forknum, &relsize))
 	{
-		neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
+		if (relsize < blkno + 1)
+		{
+			update_cached_relsize(rinfo, forknum, blkno + 1);
+			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+		}
 	}
+	else
+	{
+		/*
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
+		 *
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
+		 */
+
+		NeonResponse *response;
+		NeonNblocksResponse *nbresponse;
+		NeonNblocksRequest request = {
+			.req = (NeonRequest) {
+				.lsn = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
+			},
+			.rinfo = rinfo,
+			.forknum = forknum,
+		};
+
+		response = page_server_request(&request);
+
+		Assert(response->tag == T_NeonNblocksResponse);
+		nbresponse = (NeonNblocksResponse *) response;
+
+		Assert(nbresponse->n_blocks > blkno);
+
+		set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
+		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+
+		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
+	}
+
 	return no_redo_needed;
 }
--- a/poetry.lock
+++ b/poetry.lock
@@ -1967,18 +1967,18 @@ pytest = [

 [[package]]
 name = "pytest-rerunfailures"
-version = "13.0"
+version = "11.1.2"
 description = "pytest plugin to re-run tests to eliminate flaky failures"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "pytest-rerunfailures-13.0.tar.gz", hash = "sha256:e132dbe420bc476f544b96e7036edd0a69707574209b6677263c950d19b09199"},
-    {file = "pytest_rerunfailures-13.0-py3-none-any.whl", hash = "sha256:34919cb3fcb1f8e5d4b940aa75ccdea9661bade925091873b7c6fa5548333069"},
+    {file = "pytest-rerunfailures-11.1.2.tar.gz", hash = "sha256:55611661e873f1cafa384c82f08d07883954f4b76435f4b8a5b470c1954573de"},
+    {file = "pytest_rerunfailures-11.1.2-py3-none-any.whl", hash = "sha256:d21fe2e46d9774f8ad95f1aa799544ae95cac3a223477af94aa985adfae92b7e"},
 ]

 [package.dependencies]
 packaging = ">=17.1"
-pytest = ">=7"
+pytest = ">=5.3"

 [[package]]
 name = "pytest-split"
@@ -2476,6 +2476,16 @@ files = [
    {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = "sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"},
    {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"},
    {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"},
+    {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"},
+    {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"},
+    {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"},
+    {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"},
+    {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = "sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"},
    {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"},
@@ -2697,4 +2707,4 @@ cffi = ["cffi (>=1.11)"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "9f33b4404dbb9803ede5785469241dde1d09132427b87db8928bdbc37ccd6b7a"
+content-hash = "25ffa9ed98d890a3b85e6036792296a60bb705e8f9eaa1f07336501116a58756"
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -69,7 +69,6 @@ webpki-roots.workspace = true
 x509-parser.workspace = true
 native-tls.workspace = true
 postgres-native-tls.workspace = true
-smol_str.workspace = true

 workspace_hack.workspace = true
 tokio-util.workspace = true
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -106,7 +106,7 @@ pub(super) async fn authenticate(
        reported_auth_ok: true,
        value: NodeInfo {
            config,
-            aux: db_info.aux,
+            aux: db_info.aux.into(),
            allow_self_signed_compute: false, // caller may override
        },
    })
--- a/proxy/src/bin/pg_sni_router.rs
+++ b/proxy/src/bin/pg_sni_router.rs
@@ -284,5 +284,5 @@ async fn handle_client(
    let client = tokio::net::TcpStream::connect(destination).await?;

    let metrics_aux: MetricsAuxInfo = Default::default();
-    proxy::proxy::proxy_pass(tls_stream, client, metrics_aux).await
+    proxy::proxy::proxy_pass(tls_stream, client, &metrics_aux).await
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -103,7 +103,7 @@ struct ProxyCliArgs {
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
    /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_dynamic_rate_limiter: bool,
    /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
    #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
--- a/proxy/src/console/messages.rs
+++ b/proxy/src/console/messages.rs
@@ -1,5 +1,4 @@
 use serde::Deserialize;
-use smol_str::SmolStr;
 use std::fmt;

 /// Generic error response with human-readable description.
@@ -89,11 +88,11 @@ impl fmt::Debug for DatabaseInfo {

 /// Various labels for prometheus metrics.
 /// Also known as `ProxyMetricsAuxInfo` in the console.
-#[derive(Debug, Deserialize, Clone, Default)]
+#[derive(Debug, Deserialize, Default)]
 pub struct MetricsAuxInfo {
-    pub endpoint_id: SmolStr,
-    pub project_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: Box<str>,
+    pub project_id: Box<str>,
+    pub branch_id: Box<str>,
 }

 impl MetricsAuxInfo {
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -229,7 +229,7 @@ pub struct NodeInfo {
    pub config: compute::ConnCfg,

    /// Labels for proxy's metrics.
-    pub aux: MetricsAuxInfo,
+    pub aux: Arc<MetricsAuxInfo>,

    /// Whether we should accept self-signed certificates (for testing)
    pub allow_self_signed_compute: bool,
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -144,7 +144,7 @@ impl Api {

            let node = NodeInfo {
                config,
-                aux: body.aux,
+                aux: body.aux.into(),
                allow_self_signed_compute: false,
            };

--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -134,9 +134,9 @@ pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {

 pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
-        "proxy_control_plane_token_acquire_seconds",
+        "semaphore_control_plane_token_acquire_seconds",
        "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.05ms = 2.15s
+        // largest bucket = 3^15 * 0.00005s (i.e. 0.05ms) = ~717s
        exponential_buckets(0.00005, 3.0, 16).unwrap(),
    )
    .unwrap()
@@ -877,11 +877,11 @@ async fn prepare_client_connection(
 pub async fn proxy_pass(
    client: impl AsyncRead + AsyncWrite + Unpin,
    compute: impl AsyncRead + AsyncWrite + Unpin,
-    aux: MetricsAuxInfo,
+    aux: &MetricsAuxInfo,
 ) -> anyhow::Result<()> {
    let usage = USAGE_METRICS.register(Ids {
-        endpoint_id: aux.endpoint_id.clone(),
-        branch_id: aux.branch_id.clone(),
+        endpoint_id: aux.endpoint_id.to_string(),
+        branch_id: aux.branch_id.to_string(),
    });

    let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);
@@ -1032,7 +1032,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
        // immediately after opening the connection.
        let (stream, read_buf) = stream.into_inner();
        node.stream.write_all(&read_buf).await?;
-        proxy_pass(stream, node.stream, aux).await
+        proxy_pass(stream, node.stream, &aux).await
    }
 }

--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -8,7 +8,6 @@ use pbkdf2::{
    Params, Pbkdf2,
 };
 use pq_proto::StartupMessageParams;
-use smol_str::SmolStr;
 use std::{collections::HashMap, net::SocketAddr, sync::Arc};
 use std::{
    fmt,
@@ -42,16 +41,16 @@ const MAX_CONNS_PER_ENDPOINT: usize = 20;

 #[derive(Debug, Clone)]
 pub struct ConnInfo {
-    pub username: SmolStr,
-    pub dbname: SmolStr,
-    pub hostname: SmolStr,
-    pub password: SmolStr,
-    pub options: Option<SmolStr>,
+    pub username: String,
+    pub dbname: String,
+    pub hostname: String,
+    pub password: String,
+    pub options: Option<String>,
 }

 impl ConnInfo {
    // hm, change to hasher to avoid cloning?
-    pub fn db_and_user(&self) -> (SmolStr, SmolStr) {
+    pub fn db_and_user(&self) -> (String, String) {
        (self.dbname.clone(), self.username.clone())
    }
 }
@@ -71,7 +70,7 @@ struct ConnPoolEntry {
 // Per-endpoint connection pool, (dbname, username) -> DbUserConnPool
 // Number of open connections is limited by the `max_conns_per_endpoint`.
 pub struct EndpointConnPool {
-    pools: HashMap<(SmolStr, SmolStr), DbUserConnPool>,
+    pools: HashMap<(String, String), DbUserConnPool>,
    total_conns: usize,
 }

@@ -96,7 +95,7 @@ pub struct GlobalConnPool {
    //
    // That should be a fairly conteded map, so return reference to the per-endpoint
    // pool as early as possible and release the lock.
-    global_pool: DashMap<SmolStr, Arc<RwLock<EndpointConnPool>>>,
+    global_pool: DashMap<String, Arc<RwLock<EndpointConnPool>>>,

    /// [`DashMap::len`] iterates over all inner pools and acquires a read lock on each.
    /// That seems like far too much effort, so we're using a relaxed increment counter instead.
@@ -328,7 +327,7 @@ impl GlobalConnPool {
        Ok(())
    }

-    fn get_or_create_endpoint_pool(&self, endpoint: &SmolStr) -> Arc<RwLock<EndpointConnPool>> {
+    fn get_or_create_endpoint_pool(&self, endpoint: &String) -> Arc<RwLock<EndpointConnPool>> {
        // fast path
        if let Some(pool) = self.global_pool.get(endpoint) {
            return pool.clone();
@@ -469,7 +468,7 @@ async fn connect_to_compute_once(

    let (client, mut connection) = config
        .user(&conn_info.username)
-        .password(&*conn_info.password)
+        .password(&conn_info.password)
        .dbname(&conn_info.dbname)
        .connect_timeout(timeout)
        .connect(tokio_postgres::NoTls)
@@ -483,8 +482,8 @@ async fn connect_to_compute_once(
        info!(%conn_info, %session, "new connection");
    });
    let ids = Ids {
-        endpoint_id: node_info.aux.endpoint_id.clone(),
-        branch_id: node_info.aux.branch_id.clone(),
+        endpoint_id: node_info.aux.endpoint_id.to_string(),
+        branch_id: node_info.aux.branch_id.to_string(),
    };

    tokio::spawn(
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -182,16 +182,16 @@ fn get_conn_info(

    for (key, value) in pairs {
        if key == "options" {
-            options = Some(value.into());
+            options = Some(value.to_string());
            break;
        }
    }

    Ok(ConnInfo {
-        username: username.into(),
-        dbname: dbname.into(),
-        hostname: hostname.into(),
-        password: password.into(),
+        username: username.to_owned(),
+        dbname: dbname.to_owned(),
+        hostname: hostname.to_owned(),
+        password: password.to_owned(),
        options,
    })
 }
--- a/proxy/src/usage_metrics.rs
+++ b/proxy/src/usage_metrics.rs
@@ -6,7 +6,6 @@ use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_S
 use dashmap::{mapref::entry::Entry, DashMap};
 use once_cell::sync::Lazy;
 use serde::{Deserialize, Serialize};
-use smol_str::SmolStr;
 use std::{
    convert::Infallible,
    sync::{
@@ -30,8 +29,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
 /// because we enrich the event with project_id in the control-plane endpoint.
 #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
 pub struct Ids {
-    pub endpoint_id: SmolStr,
-    pub branch_id: SmolStr,
+    pub endpoint_id: String,
+    pub branch_id: String,
 }

 #[derive(Debug)]
@@ -291,8 +290,8 @@ mod tests {

        // register a new counter
        let counter = metrics.register(Ids {
-            endpoint_id: "e1".into(),
-            branch_id: "b1".into(),
+            endpoint_id: "e1".to_string(),
+            branch_id: "b1".to_string(),
        });

        // the counter should be observed despite 0 egress
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -34,7 +34,7 @@ types-psutil = "^5.9.5.12"
 types-toml = "^0.10.8.6"
 pytest-httpserver = "^1.0.8"
 aiohttp = "3.9.0"
-pytest-rerunfailures = "^13.0"
+pytest-rerunfailures = "^11.1.2"
 types-pytest-lazy-fixture = "^0.6.3.3"
 pytest-split = "^0.8.1"
 zstandard = "^0.21.0"
--- a/safekeeper/src/http/routes.rs
+++ b/safekeeper/src/http/routes.rs
@@ -178,11 +178,9 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
        system_id: request_data.system_id.unwrap_or(0),
        wal_seg_size: request_data.wal_seg_size.unwrap_or(WAL_SEGMENT_SIZE as u32),
    };
-    let local_start_lsn = request_data.local_start_lsn.unwrap_or_else(|| {
-        request_data
-            .commit_lsn
-            .segment_lsn(server_info.wal_seg_size as usize)
-    });
+    let local_start_lsn = request_data
+        .local_start_lsn
+        .unwrap_or(request_data.commit_lsn);
    GlobalTimelines::create(ttid, server_info, request_data.commit_lsn, local_start_lsn)
        .await
        .map_err(ApiError::InternalServerError)?;
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -914,14 +914,9 @@ where
        Ok(())
    }

-    /// Persist in-memory state of control file to disk.
-    //
-    // TODO: passing inmem_remote_consistent_lsn everywhere is ugly, better
-    // separate state completely and give Arc to all those who need it.
-    pub async fn persist_inmem(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
-        let mut state = self.state.clone();
-        state.remote_consistent_lsn = inmem_remote_consistent_lsn;
-        self.persist_control_file(state).await
+    /// Persist the control file to disk. Called only after timeline creation (bootstrap).
+    pub async fn persist(&mut self) -> Result<()> {
+        self.persist_control_file(self.state.clone()).await
    }

    /// Persist in-memory state to the disk, taking other data from state.
@@ -935,7 +930,7 @@ where

    /// Persist control file if there is something to save and enough time
    /// passed after the last save.
-    pub async fn maybe_persist_inmem_control_file(
+    pub async fn maybe_persist_control_file(
        &mut self,
        inmem_remote_consistent_lsn: Lsn,
    ) -> Result<()> {
@@ -948,7 +943,9 @@ where
            || self.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
            || inmem_remote_consistent_lsn > self.state.remote_consistent_lsn;
        if need_persist {
-            self.persist_inmem(inmem_remote_consistent_lsn).await?;
+            let mut state = self.state.clone();
+            state.remote_consistent_lsn = inmem_remote_consistent_lsn;
+            self.persist_control_file(state).await?;
            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
        }
        Ok(())
@@ -1067,6 +1064,8 @@ where

        if sync_control_file {
            let mut state = self.state.clone();
+            // Note: the remote_consistent_lsn update in the control file could be
+            // made common by storing an Arc to walsenders in Safekeeper.
            state.remote_consistent_lsn = new_remote_consistent_lsn;
            self.persist_control_file(state).await?;
        }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -182,9 +182,8 @@ impl SharedState {
    }

    /// Mark timeline active/inactive and return whether s3 offloading requires
-    /// start/stop action. If timeline is deactivated, control file is persisted
-    /// as maintenance task does that only for active timelines.
-    async fn update_status(
+    /// start/stop action.
+    fn update_status(
        &mut self,
        num_computes: usize,
        remote_consistent_lsn: Lsn,
@@ -192,15 +191,7 @@ impl SharedState {
    ) -> bool {
        let is_active = self.is_active(num_computes, remote_consistent_lsn);
        if self.active != is_active {
-            info!(
-                "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
-                ttid, is_active, remote_consistent_lsn, self.sk.inmem.commit_lsn
-            );
-            if !is_active {
-                if let Err(e) = self.sk.persist_inmem(remote_consistent_lsn).await {
-                    warn!("control file save in update_status failed: {:?}", e);
-                }
-            }
+            info!("timeline {} active={} now", ttid, is_active);
        }
        self.active = is_active;
        self.is_wal_backup_action_pending(num_computes)
@@ -447,7 +438,7 @@ impl Timeline {
        fs::create_dir_all(&self.timeline_dir).await?;

        // Write timeline to disk and start background tasks.
-        if let Err(e) = shared_state.sk.persist_inmem(Lsn::INVALID).await {
+        if let Err(e) = shared_state.sk.persist().await {
            // Bootstrap failed, cancel timeline and remove timeline directory.
            self.cancel(shared_state);

@@ -520,14 +511,12 @@ impl Timeline {
        self.mutex.lock().await
    }

-    async fn update_status(&self, shared_state: &mut SharedState) -> bool {
-        shared_state
-            .update_status(
-                self.walreceivers.get_num(),
-                self.get_walsenders().get_remote_consistent_lsn(),
-                self.ttid,
-            )
-            .await
+    fn update_status(&self, shared_state: &mut SharedState) -> bool {
+        shared_state.update_status(
+            self.walreceivers.get_num(),
+            self.get_walsenders().get_remote_consistent_lsn(),
+            self.ttid,
+        )
    }

    /// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
@@ -537,7 +526,7 @@ impl Timeline {
        }
        let is_wal_backup_action_pending: bool = {
            let mut shared_state = self.write_shared_state().await;
-            self.update_status(&mut shared_state).await
+            self.update_status(&mut shared_state)
        };
        if is_wal_backup_action_pending {
            // Can fail only if channel to a static thread got closed, which is not normal at all.
@@ -694,7 +683,7 @@ impl Timeline {
            shared_state.sk.record_safekeeper_info(&sk_info).await?;
            let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
            shared_state.peers_info.upsert(&peer_info);
-            is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
+            is_wal_backup_action_pending = self.update_status(&mut shared_state);
            commit_lsn = shared_state.sk.inmem.commit_lsn;
        }
        self.commit_lsn_watch_tx.send(commit_lsn)?;
@@ -839,7 +828,7 @@ impl Timeline {
        self.write_shared_state()
            .await
            .sk
-            .maybe_persist_inmem_control_file(remote_consistent_lsn)
+            .maybe_persist_control_file(remote_consistent_lsn)
            .await
    }

--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -3029,11 +3029,6 @@ def get_test_output_dir(request: FixtureRequest, top_output_dir: Path) -> Path:
    """Compute the working directory for an individual test."""
    test_name = request.node.name
    test_dir = top_output_dir / test_name.replace("/", "-")
-
-    # We rerun flaky tests multiple times, use a separate directory for each run.
-    if (suffix := getattr(request.node, "execution_count", None)) is not None:
-        test_dir = test_dir.parent / f"{test_dir.name}-{suffix}"
-
    log.info(f"get_test_output_dir is {test_dir}")
    # make mypy happy
    assert isinstance(test_dir, Path)
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -260,14 +260,6 @@ class PageserverHttpClient(requests.Session):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
        self.verbose_error(res)

-    def tenant_reset(self, tenant_id: TenantId, drop_cache: bool):
-        params = {}
-        if drop_cache:
-            params["drop_cache"] = "true"
-
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
-        self.verbose_error(res)
-
    def tenant_delete(self, tenant_id: TenantId):
        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
        self.verbose_error(res)
@@ -370,16 +362,12 @@ class PageserverHttpClient(requests.Session):
        new_timeline_id: TimelineId,
        ancestor_timeline_id: Optional[TimelineId] = None,
        ancestor_start_lsn: Optional[Lsn] = None,
-        existing_initdb_timeline_id: Optional[TimelineId] = None,
        **kwargs,
    ) -> Dict[Any, Any]:
        body: Dict[str, Any] = {
            "new_timeline_id": str(new_timeline_id),
            "ancestor_start_lsn": str(ancestor_start_lsn) if ancestor_start_lsn else None,
            "ancestor_timeline_id": str(ancestor_timeline_id) if ancestor_timeline_id else None,
-            "existing_initdb_timeline_id": str(existing_initdb_timeline_id)
-            if existing_initdb_timeline_id
-            else None,
        }
        if pg_version != PgVersion.NOT_SET:
            body["pg_version"] = int(pg_version)
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,7 +1,7 @@
 import time
-from typing import TYPE_CHECKING, Any, Dict, List, Optional
+from typing import TYPE_CHECKING, Any, Dict, Optional

-from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef, ObjectTypeDef
+from mypy_boto3_s3.type_defs import ListObjectsV2OutputTypeDef

 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
@@ -235,14 +235,10 @@ if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnvBuilder


-def assert_prefix_empty(
-    neon_env_builder: "NeonEnvBuilder",
-    prefix: Optional[str] = None,
-    allowed_postfix: Optional[str] = None,
-):
+def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
    response = list_prefix(neon_env_builder, prefix)
    keys = response["KeyCount"]
-    objects: List[ObjectTypeDef] = response.get("Contents", [])
+    objects = response.get("Contents", [])
    common_prefixes = response.get("CommonPrefixes", [])

    remote_storage = neon_env_builder.pageserver_remote_storage
@@ -265,18 +261,7 @@ def assert_prefix_empty(
                f"contradicting ListObjectsV2 response with KeyCount={keys} and Contents={objects}, CommonPrefixes={common_prefixes}"
            )

-    filtered_count = 0
-    if allowed_postfix is None:
-        filtered_count = len(objects)
-    else:
-        for _obj in objects:
-            key: str = str(response.get("Key", []))
-            if not (allowed_postfix.endswith(key)):
-                filtered_count += 1
-
-    assert (
-        filtered_count == 0
-    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+    assert keys == 0, f"remote dir with prefix {prefix} is not empty after deletion: {objects}"


 def assert_prefix_not_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
--- a/test_runner/pg_clients/test_pg_clients.py
+++ b/test_runner/pg_clients/test_pg_clients.py
@@ -48,6 +48,6 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st
    subprocess_capture(test_output_dir, build_cmd, check=True)

    run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag]
-    _, output, _ = subprocess_capture(test_output_dir, run_cmd, check=True, capture_stdout=True)
+    basepath, _, _ = subprocess_capture(test_output_dir, run_cmd, check=True)

-    assert str(output).strip() == "1"
+    assert Path(f"{basepath}.stdout").read_text().strip() == "1"
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -114,6 +114,7 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
        [
            ".*Failed to process timeline dir contents.*Timeline has no ancestor and no layer files.*",
            ".*Timeline got dropped without initializing, cleaning its files.*",
+            ".*Failed to load index_part from remote storage, failed creation?.*",
        ]
    )

@@ -143,13 +144,8 @@ def test_timeline_init_break_before_checkpoint(neon_env_builder: NeonEnvBuilder)
    ), "pageserver should clean its temp timeline files on timeline creation failure"


-# The "exit" case is for a reproducer of issue 6007: an unclean shutdown where we can't do local fs cleanups
-@pytest.mark.parametrize("exit_or_return", ["return", "exit"])
-def test_timeline_init_break_before_checkpoint_recreate(
-    neon_env_builder: NeonEnvBuilder, exit_or_return: str
-):
-    env = neon_env_builder.init_configs()
-    env.start()
+def test_timeline_init_break_before_checkpoint_recreate(neon_env_builder: NeonEnvBuilder):
+    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
@@ -160,7 +156,6 @@ def test_timeline_init_break_before_checkpoint_recreate(
        ]
    )

-    pageserver_http.tenant_create(env.initial_tenant)
    tenant_id = env.initial_tenant

    timelines_dir = env.pageserver.timeline_dir(tenant_id)
@@ -171,17 +166,13 @@ def test_timeline_init_break_before_checkpoint_recreate(
    timeline_id = TimelineId("1080243c1f76fe3c5147266663c9860b")

    # Introduce failpoint during timeline init (some intermediate files are on disk), before it's checkpointed.
-    failpoint = "before-checkpoint-new-timeline"
-    pattern = failpoint
-    if exit_or_return == "exit":
-        # in reality a read error happens, but there are automatic retries which now fail because pageserver is dead
-        pattern = "Connection aborted."
+    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "return"))
+    with pytest.raises(Exception, match="before-checkpoint-new-timeline"):
+        _ = env.neon_cli.create_timeline(
+            "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
+        )

-    pageserver_http.configure_failpoints((failpoint, exit_or_return))
-    with pytest.raises(Exception, match=pattern):
-        _ = pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)
-
-    # Restart the page server (with the failpoint disabled)
+    # Restart the page server
    env.pageserver.restart(immediate=True)

    # Creating the timeline didn't finish. The other timelines on tenant should still be present and work normally.
@@ -195,9 +186,11 @@ def test_timeline_init_break_before_checkpoint_recreate(
        timeline_dirs == initial_timeline_dirs
    ), "pageserver should clean its temp timeline files on timeline creation failure"

+    # Disable the failpoint again
+    pageserver_http.configure_failpoints(("before-checkpoint-new-timeline", "off"))
    # creating the branch should have worked now
-    new_timeline_id = TimelineId(
-        pageserver_http.timeline_create(env.pg_version, tenant_id, timeline_id)["timeline_id"]
+    new_timeline_id = env.neon_cli.create_timeline(
+        "test_timeline_init_break_before_checkpoint", tenant_id, timeline_id
    )

    assert timeline_id == new_timeline_id
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -60,10 +60,7 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    execute("SELECT count(*) FROM foo")
    assert fetchone() == (100000,)

-    # Reconfigure it using the same connstring just to make sure nothing breaks
-    # as we have special handling for if the connstring doesn't change
-    for _ in range(5):
-        endpoint.reconfigure(pageserver_id=alt_pageserver_id)
+    endpoint.reconfigure(pageserver_id=alt_pageserver_id)

    # Verify that the neon.pageserver_connstring GUC is set to the correct thing
    execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -49,7 +49,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
        "compaction_period": "0s",  # we want to control when compaction runs
        "checkpoint_timeout": "24h",  # something we won't reach
        "checkpoint_distance": f"{50 * (1024**2)}",  # something we won't reach, we checkpoint manually
-        "image_creation_threshold": "100",  # we want to control when image is created
+        "image_creation_threshold": f"{image_creation_threshold}",
        "compaction_threshold": f"{l0_l1_threshold}",
        "compaction_target_size": f"{128 * (1024**3)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
    }
@@ -124,10 +124,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
    ), "sanity check for what above loop is supposed to do"

    # create the image layer from the future
-    ps_http.patch_tenant_config_client_side(
-        tenant_id, {"image_creation_threshold": image_creation_threshold}, None
-    )
-    assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1
    ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)
    assert (
        len(
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -384,7 +384,7 @@ def test_download_remote_layers_api(
    env.pageserver.allowed_errors.extend(
        [
            ".*download failed: downloading evicted layer file failed.*",
-            f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed",
+            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
        ]
    )

--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -106,6 +106,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
        # Initial tenant load should reflect the delay we injected
        ("initial_tenant_load", lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p),
        # Subsequent steps should occur in expected order
+        ("initial_logical_sizes", lambda t, p: t > 0 and t >= p),
        ("background_jobs_can_start", lambda t, p: t > 0 and t >= p),
        ("complete", lambda t, p: t > 0 and t >= p),
    ]
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -1,29 +0,0 @@
-import random
-import time
-
-from fixtures.neon_fixtures import NeonEnv
-
-
-def test_physical_replication(neon_simple_env: NeonEnv):
-    env = neon_simple_env
-    n_records = 100000
-    with env.endpoints.create_start(
-        branch_name="main",
-        endpoint_id="primary",
-    ) as primary:
-        with primary.connect() as p_con:
-            with p_con.cursor() as p_cur:
-                p_cur.execute(
-                    "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))"
-                )
-        time.sleep(1)
-        with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
-            with primary.connect() as p_con:
-                with p_con.cursor() as p_cur:
-                    with secondary.connect() as s_con:
-                        with s_con.cursor() as s_cur:
-                            for pk in range(n_records):
-                                p_cur.execute("insert into t (pk) values (%s)", (pk,))
-                                s_cur.execute(
-                                    "select * from t where pk=%s", (random.randrange(1, n_records),)
-                                )
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -603,12 +603,7 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
    remote_timeline_path = env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id)

-    filtered = [
-        path
-        for path in remote_timeline_path.iterdir()
-        if not (path.name.endswith("initdb.tar.zst"))
-    ]
-    assert len(filtered) == 0
+    assert not list(remote_timeline_path.iterdir())

    # timeline deletion should kill ongoing uploads, so, the metric will be gone
    assert get_queued_count(file_kind="index", op_kind="upload") is None
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -285,7 +285,6 @@ def test_delete_tenant_exercise_crash_safety_failpoints(
                    str(tenant_id),
                )
            ),
-            allowed_postfix="initdb.tar.zst",
        )


--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,5 +1,4 @@
 import asyncio
-import enum
 import random
 import time
 from threading import Thread
@@ -52,20 +51,11 @@ def do_gc_target(
        log.info("gc http thread returning")


-class ReattachMode(str, enum.Enum):
-    REATTACH_EXPLICIT = "explicit"
-    REATTACH_RESET = "reset"
-    REATTACH_RESET_DROP = "reset"
-
-
 # Basic detach and re-attach test
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-@pytest.mark.parametrize(
-    "mode",
-    [ReattachMode.REATTACH_EXPLICIT, ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP],
-)
 def test_tenant_reattach(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, mode: str
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
 ):
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

@@ -110,15 +100,8 @@ def test_tenant_reattach(
        ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
    )

-    if mode == ReattachMode.REATTACH_EXPLICIT:
-        # Explicitly detach then attach the tenant as two separate API calls
-        pageserver_http.tenant_detach(tenant_id)
-        pageserver_http.tenant_attach(tenant_id)
-    elif mode in (ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP):
-        # Use the reset API to detach/attach in one shot
-        pageserver_http.tenant_reset(tenant_id, mode == ReattachMode.REATTACH_RESET_DROP)
-    else:
-        raise NotImplementedError(mode)
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)

    time.sleep(1)  # for metrics propagation

--- a/test_runner/regress/test_tenants.py
+++ b/test_runner/regress/test_tenants.py
@@ -290,12 +290,10 @@ def test_pageserver_with_empty_tenants(

    env = neon_env_builder.init_start()

-    env.pageserver.allowed_errors.extend(
-        [
-            ".*marking .* as locally complete, while it doesnt exist in remote index.*",
-            ".*load failed.*list timelines directory.*",
-        ]
+    env.pageserver.allowed_errors.append(
+        ".*marking .* as locally complete, while it doesnt exist in remote index.*"
    )
+    env.pageserver.allowed_errors.append(".*load failed.*list timelines directory.*")

    client = env.pageserver.http_client()

--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -308,10 +308,8 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
        )

    timeline_dir = env.pageserver.timeline_dir(env.initial_tenant, timeline_id)
-
    # Check local is empty
-    assert (not timeline_dir.exists()) or len(os.listdir(timeline_dir)) == 0
-
+    assert not timeline_dir.exists()
    # Check no delete mark present
    assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists()

--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -146,72 +146,6 @@ def wait_for_pageserver_catchup(endpoint_main: Endpoint, polling_interval=1, tim
        time.sleep(polling_interval)


-def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
-    env = neon_env_builder.init_start()
-    client = env.pageserver.http_client()
-    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")
-
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
-
-    endpoint_main = env.endpoints.create(
-        "test_timeline_size_quota_on_startup",
-        # Set small limit for the test
-        config_lines=["neon.max_cluster_size=30MB"],
-    )
-    endpoint_main.start()
-
-    log.info("postgres is running on 'test_timeline_size_quota_on_startup' branch")
-
-    with closing(endpoint_main.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("CREATE TABLE foo (t text)")
-
-            # Insert many rows. This query must fail because of space limit
-            try:
-                for _i in range(5000):
-                    cur.execute(
-                        """
-                        INSERT INTO foo
-                            SELECT 'long string to consume some space' || g
-                            FROM generate_series(1, 100) g
-                    """
-                    )
-
-                # If we get here, the timeline size limit failed
-                log.error("Query unexpectedly succeeded")
-                raise AssertionError()
-
-            except psycopg2.errors.DiskFull as err:
-                log.info(f"Query expectedly failed with: {err}")
-
-    # Restart endpoint that reached the limit to ensure that it doesn't fail on startup
-    # i.e. the size limit is not enforced during startup.
-    endpoint_main.stop()
-    # don't skip pg_catalog updates - it runs CREATE EXTENSION neon
-    # which is needed for neon.pg_cluster_size() to work
-    endpoint_main.respec(skip_pg_catalog_updates=False)
-    endpoint_main.start()
-
-    # ensure that the limit is enforced after startup
-    with closing(endpoint_main.connect()) as conn:
-        with conn.cursor() as cur:
-            # This query must fail because of space limit
-            try:
-                cur.execute(
-                    """
-                    INSERT INTO foo
-                        SELECT 'long string to consume some space' || g
-                        FROM generate_series(1, 100000) g
-                """
-                )
-                # If we get here, the timeline size limit failed
-                log.error("Query unexpectedly succeeded")
-                raise AssertionError()
-
-            except psycopg2.errors.DiskFull as err:
-                log.info(f"Query expectedly failed with: {err}")
-
-
 def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()
    client = env.pageserver.http_client()
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -30,7 +30,6 @@ from fixtures.neon_fixtures import (
    Safekeeper,
    SafekeeperHttpClient,
    SafekeeperPort,
-    last_flush_lsn_upload,
 )
 from fixtures.pageserver.utils import (
    timeline_delete_wait_completed,
@@ -287,43 +286,29 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
    # wait until remote_consistent_lsn gets advanced on all safekeepers
    clients = [sk.http_client() for sk in env.safekeepers]
    stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
-    log.info(f"statuses before insert: {stat_before}")
+    log.info(f"statuses is {stat_before}")

    endpoint.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'")

-    # wait for remote_consistent_lsn to reach flush_lsn, forcing it with checkpoint
-    new_rcl = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
-    log.info(f"new_rcl: {new_rcl}")
-    endpoint.stop()
+    # force checkpoint in pageserver to advance remote_consistent_lsn
+    wait_lsn_force_checkpoint(tenant_id, timeline_id, endpoint, env.pageserver)

    # and wait till remote_consistent_lsn propagates to all safekeepers
-    #
-    # TODO: this executes long as timeline on safekeeper is immediately
-    # deactivated once rcl reaches pageserver one, and thus we generally wait
-    # till pageserver reconnects to all safekeepers one by one here. Timeline
-    # status on safekeeper should take into account peers state as well.
    started_at = time.time()
    while True:
        stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
-        if all([s_after.remote_consistent_lsn >= new_rcl for s_after in stat_after]):
+        if all(
+            s_after.remote_consistent_lsn > s_before.remote_consistent_lsn
+            for s_after, s_before in zip(stat_after, stat_before)
+        ):
            break
        elapsed = time.time() - started_at
-        if elapsed > 30:
+        if elapsed > 20:
            raise RuntimeError(
                f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}"
            )
        time.sleep(1)

-    # Ensure that safekeepers don't lose remote_consistent_lsn on restart.
-    # Control file is persisted each 5s. TODO: do that on shutdown and remove sleep.
-    time.sleep(6)
-    for sk in env.safekeepers:
-        sk.stop()
-        sk.start()
-    stat_after_restart = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
-    log.info(f"statuses after {stat_after_restart}")
-    assert all([s.remote_consistent_lsn >= new_rcl for s in stat_after_restart])
-

 # Test that old WAL consumed by peers and pageserver is removed from safekeepers.
@pytest.mark.parametrize("auth_enabled", [False, True])
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -1,7 +1,6 @@
 import sys
 import tarfile
 import tempfile
-import time
 from pathlib import Path

 import pytest
@@ -126,43 +125,3 @@ def test_wal_restore_initdb(
        )
        log.info(f"original lsn: {original_lsn}, restored lsn: {restored_lsn}")
        assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
-
-
-def test_wal_restore_http(
-    neon_env_builder: NeonEnvBuilder,
-    test_output_dir: Path,
-):
-    env = neon_env_builder.init_start()
-    endpoint = env.endpoints.create_start("main")
-    endpoint.safe_psql("create table t as select generate_series(1,300000)")
-    tenant_id = env.initial_tenant
-    timeline_id = env.initial_timeline
-
-    ps_client = env.pageserver.http_client()
-
-    # shut down the endpoint and delete the timeline from the pageserver
-    endpoint.stop()
-
-    assert isinstance(env.pageserver_remote_storage, LocalFsStorage)
-
-    test_output_dir / "initdb.tar.zst"
-
-    (env.pageserver_remote_storage.timeline_path(tenant_id, timeline_id) / "initdb.tar.zst")
-
-    ps_client.timeline_delete(tenant_id, timeline_id)
-    time.sleep(2)
-
-    # verify that it is indeed deleted
-    # TODO
-
-    # issue the restoration command
-    ps_client.timeline_create(
-        tenant_id=tenant_id,
-        new_timeline_id=timeline_id,
-        existing_initdb_timeline_id=timeline_id,
-        pg_version=env.pg_version,
-    )
-
-    # the table is back now!
-    restored = env.endpoints.create_start("main")
-    assert restored.safe_psql("select count(*) from t", user="cloud_admin") == [(300000,)]
--- a/vm-image-spec.yaml
+++ b/vm-image-spec.yaml
@@ -13,10 +13,6 @@ commands:
    user: nobody
    sysvInitAction: respawn
    shell: 'DATA_SOURCE_NAME="user=cloud_admin sslmode=disable dbname=postgres" /bin/postgres_exporter'
-  - name: sql-exporter
-    user: nobody
-    sysvInitAction: respawn
-    shell: '/bin/sql_exporter -config.file=/etc/sql_exporter.yml'
 shutdownHook: |
  su -p postgres --session-command '/usr/local/bin/pg_ctl stop -D /var/db/postgres/compute/pgdata -m fast --wait -t 10'
 files:
@@ -50,77 +46,6 @@ files:
          }
          memory {}
      }
-  - filename: sql_exporter.yml
-    content: |
-      # Configuration for sql_exporter
-      # Global defaults.
-      global:
-        # If scrape_timeout <= 0, no timeout is set unless Prometheus provides one. The default is 10s.
-        scrape_timeout: 10s
-        # Subtracted from Prometheus' scrape_timeout to give us some headroom and prevent Prometheus from timing out first.
-        scrape_timeout_offset: 500ms
-        # Minimum interval between collector runs: by default (0s) collectors are executed on every scrape.
-        min_interval: 0s
-        # Maximum number of open connections to any one target. Metric queries will run concurrently on multiple connections,
-        # as will concurrent scrapes.
-        max_connections: 1
-        # Maximum number of idle connections to any one target. Unless you use very long collection intervals, this should
-        # always be the same as max_connections.
-        max_idle_connections: 1
-        # Maximum number of maximum amount of time a connection may be reused. Expired connections may be closed lazily before reuse.
-        # If 0, connections are not closed due to a connection's age.
-        max_connection_lifetime: 5m
-
-      # The target to monitor and the collectors to execute on it.
-      target:
-        # Data source name always has a URI schema that matches the driver name. In some cases (e.g. MySQL)
-        # the schema gets dropped or replaced to match the driver expected DSN format.
-        data_source_name: 'postgresql://cloud_admin@127.0.0.1:5432/postgres?sslmode=disable'
-
-        # Collectors (referenced by name) to execute on the target.
-        # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-        collectors: [neon_collector]
-
-      # Collector files specifies a list of globs. One collector definition is read from each matching file.
-      # Glob patterns are supported (see <https://pkg.go.dev/path/filepath#Match> for syntax).
-      collector_files:
-        - "neon_collector.yml"
-  - filename: neon_collector.yml
-    content: |
-      collector_name: neon_collector
-      metrics:
-      - metric_name: lfc_misses
-        type: gauge
-        help: 'lfc_misses'
-        key_labels:
-        values: [lfc_misses]
-        query: |
-          select lfc_value as lfc_misses from neon.neon_lfc_stats where lfc_key='file_cache_misses';
-
-      - metric_name: lfc_used
-        type: gauge
-        help: 'lfc_used'
-        key_labels:
-        values: [lfc_used]
-        query: |
-          select lfc_value as lfc_used from neon.neon_lfc_stats where lfc_key='file_cache_used';
-
-      - metric_name: lfc_hits
-        type: gauge
-        help: 'lfc_hits'
-        key_labels:
-        values: [lfc_hits]
-        query: |
-          select lfc_value as lfc_hits from neon.neon_lfc_stats where lfc_key='file_cache_hits';
-
-      - metric_name: lfc_writes
-        type: gauge
-        help: 'lfc_writes'
-        key_labels:
-        values: [lfc_writes]
-        query: |
-          select lfc_value as lfc_writes from neon.neon_lfc_stats where lfc_key='file_cache_writes';
-
 build: |
  # Build cgroup-tools
  #
@@ -157,8 +82,6 @@ build: |

  FROM quay.io/prometheuscommunity/postgres-exporter:v0.12.0 AS postgres-exporter

-  FROM burningalchemist/sql_exporter:0.13 AS sql-exporter
-
  # Build pgbouncer
  #
  FROM debian:bullseye-slim AS pgbouncer
@@ -193,19 +116,13 @@ merge: |

  COPY cgconfig.conf /etc/cgconfig.conf
  COPY pgbouncer.ini /etc/pgbouncer.ini
-  COPY sql_exporter.yml /etc/sql_exporter.yml
-  COPY neon_collector.yml /etc/neon_collector.yml
-
  RUN set -e \
      && chown postgres:postgres /etc/pgbouncer.ini \
      && chmod 0644 /etc/pgbouncer.ini \
-      && chmod 0644 /etc/cgconfig.conf \
-      && chmod 0644 /etc/sql_exporter.yml \
-      && chmod 0644 /etc/neon_collector.yml
+      && chmod 0644 /etc/cgconfig.conf

  COPY --from=libcgroup-builder /libcgroup-install/bin/*  /usr/bin/
  COPY --from=libcgroup-builder /libcgroup-install/lib/*  /usr/lib/
  COPY --from=libcgroup-builder /libcgroup-install/sbin/* /usr/sbin/
  COPY --from=postgres-exporter /bin/postgres_exporter /bin/postgres_exporter
-  COPY --from=sql-exporter      /bin/sql_exporter      /bin/sql_exporter
  COPY --from=pgbouncer         /usr/local/pgbouncer/bin/pgbouncer /usr/local/bin/pgbouncer