Merge pull request #6022 from neondatabase/releases/2023-12-04

Release 2023-12-04
Remove trusted from wal2json
2026-05-17 13:10:38 +00:00 · 2023-12-05 17:03:28 +02:00 · 2023-12-04 12:36:19 -08:00 · 2023-12-04 11:41:27 +02:00 · 2023-12-04 11:36:26 +02:00 · 2023-12-01 15:35:56 +02:00
42 changed files with 546 additions and 1122 deletions
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -274,13 +274,7 @@ fn main() -> Result<()> {
            let mut state = compute.state.lock().unwrap();
            state.error = Some(format!("{:?}", err));
            state.status = ComputeStatus::Failed;
-            // Notify others that Postgres failed to start. In case of configuring the
-            // empty compute, it's likely that API handler is still waiting for compute
-            // state change. With this we will notify it that compute is in Failed state,
-            // so control plane will know about it earlier and record proper error instead
-            // of timeout.
-            compute.state_changed.notify_all();
-            drop(state); // unlock
+            drop(state);
            delay_exit = true;
            None
        }
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -22,7 +22,7 @@ use utils::id::{TenantId, TimelineId};
 use utils::lsn::Lsn;

 use compute_api::responses::{ComputeMetrics, ComputeStatus};
-use compute_api::spec::{ComputeFeature, ComputeMode, ComputeSpec};
+use compute_api::spec::{ComputeMode, ComputeSpec};
 use utils::measured_stream::MeasuredReader;

 use remote_storage::{DownloadError, RemotePath};
@@ -277,17 +277,6 @@ fn create_neon_superuser(spec: &ComputeSpec, client: &mut Client) -> Result<()>
 }

 impl ComputeNode {
-    /// Check that compute node has corresponding feature enabled.
-    pub fn has_feature(&self, feature: ComputeFeature) -> bool {
-        let state = self.state.lock().unwrap();
-
-        if let Some(s) = state.pspec.as_ref() {
-            s.spec.features.contains(&feature)
-        } else {
-            false
-        }
-    }
-
    pub fn set_status(&self, status: ComputeStatus) {
        let mut state = self.state.lock().unwrap();
        state.status = status;
--- a/control_plane/src/endpoint.rs
+++ b/control_plane/src/endpoint.rs
@@ -519,7 +519,6 @@ impl Endpoint {
            skip_pg_catalog_updates: self.skip_pg_catalog_updates,
            format_version: 1.0,
            operation_uuid: None,
-            features: vec![],
            cluster: Cluster {
                cluster_id: None, // project ID: not used
                name: None,       // project name: not used
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -26,13 +26,6 @@ pub struct ComputeSpec {
    // but we don't use it for anything. Serde will ignore missing fields when
    // deserializing it.
    pub operation_uuid: Option<String>,
-
-    /// Compute features to enable. These feature flags are provided, when we
-    /// know all the details about client's compute, so they cannot be used
-    /// to change `Empty` compute behavior.
-    #[serde(default)]
-    pub features: Vec<ComputeFeature>,
-
    /// Expected cluster state at the end of transition process.
    pub cluster: Cluster,
    pub delta_operations: Option<Vec<DeltaOp>>,
@@ -75,19 +68,6 @@ pub struct ComputeSpec {
    pub remote_extensions: Option<RemoteExtSpec>,
 }

-/// Feature flag to signal `compute_ctl` to enable certain experimental functionality.
-#[derive(Serialize, Clone, Copy, Debug, Deserialize, PartialEq, Eq)]
-#[serde(rename_all = "snake_case")]
-pub enum ComputeFeature {
-    // XXX: Add more feature flags here.
-
-    // This is a special feature flag that is used to represent unknown feature flags.
-    // Basically all unknown to enum flags are represented as this one. See unit test
-    // `parse_unknown_features()` for more details.
-    #[serde(other)]
-    UnknownFeature,
-}
-
 #[derive(Clone, Debug, Default, Deserialize, Serialize)]
 pub struct RemoteExtSpec {
    pub public_extensions: Option<Vec<String>>,
@@ -249,10 +229,7 @@ mod tests {
    #[test]
    fn parse_spec_file() {
        let file = File::open("tests/cluster_spec.json").unwrap();
-        let spec: ComputeSpec = serde_json::from_reader(file).unwrap();
-
-        // Features list defaults to empty vector.
-        assert!(spec.features.is_empty());
+        let _spec: ComputeSpec = serde_json::from_reader(file).unwrap();
    }

    #[test]
@@ -264,22 +241,4 @@ mod tests {
        ob.insert("unknown_field_123123123".into(), "hello".into());
        let _spec: ComputeSpec = serde_json::from_value(json).unwrap();
    }
-
-    #[test]
-    fn parse_unknown_features() {
-        // Test that unknown feature flags do not cause any errors.
-        let file = File::open("tests/cluster_spec.json").unwrap();
-        let mut json: serde_json::Value = serde_json::from_reader(file).unwrap();
-        let ob = json.as_object_mut().unwrap();
-
-        // Add unknown feature flags.
-        let features = vec!["foo_bar_feature", "baz_feature"];
-        ob.insert("features".into(), features.into());
-
-        let spec: ComputeSpec = serde_json::from_value(json).unwrap();
-
-        assert!(spec.features.len() == 2);
-        assert!(spec.features.contains(&ComputeFeature::UnknownFeature));
-        assert_eq!(spec.features, vec![ComputeFeature::UnknownFeature; 2]);
-    }
 }
--- a/libs/pageserver_api/src/key.rs
+++ b/libs/pageserver_api/src/key.rs
@@ -140,7 +140,3 @@ impl Key {
        })
    }
 }
-
-pub fn is_rel_block_key(key: &Key) -> bool {
-    key.field1 == 0x00 && key.field4 != 0
-}
--- a/libs/pageserver_api/src/shard.rs
+++ b/libs/pageserver_api/src/shard.rs
@@ -1,6 +1,5 @@
 use std::{ops::RangeInclusive, str::FromStr};

-use crate::key::{is_rel_block_key, Key};
 use hex::FromHex;
 use serde::{Deserialize, Serialize};
 use thiserror;
@@ -303,8 +302,6 @@ pub struct ShardStripeSize(pub u32);
 pub struct ShardLayout(u8);

 const LAYOUT_V1: ShardLayout = ShardLayout(1);
-/// ShardIdentity uses a magic layout value to indicate if it is unusable
-const LAYOUT_BROKEN: ShardLayout = ShardLayout(255);

 /// Default stripe size in pages: 256MiB divided by 8kiB page size.
 const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
@@ -313,10 +310,10 @@ const DEFAULT_STRIPE_SIZE: ShardStripeSize = ShardStripeSize(256 * 1024 / 8);
 /// to resolve a key to a shard, and then check whether that shard is ==self.
 #[derive(Clone, Copy, Serialize, Deserialize, Eq, PartialEq, Debug)]
 pub struct ShardIdentity {
+    pub layout: ShardLayout,
    pub number: ShardNumber,
    pub count: ShardCount,
-    stripe_size: ShardStripeSize,
-    layout: ShardLayout,
+    pub stripe_size: ShardStripeSize,
 }

 #[derive(thiserror::Error, Debug, PartialEq, Eq)]
@@ -342,22 +339,6 @@ impl ShardIdentity {
        }
    }

-    /// A broken instance of this type is only used for `TenantState::Broken` tenants,
-    /// which are constructed in code paths that don't have access to proper configuration.
-    ///
-    /// A ShardIdentity in this state may not be used for anything, and should not be persisted.
-    /// Enforcement is via assertions, to avoid making our interface fallible for this
-    /// edge case: it is the Tenant's responsibility to avoid trying to do any I/O when in a broken
-    /// state, and by extension to avoid trying to do any page->shard resolution.
-    pub fn broken(number: ShardNumber, count: ShardCount) -> Self {
-        Self {
-            number,
-            count,
-            layout: LAYOUT_BROKEN,
-            stripe_size: DEFAULT_STRIPE_SIZE,
-        }
-    }
-
    pub fn is_unsharded(&self) -> bool {
        self.number == ShardNumber(0) && self.count == ShardCount(0)
    }
@@ -384,33 +365,6 @@ impl ShardIdentity {
            })
        }
    }
-
-    fn is_broken(&self) -> bool {
-        self.layout == LAYOUT_BROKEN
-    }
-
-    pub fn get_shard_number(&self, key: &Key) -> ShardNumber {
-        assert!(!self.is_broken());
-        key_to_shard_number(self.count, self.stripe_size, key)
-    }
-
-    /// Return true if the key should be ingested by this shard
-    pub fn is_key_local(&self, key: &Key) -> bool {
-        assert!(!self.is_broken());
-        if self.count < ShardCount(2) || (key_is_shard0(key) && self.number == ShardNumber(0)) {
-            true
-        } else {
-            key_to_shard_number(self.count, self.stripe_size, key) == self.number
-        }
-    }
-
-    pub fn shard_slug(&self) -> String {
-        if self.count > ShardCount(0) {
-            format!("-{:02x}{:02x}", self.number.0, self.count.0)
-        } else {
-            String::new()
-        }
-    }
 }

 impl Serialize for ShardIndex {
@@ -484,65 +438,6 @@ impl<'de> Deserialize<'de> for ShardIndex {
    }
 }

-/// Whether this key is always held on shard 0 (e.g. shard 0 holds all SLRU keys
-/// in order to be able to serve basebackup requests without peer communication).
-fn key_is_shard0(key: &Key) -> bool {
-    // To decide what to shard out to shards >0, we apply a simple rule that only
-    // relation pages are distributed to shards other than shard zero. Everything else gets
-    // stored on shard 0.  This guarantees that shard 0 can independently serve basebackup
-    // requests, and any request other than those for particular blocks in relations.
-    //
-    // In this condition:
-    // - is_rel_block_key includes only relations, i.e. excludes SLRU data and
-    // all metadata.
-    // - field6 is set to -1 for relation size pages.
-    !(is_rel_block_key(key) && key.field6 != 0xffffffff)
-}
-
-/// Provide the same result as the function in postgres `hashfn.h` with the same name
-fn murmurhash32(mut h: u32) -> u32 {
-    h ^= h >> 16;
-    h = h.wrapping_mul(0x85ebca6b);
-    h ^= h >> 13;
-    h = h.wrapping_mul(0xc2b2ae35);
-    h ^= h >> 16;
-    h
-}
-
-/// Provide the same result as the function in postgres `hashfn.h` with the same name
-fn hash_combine(mut a: u32, mut b: u32) -> u32 {
-    b = b.wrapping_add(0x9e3779b9);
-    b = b.wrapping_add(a << 6);
-    b = b.wrapping_add(a >> 2);
-
-    a ^= b;
-    a
-}
-
-/// Where a Key is to be distributed across shards, select the shard.  This function
-/// does not account for keys that should be broadcast across shards.
-///
-/// The hashing in this function must exactly match what we do in postgres smgr
-/// code.  The resulting distribution of pages is intended to preserve locality within
-/// `stripe_size` ranges of contiguous block numbers in the same relation, while otherwise
-/// distributing data pseudo-randomly.
-///
-/// The mapping of key to shard is not stable across changes to ShardCount: this is intentional
-/// and will be handled at higher levels when shards are split.
-fn key_to_shard_number(count: ShardCount, stripe_size: ShardStripeSize, key: &Key) -> ShardNumber {
-    // Fast path for un-sharded tenants or broadcast keys
-    if count < ShardCount(2) || key_is_shard0(key) {
-        return ShardNumber(0);
-    }
-
-    // relNode
-    let mut hash = murmurhash32(key.field4);
-    // blockNum/stripe size
-    hash = hash_combine(hash, murmurhash32(key.field6 / stripe_size.0));
-
-    ShardNumber((hash % count.0 as u32) as u8)
-}
-
 #[cfg(test)]
 mod tests {
    use std::str::FromStr;
@@ -714,29 +609,4 @@ mod tests {

        Ok(())
    }
-
-    // These are only smoke tests to spot check that our implementation doesn't
-    // deviate from a few examples values: not aiming to validate the overall
-    // hashing algorithm.
-    #[test]
-    fn murmur_hash() {
-        assert_eq!(murmurhash32(0), 0);
-
-        assert_eq!(hash_combine(0xb1ff3b40, 0), 0xfb7923c9);
-    }
-
-    #[test]
-    fn shard_mapping() {
-        let key = Key {
-            field1: 0x00,
-            field2: 0x67f,
-            field3: 0x5,
-            field4: 0x400c,
-            field5: 0x00,
-            field6: 0x7d06,
-        };
-
-        let shard = key_to_shard_number(ShardCount(10), DEFAULT_STRIPE_SIZE, &key);
-        assert_eq!(shard, ShardNumber(8));
-    }
 }
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -289,10 +289,10 @@ impl FeStartupPacket {
        // We shouldn't advance `buf` as probably full message is not there yet,
        // so can't directly use Bytes::get_u32 etc.
        let len = (&buf[0..4]).read_u32::<BigEndian>().unwrap() as usize;
-        // The proposed replacement is `!(8..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
+        // The proposed replacement is `!(4..=MAX_STARTUP_PACKET_LENGTH).contains(&len)`
        // which is less readable
        #[allow(clippy::manual_range_contains)]
-        if len < 8 || len > MAX_STARTUP_PACKET_LENGTH {
+        if len < 4 || len > MAX_STARTUP_PACKET_LENGTH {
            return Err(ProtocolError::Protocol(format!(
                "invalid startup packet message length {}",
                len
@@ -975,10 +975,4 @@ mod tests {
        let params = make_params("foo\\ bar \\ \\\\ baz\\  lol");
        assert_eq!(split_options(&params), ["foo bar", " \\", "baz ", "lol"]);
    }
-
-    #[test]
-    fn parse_fe_startup_packet_regression() {
-        let data = [0, 0, 0, 7, 0, 0, 0, 0];
-        FeStartupPacket::parse(&mut BytesMut::from_iter(data)).unwrap_err();
-    }
 }
--- a/libs/remote_storage/src/s3_bucket.rs
+++ b/libs/remote_storage/src/s3_bucket.rs
@@ -378,7 +378,7 @@ impl RemoteStorage for S3Bucket {
            let empty = Vec::new();
            let prefixes = response.common_prefixes.as_ref().unwrap_or(&empty);

-            tracing::debug!("list: {} prefixes, {} keys", prefixes.len(), keys.len());
+            tracing::info!("list: {} prefixes, {} keys", prefixes.len(), keys.len());

            for object in keys {
                let object_path = object.key().expect("response does not contain a key");
--- a/libs/utils/src/generation.rs
+++ b/libs/utils/src/generation.rs
@@ -152,16 +152,3 @@ impl Debug for Generation {
        }
    }
 }
-
-#[cfg(test)]
-mod test {
-    use super::*;
-
-    #[test]
-    fn generation_gt() {
-        // Important that a None generation compares less than a valid one, during upgrades from
-        // pre-generation systems.
-        assert!(Generation::none() < Generation::new(0));
-        assert!(Generation::none() < Generation::new(1));
-    }
-}
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -402,11 +402,15 @@ fn start_pageserver(
    let (init_remote_done_tx, init_remote_done_rx) = utils::completion::channel();
    let (init_done_tx, init_done_rx) = utils::completion::channel();

+    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
+
    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();

    let order = pageserver::InitializationOrder {
        initial_tenant_load_remote: Some(init_done_tx),
        initial_tenant_load: Some(init_remote_done_tx),
+        initial_logical_size_can_start: init_done_rx.clone(),
+        initial_logical_size_attempt: Some(init_logical_size_done_tx),
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

@@ -460,7 +464,7 @@ fn start_pageserver(
            });

            let WaitForPhaseResult {
-                timeout_remaining: _timeout,
+                timeout_remaining: timeout,
                skipped: init_load_skipped,
            } = wait_for_phase("initial_tenant_load", init_load_done, timeout).await;

@@ -468,6 +472,26 @@ fn start_pageserver(

            scopeguard::ScopeGuard::into_inner(guard);

+            let guard = scopeguard::guard_on_success((), |_| {
+                tracing::info!("Cancelled before initial logical sizes completed")
+            });
+
+            let logical_sizes_done = std::pin::pin!(async {
+                init_logical_size_done_rx.wait().await;
+                startup_checkpoint(
+                    started_startup_at,
+                    "initial_logical_sizes",
+                    "Initial logical sizes completed",
+                );
+            });
+
+            let WaitForPhaseResult {
+                timeout_remaining: _,
+                skipped: logical_sizes_skipped,
+            } = wait_for_phase("initial_logical_sizes", logical_sizes_done, timeout).await;
+
+            scopeguard::ScopeGuard::into_inner(guard);
+
            // allow background jobs to start: we either completed prior stages, or they reached timeout
            // and were skipped.  It is important that we do not let them block background jobs indefinitely,
            // because things like consumption metrics for billing are blocked by this barrier.
@@ -490,6 +514,9 @@ fn start_pageserver(
            if let Some(f) = init_load_skipped {
                f.await;
            }
+            if let Some(f) = logical_sizes_skipped {
+                f.await;
+            }
            scopeguard::ScopeGuard::into_inner(guard);

            startup_checkpoint(started_startup_at, "complete", "Startup complete");
--- a/pageserver/src/consumption_metrics/metrics.rs
+++ b/pageserver/src/consumption_metrics/metrics.rs
@@ -2,7 +2,6 @@ use crate::{context::RequestContext, tenant::timeline::logical_size::CurrentLogi
 use chrono::{DateTime, Utc};
 use consumption_metrics::EventType;
 use futures::stream::StreamExt;
-use pageserver_api::shard::ShardNumber;
 use std::{sync::Arc, time::SystemTime};
 use utils::{
    id::{TenantId, TimelineId},
@@ -229,11 +228,6 @@ where
    while let Some((tenant_id, tenant)) = tenants.next().await {
        let mut tenant_resident_size = 0;

-        // Sharded tenants report all consumption metrics from shard zero
-        if tenant.tenant_shard_id().shard_number != ShardNumber(0) {
-            continue;
-        }
-
        for timeline in tenant.list_timelines() {
            let timeline_id = timeline.timeline_id;

@@ -357,12 +351,7 @@ impl TimelineSnapshot {

            let current_exact_logical_size = {
                let span = tracing::info_span!("collect_metrics_iteration", tenant_id = %t.tenant_shard_id.tenant_id, timeline_id = %t.timeline_id);
-                let size = span.in_scope(|| {
-                    t.get_current_logical_size(
-                        crate::tenant::timeline::GetLogicalSizePriority::Background,
-                        ctx,
-                    )
-                });
+                let size = span.in_scope(|| t.get_current_logical_size(ctx));
                match size {
                    // Only send timeline logical size when it is fully calculated.
                    CurrentLogicalSize::Exact(ref size) => Some(size.into()),
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -338,8 +338,7 @@ async fn build_timeline_info_common(
        Lsn(0) => None,
        lsn @ Lsn(_) => Some(lsn),
    };
-    let current_logical_size =
-        timeline.get_current_logical_size(tenant::timeline::GetLogicalSizePriority::User, ctx);
+    let current_logical_size = timeline.get_current_logical_size(ctx);
    let current_physical_size = Some(timeline.layer_size_sum().await);
    let state = timeline.current_state();
    let remote_consistent_lsn_projected = timeline
@@ -709,26 +708,6 @@ async fn tenant_detach_handler(
    json_response(StatusCode::OK, ())
 }

-async fn tenant_reset_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?;
-    check_permission(&request, Some(tenant_shard_id.tenant_id))?;
-
-    let drop_cache: Option<bool> = parse_query_param(&request, "drop_cache")?;
-
-    let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn);
-    let state = get_state(&request);
-    state
-        .tenant_manager
-        .reset_tenant(tenant_shard_id, drop_cache.unwrap_or(false), ctx)
-        .await
-        .map_err(ApiError::InternalServerError)?;
-
-    json_response(StatusCode::OK, ())
-}
-
 async fn tenant_load_handler(
    mut request: Request<Body>,
    _cancel: CancellationToken,
@@ -1848,9 +1827,6 @@ pub fn make_router(
        .post("/v1/tenant/:tenant_id/detach", |r| {
            api_handler(r, tenant_detach_handler)
        })
-        .post("/v1/tenant/:tenant_shard_id/reset", |r| {
-            api_handler(r, tenant_reset_handler)
-        })
        .post("/v1/tenant/:tenant_id/load", |r| {
            api_handler(r, tenant_load_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -186,6 +186,13 @@ pub struct InitializationOrder {
    /// Each initial tenant load task carries this until completion.
    pub initial_tenant_load: Option<utils::completion::Completion>,

+    /// Barrier for when we can start initial logical size calculations.
+    pub initial_logical_size_can_start: utils::completion::Barrier,
+
+    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
+    /// attempt. It is important to drop this once the attempt has completed.
+    pub initial_logical_size_attempt: Option<utils::completion::Completion>,
+
    /// Barrier for when we can start any background jobs.
    ///
    /// This can be broken up later on, but right now there is just one class of a background job.
@@ -205,7 +212,7 @@ async fn timed<Fut: std::future::Future>(
    match tokio::time::timeout(warn_at, &mut fut).await {
        Ok(ret) => {
            tracing::info!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed"
            );
@@ -213,7 +220,7 @@ async fn timed<Fut: std::future::Future>(
        }
        Err(_) => {
            tracing::info!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "still waiting, taking longer than expected..."
            );
@@ -222,7 +229,7 @@ async fn timed<Fut: std::future::Future>(

            // this has a global allowed_errors
            tracing::warn!(
-                stage = name,
+                task = name,
                elapsed_ms = started.elapsed().as_millis(),
                "completed, took longer than expected"
            );
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -407,14 +407,16 @@ pub(crate) mod initial_logical_size {
    use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec};
    use once_cell::sync::Lazy;

+    use crate::task_mgr::TaskKind;
+
    pub(crate) struct StartCalculation(IntCounterVec);
    pub(crate) static START_CALCULATION: Lazy<StartCalculation> = Lazy::new(|| {
        StartCalculation(
            register_int_counter_vec!(
                "pageserver_initial_logical_size_start_calculation",
                "Incremented each time we start an initial logical size calculation attempt. \
-                 The `circumstances` label provides some additional details.",
-                &["attempt", "circumstances"]
+                 The `task_kind` label is for the task kind that caused this attempt.",
+                &["attempt", "task_kind"]
            )
            .unwrap(),
        )
@@ -462,24 +464,19 @@ pub(crate) mod initial_logical_size {
        inc_drop_calculation: Option<IntCounter>,
    }

-    #[derive(strum_macros::IntoStaticStr)]
-    pub(crate) enum StartCircumstances {
-        EmptyInitial,
-        SkippedConcurrencyLimiter,
-        AfterBackgroundTasksRateLimit,
-    }
-
    impl StartCalculation {
-        pub(crate) fn first(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
-            let circumstances_label: &'static str = circumstances.into();
-            self.0.with_label_values(&["first", circumstances_label]);
+        pub(crate) fn first(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
+            let task_kind_label: &'static str =
+                causing_task_kind.map(|k| k.into()).unwrap_or_default();
+            self.0.with_label_values(&["first", task_kind_label]);
            OngoingCalculationGuard {
                inc_drop_calculation: Some(DROP_CALCULATION.first.clone()),
            }
        }
-        pub(crate) fn retry(&self, circumstances: StartCircumstances) -> OngoingCalculationGuard {
-            let circumstances_label: &'static str = circumstances.into();
-            self.0.with_label_values(&["retry", circumstances_label]);
+        pub(crate) fn retry(&self, causing_task_kind: Option<TaskKind>) -> OngoingCalculationGuard {
+            let task_kind_label: &'static str =
+                causing_task_kind.map(|k| k.into()).unwrap_or_default();
+            self.0.with_label_values(&["retry", task_kind_label]);
            OngoingCalculationGuard {
                inc_drop_calculation: Some(DROP_CALCULATION.retry.clone()),
            }
@@ -1388,8 +1385,6 @@ pub(crate) static WAL_REDO_PROCESS_LAUNCH_DURATION_HISTOGRAM: Lazy<Histogram> =
 pub(crate) struct WalRedoProcessCounters {
    pub(crate) started: IntCounter,
    pub(crate) killed_by_cause: enum_map::EnumMap<WalRedoKillCause, IntCounter>,
-    pub(crate) active_stderr_logger_tasks_started: IntCounter,
-    pub(crate) active_stderr_logger_tasks_finished: IntCounter,
 }

 #[derive(Debug, enum_map::Enum, strum_macros::IntoStaticStr)]
@@ -1413,19 +1408,6 @@ impl Default for WalRedoProcessCounters {
            &["cause"],
        )
        .unwrap();
-
-        let active_stderr_logger_tasks_started = register_int_counter!(
-            "pageserver_walredo_stderr_logger_tasks_started_total",
-            "Number of active walredo stderr logger tasks that have started",
-        )
-        .unwrap();
-
-        let active_stderr_logger_tasks_finished = register_int_counter!(
-            "pageserver_walredo_stderr_logger_tasks_finished_total",
-            "Number of active walredo stderr logger tasks that have finished",
-        )
-        .unwrap();
-
        Self {
            started,
            killed_by_cause: EnumMap::from_array(std::array::from_fn(|i| {
@@ -1433,8 +1415,6 @@ impl Default for WalRedoProcessCounters {
                let cause_str: &'static str = cause.into();
                killed.with_label_values(&[cause_str])
            })),
-            active_stderr_logger_tasks_started,
-            active_stderr_logger_tasks_finished,
        }
    }
 }
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -53,14 +53,12 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::import_datadir::import_wal_from_tar;
 use crate::metrics;
 use crate::metrics::LIVE_CONNECTIONS_COUNT;
-use crate::pgdatadir_mapping::rel_block_to_key;
 use crate::task_mgr;
 use crate::task_mgr::TaskKind;
 use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::mgr;
 use crate::tenant::mgr::get_active_tenant_with_timeout;
 use crate::tenant::mgr::GetActiveTenantError;
-use crate::tenant::mgr::ShardSelector;
 use crate::tenant::Timeline;
 use crate::trace::Tracer;

@@ -401,19 +399,16 @@ impl PageServerHandler {
    {
        debug_assert_current_span_has_tenant_and_timeline_id();

-        // Note that since one connection may contain getpage requests that target different
-        // shards (e.g. during splitting when the compute is not yet aware of the split), the tenant
-        // that we look up here may not be the one that serves all the actual requests: we will double
-        // check the mapping of key->shard later before calling into Timeline for getpage requests.
+        // TODO(sharding): enumerate local tenant shards for this tenant, and select the one
+        // that should serve this request.
+
+        // Make request tracer if needed
        let tenant = mgr::get_active_tenant_with_timeout(
            tenant_id,
-            ShardSelector::First,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
        .await?;
-
-        // Make request tracer if needed
        let mut tracer = if tenant.get_trace_read_requests() {
            let connection_id = ConnectionId::generate();
            let path =
@@ -571,7 +566,6 @@ impl PageServerHandler {
        info!("creating new timeline");
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
-            ShardSelector::Zero,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -634,7 +628,7 @@ impl PageServerHandler {
        debug_assert_current_span_has_tenant_and_timeline_id();

        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+            .get_active_tenant_timeline(tenant_id, timeline_id)
            .await?;
        let last_record_lsn = timeline.get_last_record_lsn();
        if last_record_lsn != start_lsn {
@@ -813,49 +807,9 @@ impl PageServerHandler {
        }
        */

-        let key = rel_block_to_key(req.rel, req.blkno);
-        let page = if timeline.get_shard_identity().is_key_local(&key) {
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
-        } else {
-            // The Tenant shard we looked up at connection start does not hold this particular
-            // key: look for other shards in this tenant.  This scenario occurs if a pageserver
-            // has multiple shards for the same tenant.
-            //
-            // TODO: optimize this (https://github.com/neondatabase/neon/pull/6037)
-            let timeline = match self
-                .get_active_tenant_timeline(
-                    timeline.tenant_shard_id.tenant_id,
-                    timeline.timeline_id,
-                    ShardSelector::Page(key),
-                )
-                .await
-            {
-                Ok(t) => t,
-                Err(GetActiveTimelineError::Tenant(GetActiveTenantError::NotFound(_))) => {
-                    // We already know this tenant exists in general, because we resolved it at
-                    // start of connection.  Getting a NotFound here indicates that the shard containing
-                    // the requested page is not present on this node.
-
-                    // TODO: this should be some kind of structured error that the client will understand,
-                    // so that it can block until its config is updated: this error is expected in the case
-                    // that the Tenant's shards' placements are being updated and the client hasn't been
-                    // informed yet.
-                    //
-                    // https://github.com/neondatabase/neon/issues/6038
-                    return Err(anyhow::anyhow!("Request routed to wrong shard"));
-                }
-                Err(e) => return Err(e.into()),
-            };
-
-            // Take a GateGuard for the duration of this request.  If we were using our main Timeline object,
-            // the GateGuard was already held over the whole connection.
-            let _timeline_guard = timeline.gate.enter().map_err(|_| QueryError::Shutdown)?;
-            timeline
-                .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
-                .await?
-        };
+        let page = timeline
+            .get_rel_page_at_lsn(req.rel, req.blkno, lsn, req.latest, ctx)
+            .await?;

        Ok(PagestreamBeMessage::GetPage(PagestreamGetPageResponse {
            page,
@@ -884,7 +838,7 @@ impl PageServerHandler {

        // check that the timeline exists
        let timeline = self
-            .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+            .get_active_tenant_timeline(tenant_id, timeline_id)
            .await?;
        let latest_gc_cutoff_lsn = timeline.get_latest_gc_cutoff_lsn();
        if let Some(lsn) = lsn {
@@ -990,11 +944,9 @@ impl PageServerHandler {
        &self,
        tenant_id: TenantId,
        timeline_id: TimelineId,
-        selector: ShardSelector,
    ) -> Result<Arc<Timeline>, GetActiveTimelineError> {
        let tenant = get_active_tenant_with_timeout(
            tenant_id,
-            selector,
            ACTIVE_TENANT_TIMEOUT,
            &task_mgr::shutdown_token(),
        )
@@ -1168,7 +1120,7 @@ where

            self.check_permission(Some(tenant_id))?;
            let timeline = self
-                .get_active_tenant_timeline(tenant_id, timeline_id, ShardSelector::Zero)
+                .get_active_tenant_timeline(tenant_id, timeline_id)
                .await?;

            let end_of_timeline = timeline.get_last_record_rlsn();
@@ -1355,7 +1307,6 @@ where

            let tenant = get_active_tenant_with_timeout(
                tenant_id,
-                ShardSelector::Zero,
                ACTIVE_TENANT_TIMEOUT,
                &task_mgr::shutdown_token(),
            )
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -13,7 +13,6 @@ use crate::repository::*;
 use crate::walrecord::NeonWalRecord;
 use anyhow::Context;
 use bytes::{Buf, Bytes};
-use pageserver_api::key::is_rel_block_key;
 use pageserver_api::reltag::{RelTag, SlruKind};
 use postgres_ffi::relfile_utils::{FSM_FORKNUM, VISIBILITYMAP_FORKNUM};
 use postgres_ffi::BLCKSZ;
@@ -283,10 +282,6 @@ impl Timeline {
    }

    /// Get a list of all existing relations in given tablespace and database.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn list_rels(
        &self,
        spcnode: Oid,
@@ -635,10 +630,6 @@ impl Timeline {
    ///
    /// Only relation blocks are counted currently. That excludes metadata,
    /// SLRUs, twophase files etc.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn get_current_logical_size_non_incremental(
        &self,
        lsn: Lsn,
@@ -1323,7 +1314,7 @@ impl<'a> DatadirModification<'a> {
        // Flush relation and  SLRU data blocks, keep metadata.
        let mut retained_pending_updates = HashMap::new();
        for (key, value) in self.pending_updates.drain() {
-            if is_rel_block_key(&key) || is_slru_block_key(key) {
+            if is_rel_block_key(key) || is_slru_block_key(key) {
                // This bails out on first error without modifying pending_updates.
                // That's Ok, cf this function's doc comment.
                writer.put(key, self.lsn, &value, ctx).await?;
@@ -1579,7 +1570,7 @@ fn rel_dir_to_key(spcnode: Oid, dbnode: Oid) -> Key {
    }
 }

-pub(crate) fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
+fn rel_block_to_key(rel: RelTag, blknum: BlockNumber) -> Key {
    Key {
        field1: 0x00,
        field2: rel.spcnode,
@@ -1778,6 +1769,10 @@ pub fn key_to_rel_block(key: Key) -> anyhow::Result<(RelTag, BlockNumber)> {
    })
 }

+fn is_rel_block_key(key: Key) -> bool {
+    key.field1 == 0x00 && key.field4 != 0
+}
+
 pub fn is_rel_fsm_block_key(key: Key) -> bool {
    key.field1 == 0x00 && key.field4 != 0 && key.field5 == FSM_FORKNUM && key.field6 != 0xffffffff
 }
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -19,7 +19,6 @@ use futures::stream::FuturesUnordered;
 use futures::FutureExt;
 use futures::StreamExt;
 use pageserver_api::models::TimelineState;
-use pageserver_api::shard::ShardIdentity;
 use pageserver_api::shard::TenantShardId;
 use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
@@ -237,9 +236,6 @@ pub struct Tenant {

    tenant_shard_id: TenantShardId,

-    // The detailed sharding information, beyond the number/count in tenant_shard_id
-    shard_identity: ShardIdentity,
-
    /// The remote storage generation, used to protect S3 objects from split-brain.
    /// Does not change over the lifetime of the [`Tenant`] object.
    ///
@@ -316,9 +312,6 @@ impl WalRedoManager {
        }
    }

-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn request_redo(
        &self,
        key: crate::repository::Key,
@@ -476,6 +469,7 @@ impl Tenant {
        index_part: Option<IndexPart>,
        metadata: TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
+        init_order: Option<&InitializationOrder>,
        _ctx: &RequestContext,
    ) -> anyhow::Result<()> {
        let tenant_id = self.tenant_shard_id;
@@ -485,6 +479,7 @@ impl Tenant {
            &metadata,
            ancestor.clone(),
            resources,
+            init_order,
            CreateTimelineCause::Load,
        )?;
        let disk_consistent_lsn = timeline.get_disk_consistent_lsn();
@@ -572,7 +567,6 @@ impl Tenant {
        tenant_shard_id: TenantShardId,
        resources: TenantSharedResources,
        attached_conf: AttachedTenantConf,
-        shard_identity: ShardIdentity,
        init_order: Option<InitializationOrder>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
        mode: SpawnMode,
@@ -594,7 +588,6 @@ impl Tenant {
            TenantState::Attaching,
            conf,
            attached_conf,
-            shard_identity,
            wal_redo_manager,
            tenant_shard_id,
            remote_storage.clone(),
@@ -687,6 +680,10 @@ impl Tenant {
                    // as we are no longer loading, signal completion by dropping
                    // the completion while we resume deletion
                    drop(_completion);
+                    // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
+                    let _ = init_order
+                        .as_mut()
+                        .and_then(|x| x.initial_logical_size_attempt.take());
                    let background_jobs_can_start =
                        init_order.as_ref().map(|x| &x.background_jobs_can_start);
                    if let Some(background) = background_jobs_can_start {
@@ -700,6 +697,7 @@ impl Tenant {
                        &tenant_clone,
                        preload,
                        tenants,
+                        init_order,
                        &ctx,
                    )
                    .await
@@ -712,7 +710,7 @@ impl Tenant {
                    }
                }

-                match tenant_clone.attach(preload, &ctx).await {
+                match tenant_clone.attach(init_order, preload, &ctx).await {
                    Ok(()) => {
                        info!("attach finished, activating");
                        tenant_clone.activate(broker_client, None, &ctx);
@@ -775,6 +773,7 @@ impl Tenant {
    ///
    async fn attach(
        self: &Arc<Tenant>,
+        init_order: Option<InitializationOrder>,
        preload: Option<TenantPreload>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
@@ -787,7 +786,7 @@ impl Tenant {
            None => {
                // Deprecated dev mode: load from local disk state instead of remote storage
                // https://github.com/neondatabase/neon/issues/5624
-                return self.load_local(ctx).await;
+                return self.load_local(init_order, ctx).await;
            }
        };

@@ -882,6 +881,7 @@ impl Tenant {
                &index_part.metadata,
                Some(remote_timeline_client),
                self.deletion_queue_client.clone(),
+                None,
            )
            .await
            .context("resume_deletion")
@@ -1006,6 +1006,10 @@ impl Tenant {
            None
        };

+        // we can load remote timelines during init, but they are assumed to be so rare that
+        // initialization order is not passed to here.
+        let init_order = None;
+
        // timeline loading after attach expects to find metadata file for each metadata
        save_metadata(
            self.conf,
@@ -1023,6 +1027,7 @@ impl Tenant {
            Some(index_part),
            remote_metadata,
            ancestor,
+            init_order,
            ctx,
        )
        .await
@@ -1046,9 +1051,6 @@ impl Tenant {
            },
            conf,
            AttachedTenantConf::try_from(LocationConf::default()).unwrap(),
-            // Shard identity isn't meaningful for a broken tenant: it's just a placeholder
-            // to occupy the slot for this TenantShardId.
-            ShardIdentity::broken(tenant_shard_id.shard_number, tenant_shard_id.shard_count),
            wal_redo_manager,
            tenant_shard_id,
            None,
@@ -1267,7 +1269,11 @@ impl Tenant {
    /// files on disk. Used at pageserver startup.
    ///
    /// No background tasks are started as part of this routine.
-    async fn load_local(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
+    async fn load_local(
+        self: &Arc<Tenant>,
+        init_order: Option<InitializationOrder>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
        span::debug_assert_current_span_has_tenant_id();

        debug!("loading tenant task");
@@ -1293,7 +1299,7 @@ impl Tenant {
        // Process loadable timelines first
        for (timeline_id, local_metadata) in scan.sorted_timelines_to_load {
            if let Err(e) = self
-                .load_local_timeline(timeline_id, local_metadata, ctx, false)
+                .load_local_timeline(timeline_id, local_metadata, init_order.as_ref(), ctx, false)
                .await
            {
                match e {
@@ -1327,7 +1333,13 @@ impl Tenant {
                }
                Some(local_metadata) => {
                    if let Err(e) = self
-                        .load_local_timeline(timeline_id, local_metadata, ctx, true)
+                        .load_local_timeline(
+                            timeline_id,
+                            local_metadata,
+                            init_order.as_ref(),
+                            ctx,
+                            true,
+                        )
                        .await
                    {
                        match e {
@@ -1355,11 +1367,12 @@ impl Tenant {
    /// Subroutine of `load_tenant`, to load an individual timeline
    ///
    /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, local_metadata, ctx))]
+    #[instrument(skip(self, local_metadata, init_order, ctx))]
    async fn load_local_timeline(
        self: &Arc<Self>,
        timeline_id: TimelineId,
        local_metadata: TimelineMetadata,
+        init_order: Option<&InitializationOrder>,
        ctx: &RequestContext,
        found_delete_mark: bool,
    ) -> Result<(), LoadLocalTimelineError> {
@@ -1376,6 +1389,7 @@ impl Tenant {
                &local_metadata,
                None,
                self.deletion_queue_client.clone(),
+                init_order,
            )
            .await
            .context("resume deletion")
@@ -1392,9 +1406,17 @@ impl Tenant {
            None
        };

-        self.timeline_init_and_sync(timeline_id, resources, None, local_metadata, ancestor, ctx)
-            .await
-            .map_err(LoadLocalTimelineError::Load)
+        self.timeline_init_and_sync(
+            timeline_id,
+            resources,
+            None,
+            local_metadata,
+            ancestor,
+            init_order,
+            ctx,
+        )
+        .await
+        .map_err(LoadLocalTimelineError::Load)
    }

    pub(crate) fn tenant_id(&self) -> TenantId {
@@ -2289,6 +2311,7 @@ impl Tenant {
        new_metadata: &TimelineMetadata,
        ancestor: Option<Arc<Timeline>>,
        resources: TimelineResources,
+        init_order: Option<&InitializationOrder>,
        cause: CreateTimelineCause,
    ) -> anyhow::Result<Arc<Timeline>> {
        let state = match cause {
@@ -2303,6 +2326,9 @@ impl Tenant {
            CreateTimelineCause::Delete => TimelineState::Stopping,
        };

+        let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
+        let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);
+
        let pg_version = new_metadata.pg_version();

        let timeline = Timeline::new(
@@ -2313,10 +2339,11 @@ impl Tenant {
            new_timeline_id,
            self.tenant_shard_id,
            self.generation,
-            self.shard_identity,
            Arc::clone(&self.walredo_mgr),
            resources,
            pg_version,
+            initial_logical_size_can_start.cloned(),
+            initial_logical_size_attempt.cloned().flatten(),
            state,
            self.cancel.child_token(),
        );
@@ -2331,7 +2358,6 @@ impl Tenant {
        state: TenantState,
        conf: &'static PageServerConf,
        attached_conf: AttachedTenantConf,
-        shard_identity: ShardIdentity,
        walredo_mgr: Arc<WalRedoManager>,
        tenant_shard_id: TenantShardId,
        remote_storage: Option<GenericRemoteStorage>,
@@ -2393,7 +2419,6 @@ impl Tenant {

        Tenant {
            tenant_shard_id,
-            shard_identity,
            generation: attached_conf.location.generation,
            conf,
            // using now here is good enough approximation to catch tenants with really long
@@ -2515,7 +2540,7 @@ impl Tenant {
            }
        }

-        debug!("persisting tenantconf to {config_path}");
+        info!("persisting tenantconf to {config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
@@ -2550,7 +2575,7 @@ impl Tenant {
        target_config_path: &Utf8Path,
        tenant_conf: &TenantConfOpt,
    ) -> anyhow::Result<()> {
-        debug!("persisting tenantconf to {target_config_path}");
+        info!("persisting tenantconf to {target_config_path}");

        let mut conf_content = r#"# This file contains a specific per-tenant's config.
 #  It is read in case of pageserver restart.
@@ -3140,6 +3165,7 @@ impl Tenant {
                new_metadata,
                ancestor,
                resources,
+                None,
                CreateTimelineCause::Load,
            )
            .context("Failed to create timeline data structure")?;
@@ -3805,8 +3831,6 @@ pub(crate) mod harness {
                    self.generation,
                ))
                .unwrap(),
-                // This is a legacy/test code path: sharding isn't supported here.
-                ShardIdentity::unsharded(),
                walredo_mgr,
                self.tenant_shard_id,
                Some(self.remote_storage.clone()),
@@ -3816,7 +3840,7 @@ pub(crate) mod harness {
            match mode {
                LoadMode::Local => {
                    tenant
-                        .load_local(ctx)
+                        .load_local(None, ctx)
                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                }
@@ -3826,7 +3850,7 @@ pub(crate) mod harness {
                        .instrument(info_span!("try_load_preload", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                    tenant
-                        .attach(Some(preload), ctx)
+                        .attach(None, Some(preload), ctx)
                        .instrument(info_span!("try_load", tenant_id=%self.tenant_shard_id.tenant_id, shard_id=%self.tenant_shard_id.shard_slug()))
                        .await?;
                }
@@ -3869,9 +3893,6 @@ pub(crate) mod harness {
    pub(crate) struct TestRedoManager;

    impl TestRedoManager {
-        /// # Cancel-Safety
-        ///
-        /// This method is cancellation-safe.
        pub async fn request_redo(
            &self,
            key: Key,
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -15,6 +15,7 @@ use crate::{
    context::RequestContext,
    task_mgr::{self, TaskKind},
    tenant::mgr::{TenantSlot, TenantsMapRemoveResult},
+    InitializationOrder,
 };

 use super::{
@@ -389,6 +390,7 @@ impl DeleteTenantFlow {
        tenant: &Arc<Tenant>,
        preload: Option<TenantPreload>,
        tenants: &'static std::sync::RwLock<TenantsMap>,
+        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Result<(), DeleteTenantError> {
        let (_, progress) = completion::channel();
@@ -398,7 +400,10 @@ impl DeleteTenantFlow {
            .await
            .expect("cant be stopping or broken");

-        tenant.attach(preload, ctx).await.context("attach")?;
+        tenant
+            .attach(init_order, preload, ctx)
+            .await
+            .context("attach")?;

        Self::background(
            guard,
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -2,8 +2,7 @@
 //! page server.

 use camino::{Utf8DirEntry, Utf8Path, Utf8PathBuf};
-use pageserver_api::key::Key;
-use pageserver_api::shard::{ShardIdentity, ShardNumber, TenantShardId};
+use pageserver_api::shard::TenantShardId;
 use rand::{distributions::Alphanumeric, Rng};
 use std::borrow::Cow;
 use std::collections::{BTreeMap, HashMap};
@@ -131,18 +130,6 @@ pub(crate) enum TenantsMapRemoveResult {
    InProgress(utils::completion::Barrier),
 }

-/// When resolving a TenantId to a shard, we may be looking for the 0th
-/// shard, or we might be looking for whichever shard holds a particular page.
-pub(crate) enum ShardSelector {
-    /// Only return the 0th shard, if it is present.  If a non-0th shard is present,
-    /// ignore it.
-    Zero,
-    /// Pick the first shard we find for the TenantId
-    First,
-    /// Pick the shard that holds this key
-    Page(Key),
-}
-
 impl TenantsMap {
    /// Convenience function for typical usage, where we want to get a `Tenant` object, for
    /// working with attached tenants.  If the TenantId is in the map but in Secondary state,
@@ -157,49 +144,6 @@ impl TenantsMap {
        }
    }

-    /// A page service client sends a TenantId, and to look up the correct Tenant we must
-    /// resolve this to a fully qualified TenantShardId.
-    fn resolve_shard(
-        &self,
-        tenant_id: &TenantId,
-        selector: ShardSelector,
-    ) -> Option<TenantShardId> {
-        let mut want_shard = None;
-        match self {
-            TenantsMap::Initializing => None,
-            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => {
-                for slot in m.range(TenantShardId::tenant_range(*tenant_id)) {
-                    match selector {
-                        ShardSelector::First => return Some(*slot.0),
-                        ShardSelector::Zero if slot.0.shard_number == ShardNumber(0) => {
-                            return Some(*slot.0)
-                        }
-                        ShardSelector::Page(key) => {
-                            if let Some(tenant) = slot.1.get_attached() {
-                                // First slot we see for this tenant, calculate the expected shard number
-                                // for the key: we will use this for checking if this and subsequent
-                                // slots contain the key, rather than recalculating the hash each time.
-                                if want_shard.is_none() {
-                                    want_shard = Some(tenant.shard_identity.get_shard_number(&key));
-                                }
-
-                                if Some(tenant.shard_identity.number) == want_shard {
-                                    return Some(*slot.0);
-                                }
-                            } else {
-                                continue;
-                            }
-                        }
-                        _ => continue,
-                    }
-                }
-
-                // Fall through: we didn't find an acceptable shard
-                None
-            }
-        }
-    }
-
    /// Only for use from DeleteTenantFlow.  This method directly removes a TenantSlot from the map.
    ///
    /// The normal way to remove a tenant is using a SlotGuard, which will gracefully remove the guarded
@@ -270,6 +214,49 @@ async fn safe_rename_tenant_dir(path: impl AsRef<Utf8Path>) -> std::io::Result<U
 static TENANTS: Lazy<std::sync::RwLock<TenantsMap>> =
    Lazy::new(|| std::sync::RwLock::new(TenantsMap::Initializing));

+/// Create a directory, including parents.  This does no fsyncs and makes
+/// no guarantees about the persistence of the resulting metadata: for
+/// use when creating dirs for use as cache.
+async fn unsafe_create_dir_all(path: &Utf8PathBuf) -> std::io::Result<()> {
+    let mut dirs_to_create = Vec::new();
+    let mut path: &Utf8Path = path.as_ref();
+
+    // Figure out which directories we need to create.
+    loop {
+        let meta = tokio::fs::metadata(path).await;
+        match meta {
+            Ok(metadata) if metadata.is_dir() => break,
+            Ok(_) => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::AlreadyExists,
+                    format!("non-directory found in path: {path}"),
+                ));
+            }
+            Err(ref e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => return Err(e),
+        }
+
+        dirs_to_create.push(path);
+
+        match path.parent() {
+            Some(parent) => path = parent,
+            None => {
+                return Err(std::io::Error::new(
+                    std::io::ErrorKind::InvalidInput,
+                    format!("can't find parent of path '{path}'"),
+                ));
+            }
+        }
+    }
+
+    // Create directories from parent to child.
+    for &path in dirs_to_create.iter().rev() {
+        tokio::fs::create_dir(path).await?;
+    }
+
+    Ok(())
+}
+
 /// The TenantManager is responsible for storing and mutating the collection of all tenants
 /// that this pageserver process has state for.  Every Tenant and SecondaryTenant instance
 /// lives inside the TenantManager.
@@ -528,14 +515,12 @@ pub async fn init_tenant_mgr(
        location_conf.attach_in_generation(generation);
        Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

-        let shard_identity = location_conf.shard;
        match tenant_spawn(
            conf,
            tenant_shard_id,
            &tenant_dir_path,
            resources.clone(),
            AttachedTenantConf::try_from(location_conf)?,
-            shard_identity,
            Some(init_order.clone()),
            &TENANTS,
            SpawnMode::Normal,
@@ -576,7 +561,6 @@ pub(crate) fn tenant_spawn(
    tenant_path: &Utf8Path,
    resources: TenantSharedResources,
    location_conf: AttachedTenantConf,
-    shard_identity: ShardIdentity,
    init_order: Option<InitializationOrder>,
    tenants: &'static std::sync::RwLock<TenantsMap>,
    mode: SpawnMode,
@@ -603,19 +587,12 @@ pub(crate) fn tenant_spawn(
        "Cannot load tenant, ignore mark found at {tenant_ignore_mark:?}"
    );

-    info!(
-        tenant_id = %tenant_shard_id.tenant_id,
-        shard_id = %tenant_shard_id.shard_slug(),
-        generation = ?location_conf.location.generation,
-        attach_mode = ?location_conf.location.attach_mode,
-        "Attaching tenant"
-    );
+    info!("Attaching tenant {tenant_shard_id}");
    let tenant = match Tenant::spawn(
        conf,
        tenant_shard_id,
        resources,
        location_conf,
-        shard_identity,
        init_order,
        tenants,
        mode,
@@ -785,14 +762,12 @@ pub(crate) async fn create_tenant(
        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustNotExist)?;
    let tenant_path = super::create_tenant_files(conf, &location_conf, &tenant_shard_id).await?;

-    let shard_identity = location_conf.shard;
    let created_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Create,
@@ -885,7 +860,6 @@ impl TenantManager {
        Ok(())
    }

-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()))]
    pub(crate) async fn upsert_location(
        &self,
        tenant_shard_id: TenantShardId,
@@ -998,7 +972,7 @@ impl TenantManager {
            LocationMode::Secondary(_) => {
                // Directory doesn't need to be fsync'd because if we crash it can
                // safely be recreated next time this tenant location is configured.
-                tokio::fs::create_dir_all(&tenant_path)
+                unsafe_create_dir_all(&tenant_path)
                    .await
                    .with_context(|| format!("Creating {tenant_path}"))?;

@@ -1014,7 +988,7 @@ impl TenantManager {
                // Directory doesn't need to be fsync'd because we do not depend on
                // it to exist after crashes: it may be recreated when tenant is
                // re-attached, see https://github.com/neondatabase/neon/issues/5550
-                tokio::fs::create_dir_all(&tenant_path)
+                unsafe_create_dir_all(&timelines_path)
                    .await
                    .with_context(|| format!("Creating {timelines_path}"))?;

@@ -1022,14 +996,12 @@ impl TenantManager {
                    .await
                    .map_err(SetNewTenantConfigError::Persist)?;

-                let shard_identity = new_location_config.shard;
                let tenant = tenant_spawn(
                    self.conf,
                    tenant_shard_id,
                    &tenant_path,
                    self.resources.clone(),
                    AttachedTenantConf::try_from(new_location_config)?,
-                    shard_identity,
                    None,
                    self.tenants,
                    SpawnMode::Normal,
@@ -1044,81 +1016,6 @@ impl TenantManager {

        Ok(())
    }
-
-    /// Resetting a tenant is equivalent to detaching it, then attaching it again with the same
-    /// LocationConf that was last used to attach it.  Optionally, the local file cache may be
-    /// dropped before re-attaching.
-    ///
-    /// This is not part of a tenant's normal lifecycle: it is used for debug/support, in situations
-    /// where an issue is identified that would go away with a restart of the tenant.
-    ///
-    /// This does not have any special "force" shutdown of a tenant: it relies on the tenant's tasks
-    /// to respect the cancellation tokens used in normal shutdown().
-    #[instrument(skip_all, fields(tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug(), %drop_cache))]
-    pub(crate) async fn reset_tenant(
-        &self,
-        tenant_shard_id: TenantShardId,
-        drop_cache: bool,
-        ctx: RequestContext,
-    ) -> anyhow::Result<()> {
-        let mut slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::Any)?;
-        let Some(old_slot) = slot_guard.get_old_value() else {
-            anyhow::bail!("Tenant not found when trying to reset");
-        };
-
-        let Some(tenant) = old_slot.get_attached() else {
-            slot_guard.revert();
-            anyhow::bail!("Tenant is not in attached state");
-        };
-
-        let (_guard, progress) = utils::completion::channel();
-        match tenant.shutdown(progress, false).await {
-            Ok(()) => {
-                slot_guard.drop_old_value()?;
-            }
-            Err(_barrier) => {
-                slot_guard.revert();
-                anyhow::bail!("Cannot reset Tenant, already shutting down");
-            }
-        }
-
-        let tenant_path = self.conf.tenant_path(&tenant_shard_id);
-        let timelines_path = self.conf.timelines_path(&tenant_shard_id);
-        let config = Tenant::load_tenant_config(self.conf, &tenant_shard_id)?;
-
-        if drop_cache {
-            tracing::info!("Dropping local file cache");
-
-            match tokio::fs::read_dir(&timelines_path).await {
-                Err(e) => {
-                    tracing::warn!("Failed to list timelines while dropping cache: {}", e);
-                }
-                Ok(mut entries) => {
-                    while let Some(entry) = entries.next_entry().await? {
-                        tokio::fs::remove_dir_all(entry.path()).await?;
-                    }
-                }
-            }
-        }
-
-        let shard_identity = config.shard;
-        let tenant = tenant_spawn(
-            self.conf,
-            tenant_shard_id,
-            &tenant_path,
-            self.resources.clone(),
-            AttachedTenantConf::try_from(config)?,
-            shard_identity,
-            None,
-            self.tenants,
-            SpawnMode::Normal,
-            &ctx,
-        )?;
-
-        slot_guard.upsert(TenantSlot::Attached(tenant))?;
-
-        Ok(())
-    }
 }

 #[derive(Debug, thiserror::Error)]
@@ -1203,7 +1100,6 @@ pub(crate) enum GetActiveTenantError {
 /// then wait for up to `timeout` (minus however long we waited for the slot).
 pub(crate) async fn get_active_tenant_with_timeout(
    tenant_id: TenantId,
-    shard_selector: ShardSelector,
    timeout: Duration,
    cancel: &CancellationToken,
 ) -> Result<Arc<Tenant>, GetActiveTenantError> {
@@ -1212,17 +1108,15 @@ pub(crate) async fn get_active_tenant_with_timeout(
        Tenant(Arc<Tenant>),
    }

+    // TODO(sharding): make page service interface sharding-aware (page service should apply ShardIdentity to the key
+    // to decide which shard services the request)
+    let tenant_shard_id = TenantShardId::unsharded(tenant_id);
+
    let wait_start = Instant::now();
    let deadline = wait_start + timeout;

-    let (wait_for, tenant_shard_id) = {
+    let wait_for = {
        let locked = TENANTS.read().unwrap();
-
-        // Resolve TenantId to TenantShardId
-        let tenant_shard_id = locked.resolve_shard(&tenant_id, shard_selector).ok_or(
-            GetActiveTenantError::NotFound(GetTenantError::NotFound(tenant_id)),
-        )?;
-
        let peek_slot = tenant_map_peek_slot(&locked, &tenant_shard_id, TenantSlotPeekMode::Read)
            .map_err(GetTenantError::MapState)?;
        match peek_slot {
@@ -1232,7 +1126,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                        // Fast path: we don't need to do any async waiting.
                        return Ok(tenant.clone());
                    }
-                    _ => (WaitFor::Tenant(tenant.clone()), tenant_shard_id),
+                    _ => WaitFor::Tenant(tenant.clone()),
                }
            }
            Some(TenantSlot::Secondary) => {
@@ -1240,9 +1134,7 @@ pub(crate) async fn get_active_tenant_with_timeout(
                    tenant_id,
                )))
            }
-            Some(TenantSlot::InProgress(barrier)) => {
-                (WaitFor::Barrier(barrier.clone()), tenant_shard_id)
-            }
+            Some(TenantSlot::InProgress(barrier)) => WaitFor::Barrier(barrier.clone()),
            None => {
                return Err(GetActiveTenantError::NotFound(GetTenantError::NotFound(
                    tenant_id,
@@ -1327,7 +1219,8 @@ pub(crate) async fn delete_tenant(
    // See https://github.com/neondatabase/neon/issues/5080

    // TODO(sharding): make delete API sharding-aware
-    let slot_guard = tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;
+    let mut slot_guard =
+        tenant_map_acquire_slot(&tenant_shard_id, TenantSlotAcquireMode::MustExist)?;

    // unwrap is safe because we used MustExist mode when acquiring
    let tenant = match slot_guard.get_old_value().as_ref().unwrap() {
@@ -1484,14 +1377,12 @@ pub(crate) async fn load_tenant(

    Tenant::persist_tenant_config(conf, &tenant_shard_id, &location_conf).await?;

-    let shard_identity = location_conf.shard;
    let new_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_path,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1581,14 +1472,12 @@ pub(crate) async fn attach_tenant(
    // TODO: tenant directory remains on disk if we bail out from here on.
    //       See https://github.com/neondatabase/neon/issues/4233

-    let shard_identity = location_conf.shard;
    let attached_tenant = tenant_spawn(
        conf,
        tenant_shard_id,
        &tenant_dir,
        resources,
        AttachedTenantConf::try_from(location_conf)?,
-        shard_identity,
        None,
        &TENANTS,
        SpawnMode::Normal,
@@ -1654,10 +1543,9 @@ pub enum TenantSlotUpsertError {
    MapState(#[from] TenantMapError),
 }

-#[derive(Debug, thiserror::Error)]
+#[derive(Debug)]
 enum TenantSlotDropError {
    /// It is only legal to drop a TenantSlot if its contents are fully shut down
-    #[error("Tenant was not shut down")]
    NotShutdown,
 }

@@ -1717,9 +1605,9 @@ impl SlotGuard {
        }
    }

-    /// Get any value that was present in the slot before we acquired ownership
+    /// Take any value that was present in the slot before we acquired ownership
    /// of it: in state transitions, this will be the old state.
-    fn get_old_value(&self) -> &Option<TenantSlot> {
+    fn get_old_value(&mut self) -> &Option<TenantSlot> {
        &self.old_value
    }

--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -363,7 +363,7 @@ pub(super) async fn download_index_part(
        None => {
            // Migration from legacy pre-generation state: we have a generation but no prior
            // attached pageservers did.  Try to load from a no-generation path.
-            tracing::debug!("No index_part.json* found");
+            tracing::info!("No index_part.json* found");
            do_download_index_part(
                storage,
                tenant_shard_id,
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -230,10 +230,6 @@ impl Layer {
    ///
    /// It is up to the caller to collect more data from the previous layer and
    /// perform WAL redo, if necessary.
-    ///
-    /// # Cancellation-Safety
-    ///
-    /// This method is cancellation-safe.
    pub(crate) async fn get_value_reconstruct_data(
        &self,
        key: Key,
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -44,7 +44,6 @@ pub(crate) enum BackgroundLoopKind {
    Eviction,
    ConsumptionMetricsCollectMetrics,
    ConsumptionMetricsSyntheticSizeWorker,
-    InitialLogicalSizeCalculation,
 }

 impl BackgroundLoopKind {
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -18,29 +18,25 @@ use pageserver_api::{
        DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest, LayerMapInfo,
        TimelineState,
    },
-    shard::{ShardIdentity, TenantShardId},
+    shard::TenantShardId,
 };
-use rand::Rng;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
 use tokio::{
    runtime::Handle,
-    sync::{oneshot, watch},
+    sync::{oneshot, watch, TryAcquireError},
 };
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{id::TenantTimelineId, sync::gate::Gate};

+use std::cmp::{max, min, Ordering};
 use std::collections::{BinaryHeap, HashMap, HashSet};
 use std::ops::{Deref, Range};
 use std::pin::pin;
 use std::sync::atomic::Ordering as AtomicOrdering;
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};
-use std::{
-    cmp::{max, min, Ordering},
-    ops::ControlFlow,
-};

 use crate::context::{
    AccessStatsBehavior, DownloadBehavior, RequestContext, RequestContextBuilder,
@@ -167,10 +163,6 @@ pub struct Timeline {
    /// this copy enforces the invariant that generatio doesn't change during a Tenant's lifetime.
    pub(crate) generation: Generation,

-    /// The detailed sharding information from our parent Tenant.  This enables us to map keys
-    /// to shards, and is constant through the lifetime of this Timeline.
-    shard_identity: ShardIdentity,
-
    pub pg_version: u32,

    /// The tuple has two elements.
@@ -306,6 +298,13 @@ pub struct Timeline {

    eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,

+    /// Barrier to wait before doing initial logical size calculation. Used only during startup.
+    initial_logical_size_can_start: Option<completion::Barrier>,
+
+    /// Completion shared between all timelines loaded during startup; used to delay heavier
+    /// background tasks until some logical sizes have been calculated.
+    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
+
    /// Load or creation time information about the disk_consistent_lsn and when the loading
    /// happened. Used for consumption metrics.
    pub(crate) loaded_at: (Lsn, SystemTime),
@@ -454,11 +453,6 @@ pub enum LogicalSizeCalculationCause {
    TenantSizeHandler,
 }

-pub enum GetLogicalSizePriority {
-    User,
-    Background,
-}
-
 #[derive(enumset::EnumSetType)]
 pub(crate) enum CompactFlags {
    ForceRepartition,
@@ -495,9 +489,6 @@ impl Timeline {
    /// an ancestor branch, for example, or waste a lot of cycles chasing the
    /// non-existing key.
    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn get(
        &self,
        key: Key,
@@ -810,12 +801,7 @@ impl Timeline {
                    .access_stats_behavior(AccessStatsBehavior::Skip)
                    .build();

-                // 2. Compact
-                let timer = self.metrics.compact_time_histo.start_timer();
-                self.compact_level0(target_file_size, ctx).await?;
-                timer.stop_and_record();
-
-                // 3. Create new image layers for partitions that have been modified
+                // 2. Create new image layers for partitions that have been modified
                // "enough".
                let layers = self
                    .create_image_layers(&partitioning, lsn, false, &image_ctx)
@@ -827,6 +813,11 @@ impl Timeline {
                    }
                }

+                // 3. Compact
+                let timer = self.metrics.compact_time_histo.start_timer();
+                self.compact_level0(target_file_size, ctx).await?;
+                timer.stop_and_record();
+
                if let Some(remote_client) = &self.remote_client {
                    // should any new image layer been created, not uploading index_part will
                    // result in a mismatch between remote_physical_size and layermap calculated
@@ -858,6 +849,46 @@ impl Timeline {
        }
    }

+    /// Retrieve current logical size of the timeline.
+    ///
+    /// The size could be lagging behind the actual number, in case
+    /// the initial size calculation has not been run (gets triggered on the first size access).
+    ///
+    /// return size and boolean flag that shows if the size is exact
+    pub(crate) fn get_current_logical_size(
+        self: &Arc<Self>,
+        ctx: &RequestContext,
+    ) -> logical_size::CurrentLogicalSize {
+        let current_size = self.current_logical_size.current_size();
+        debug!("Current size: {current_size:?}");
+
+        if let (CurrentLogicalSize::Approximate(_), Some(initial_part_end)) =
+            (current_size, self.current_logical_size.initial_part_end)
+        {
+            self.try_spawn_size_init_task(initial_part_end, ctx);
+        }
+
+        if let CurrentLogicalSize::Approximate(_) = &current_size {
+            if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
+                let first = self
+                    .current_logical_size
+                    .did_return_approximate_to_walreceiver
+                    .compare_exchange(
+                        false,
+                        true,
+                        AtomicOrdering::Relaxed,
+                        AtomicOrdering::Relaxed,
+                    )
+                    .is_ok();
+                if first {
+                    crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
+                }
+            }
+        }
+
+        current_size
+    }
+
    /// Check if more than 'checkpoint_distance' of WAL has been accumulated in
    /// the in-memory layer, and initiate flushing it if so.
    ///
@@ -907,7 +938,6 @@ impl Timeline {
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
    ) {
-        self.spawn_initial_logical_size_computation_task(ctx);
        self.launch_wal_receiver(ctx, broker_client);
        self.set_state(TimelineState::Active);
        self.launch_eviction_task(background_jobs_can_start);
@@ -1021,6 +1051,17 @@ impl Timeline {
                error!("Not activating a Stopping timeline");
            }
            (_, new_state) => {
+                if matches!(
+                    new_state,
+                    TimelineState::Stopping | TimelineState::Broken { .. }
+                ) {
+                    // drop the completion guard, if any; it might be holding off the completion
+                    // forever needlessly
+                    self.initial_logical_size_attempt
+                        .lock()
+                        .unwrap_or_else(|e| e.into_inner())
+                        .take();
+                }
                self.state.send_replace(new_state);
            }
        }
@@ -1339,10 +1380,11 @@ impl Timeline {
        timeline_id: TimelineId,
        tenant_shard_id: TenantShardId,
        generation: Generation,
-        shard_identity: ShardIdentity,
        walredo_mgr: Arc<super::WalRedoManager>,
        resources: TimelineResources,
        pg_version: u32,
+        initial_logical_size_can_start: Option<completion::Barrier>,
+        initial_logical_size_attempt: Option<completion::Completion>,
        state: TimelineState,
        cancel: CancellationToken,
    ) -> Arc<Self> {
@@ -1369,7 +1411,6 @@ impl Timeline {
                timeline_id,
                tenant_shard_id,
                generation,
-                shard_identity,
                pg_version,
                layers: Arc::new(tokio::sync::RwLock::new(LayerManager::create())),
                wanted_image_layers: Mutex::new(None),
@@ -1443,6 +1484,8 @@ impl Timeline {
                ),
                delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())),

+                initial_logical_size_can_start,
+                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
                cancel,
                gate: Gate::new(format!("Timeline<{tenant_shard_id}/{timeline_id}>")),

@@ -1754,91 +1797,39 @@ impl Timeline {
        Ok(())
    }

-    /// Retrieve current logical size of the timeline.
-    ///
-    /// The size could be lagging behind the actual number, in case
-    /// the initial size calculation has not been run (gets triggered on the first size access).
-    ///
-    /// return size and boolean flag that shows if the size is exact
-    pub(crate) fn get_current_logical_size(
-        self: &Arc<Self>,
-        priority: GetLogicalSizePriority,
-        ctx: &RequestContext,
-    ) -> logical_size::CurrentLogicalSize {
-        let current_size = self.current_logical_size.current_size();
-        debug!("Current size: {current_size:?}");
-
-        match (current_size.accuracy(), priority) {
-            (logical_size::Accuracy::Exact, _) => (), // nothing to do
-            (logical_size::Accuracy::Approximate, GetLogicalSizePriority::Background) => {
-                // background task will eventually deliver an exact value, we're in no rush
-            }
-            (logical_size::Accuracy::Approximate, GetLogicalSizePriority::User) => {
-                // background task is not ready, but user is asking for it now;
-                // => make the background task skip the line
-                // (The alternative would be to calculate the size here, but,
-                //  it can actually take a long time if the user has a lot of rels.
-                //  And we'll inevitable need it again; So, let the background task do the work.)
-                match self
-                    .current_logical_size
-                    .cancel_wait_for_background_loop_concurrency_limit_semaphore
-                    .get()
-                {
-                    Some(cancel) => cancel.cancel(),
-                    None => {
-                        let state = self.current_state();
-                        if matches!(
-                            state,
-                            TimelineState::Broken { .. } | TimelineState::Stopping
-                        ) {
-
-                            // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
-                            // Don't make noise.
-                        } else {
-                            warn!("unexpected: cancel_wait_for_background_loop_concurrency_limit_semaphore not set, priority-boosting of logical size calculation will not work");
-                        }
-                    }
-                };
-            }
-        }
-
-        if let CurrentLogicalSize::Approximate(_) = &current_size {
-            if ctx.task_kind() == TaskKind::WalReceiverConnectionHandler {
-                let first = self
-                    .current_logical_size
-                    .did_return_approximate_to_walreceiver
-                    .compare_exchange(
-                        false,
-                        true,
-                        AtomicOrdering::Relaxed,
-                        AtomicOrdering::Relaxed,
-                    )
-                    .is_ok();
-                if first {
-                    crate::metrics::initial_logical_size::TIMELINES_WHERE_WALRECEIVER_GOT_APPROXIMATE_SIZE.inc();
-                }
-            }
-        }
-
-        current_size
-    }
-
-    fn spawn_initial_logical_size_computation_task(self: &Arc<Self>, ctx: &RequestContext) {
-        let Some(initial_part_end) = self.current_logical_size.initial_part_end else {
-            // nothing to do for freshly created timelines;
-            assert_eq!(
-                self.current_logical_size.current_size().accuracy(),
-                logical_size::Accuracy::Exact,
-            );
+    fn try_spawn_size_init_task(self: &Arc<Self>, lsn: Lsn, ctx: &RequestContext) {
+        let state = self.current_state();
+        if matches!(
+            state,
+            TimelineState::Broken { .. } | TimelineState::Stopping
+        ) {
+            // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken).
            return;
+        }
+
+        let permit = match Arc::clone(&self.current_logical_size.initial_size_computation)
+            .try_acquire_owned()
+        {
+            Ok(permit) => permit,
+            Err(TryAcquireError::NoPermits) => {
+                // computation already ongoing or finished with success
+                return;
+            }
+            Err(TryAcquireError::Closed) => unreachable!("we never call close"),
        };
+        debug_assert!(self
+            .current_logical_size
+            .initial_logical_size
+            .get()
+            .is_none());

-        let cancel_wait_for_background_loop_concurrency_limit_semaphore = CancellationToken::new();
-        let token = cancel_wait_for_background_loop_concurrency_limit_semaphore.clone();
-        self.current_logical_size
-            .cancel_wait_for_background_loop_concurrency_limit_semaphore.set(token)
-            .expect("initial logical size calculation task must be spawned exactly once per Timeline object");
-
+        info!(
+            "spawning logical size computation from context of task kind {:?}",
+            ctx.task_kind()
+        );
+        let causing_task_kind = ctx.task_kind();
+        // We need to start the computation task.
+        // It gets a separate context since it will outlive the request that called this function.
        let self_clone = Arc::clone(self);
        let background_ctx = ctx.detached_child(
            TaskKind::InitialLogicalSizeCalculation,
@@ -1853,152 +1844,96 @@ impl Timeline {
            false,
            // NB: don't log errors here, task_mgr will do that.
            async move {
+
                let cancel = task_mgr::shutdown_token();
-                self_clone
-                    .initial_logical_size_calculation_task(
-                        initial_part_end,
-                        cancel_wait_for_background_loop_concurrency_limit_semaphore,
-                        cancel,
-                        background_ctx,
-                    )
-                    .await;
-                Ok(())
-            }
-            .instrument(info_span!(parent: None, "initial_size_calculation", tenant_id=%self.tenant_shard_id.tenant_id, timeline_id=%self.timeline_id)),
-        );
-    }

-    async fn initial_logical_size_calculation_task(
-        self: Arc<Self>,
-        initial_part_end: Lsn,
-        skip_concurrency_limiter: CancellationToken,
-        cancel: CancellationToken,
-        background_ctx: RequestContext,
-    ) {
-        enum BackgroundCalculationError {
-            Cancelled,
-            Other(anyhow::Error),
-        }
-
-        let try_once = |attempt: usize| {
-            let background_ctx = &background_ctx;
-            let self_ref = &self;
-            let skip_concurrency_limiter = &skip_concurrency_limiter;
-            async move {
-                let cancel = task_mgr::shutdown_token();
-                let wait_for_permit = super::tasks::concurrent_background_tasks_rate_limit(
-                    BackgroundLoopKind::InitialLogicalSizeCalculation,
-                    background_ctx,
-                    &cancel,
-                );
-
-                use crate::metrics::initial_logical_size::StartCircumstances;
-                let (_maybe_permit, circumstances) = tokio::select! {
-                    res = wait_for_permit => {
-                        match res {
-                            Ok(permit) => (Some(permit), StartCircumstances::AfterBackgroundTasksRateLimit),
-                            Err(RateLimitError::Cancelled) => {
-                                return Err(BackgroundCalculationError::Cancelled);
-                            }
-                        }
-                    }
-                    () = skip_concurrency_limiter.cancelled() => {
-                        // Some action that is part of a end user interaction requested logical size
-                        // => break out of the rate limit
-                        // TODO: ideally we'd not run on BackgroundRuntime but the requester's runtime;
-                        // but then again what happens if they cancel; also, we should just be using
-                        // one runtime across the entire process, so, let's leave this for now.
-                        (None, StartCircumstances::SkippedConcurrencyLimiter)
-                    }
+                // in case we were created during pageserver initialization, wait for
+                // initialization to complete before proceeding. startup time init runs on the same
+                // runtime.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); },
+                    _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
                };

-                let metrics_guard = if attempt == 1 {
-                    crate::metrics::initial_logical_size::START_CALCULATION.first(circumstances)
-                } else {
-                    crate::metrics::initial_logical_size::START_CALCULATION.retry(circumstances)
+
+
+                // hold off background tasks from starting until all timelines get to try at least
+                // once initial logical size calculation; though retry will rarely be useful.
+                // holding off is done because heavier tasks execute blockingly on the same
+                // runtime.
+                //
+                // dropping this at every outcome is probably better than trying to cling on to it,
+                // delay will be terminated by a timeout regardless.
+                let completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
+
+                let metrics_guard = match &completion {
+                    Some(_) => crate::metrics::initial_logical_size::START_CALCULATION.first(Some(causing_task_kind)),
+                    None => crate::metrics::initial_logical_size::START_CALCULATION.retry(Some(causing_task_kind)),
                };

-                match self_ref
-                    .logical_size_calculation_task(
-                        initial_part_end,
-                        LogicalSizeCalculationCause::Initial,
-                        background_ctx,
-                    )
+                let calculated_size = match self_clone
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx)
                    .await
                {
-                    Ok(calculated_size) => Ok((calculated_size, metrics_guard)),
+                    Ok(s) => s,
                    Err(CalculateLogicalSizeError::Cancelled) => {
-                        Err(BackgroundCalculationError::Cancelled)
+                        // Don't make noise, this is a common task.
+                        // In the unlikely case that there is another call to this function, we'll retry
+                        // because initial_logical_size is still None.
+                        info!("initial size calculation cancelled, likely timeline delete / tenant detach");
+                        return Ok(());
                    }
                    Err(CalculateLogicalSizeError::Other(err)) => {
-                        if let Some(PageReconstructError::AncestorStopping(_)) =
+                        if let Some(e @ PageReconstructError::AncestorStopping(_)) =
                            err.root_cause().downcast_ref()
                        {
-                            Err(BackgroundCalculationError::Cancelled)
-                        } else {
-                            Err(BackgroundCalculationError::Other(err))
+                            // This can happen if the timeline parent timeline switches to
+                            // Stopping state while we're still calculating the initial
+                            // timeline size for the child, for example if the tenant is
+                            // being detached or the pageserver is shut down. Like with
+                            // CalculateLogicalSizeError::Cancelled, don't make noise.
+                            info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
+                            return Ok(());
                        }
+                        return Err(err.context("Failed to calculate logical size"));
+                    }
+                };
+
+                // we cannot query current_logical_size.current_size() to know the current
+                // *negative* value, only truncated to u64.
+                let added = self_clone
+                    .current_logical_size
+                    .size_added_after_initial
+                    .load(AtomicOrdering::Relaxed);
+
+                let sum = calculated_size.saturating_add_signed(added);
+
+                // set the gauge value before it can be set in `update_current_logical_size`.
+                self_clone.metrics.current_logical_size_gauge.set(sum);
+
+                match self_clone
+                    .current_logical_size
+                    .initial_logical_size
+                    .set((calculated_size, metrics_guard.calculation_result_saved()))
+                {
+                    Ok(()) => (),
+                    Err(_what_we_just_attempted_to_set) => {
+                        let (existing_size, _) = self_clone
+                            .current_logical_size
+                            .initial_logical_size
+                            .get()
+                            .expect("once_cell set was lost, then get failed, impossible.");
+                        // This shouldn't happen because the semaphore is initialized with 1.
+                        // But if it happens, just complain & report success so there are no further retries.
+                        error!("Tried to update initial timeline size value to {calculated_size}, but the size was already set to {existing_size}, not changing")
                    }
                }
-            }
-        };
-
-        let retrying = async {
-            let mut attempt = 0;
-            loop {
-                attempt += 1;
-
-                match try_once(attempt).await {
-                    Ok(res) => return ControlFlow::Continue(res),
-                    Err(BackgroundCalculationError::Cancelled) => return ControlFlow::Break(()),
-                    Err(BackgroundCalculationError::Other(e)) => {
-                        warn!(attempt, "initial size calculation failed: {e:?}");
-                        // exponential back-off doesn't make sense at these long intervals;
-                        // use fixed retry interval with generous jitter instead
-                        let sleep_duration = Duration::from_secs(
-                            u64::try_from(
-                                // 1hour base
-                                (60_i64 * 60_i64)
-                                    // 10min jitter
-                                    + rand::thread_rng().gen_range(-10 * 60..10 * 60),
-                            )
-                            .expect("10min < 1hour"),
-                        );
-                        tokio::time::sleep(sleep_duration).await;
-                    }
-                }
-            }
-        };
-
-        let (calculated_size, metrics_guard) = tokio::select! {
-            res = retrying  => {
-                match res {
-                    ControlFlow::Continue(calculated_size) => calculated_size,
-                    ControlFlow::Break(()) => return,
-                }
-            }
-            _ = cancel.cancelled() => {
-                return;
-            }
-        };
-
-        // we cannot query current_logical_size.current_size() to know the current
-        // *negative* value, only truncated to u64.
-        let added = self
-            .current_logical_size
-            .size_added_after_initial
-            .load(AtomicOrdering::Relaxed);
-
-        let sum = calculated_size.saturating_add_signed(added);
-
-        // set the gauge value before it can be set in `update_current_logical_size`.
-        self.metrics.current_logical_size_gauge.set(sum);
-
-        self.current_logical_size
-            .initial_logical_size
-            .set((calculated_size, metrics_guard.calculation_result_saved()))
-            .ok()
-            .expect("only this task sets it");
+                // now that `initial_logical_size.is_some()`, reduce permit count to 0
+                // so that we prevent future callers from spawning this task
+                permit.forget();
+                Ok(())
+            }.in_current_span(),
+        );
    }

    pub fn spawn_ondemand_logical_size_calculation(
@@ -2036,9 +1971,6 @@ impl Timeline {
        receiver
    }

-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    #[instrument(skip_all)]
    async fn logical_size_calculation_task(
        self: &Arc<Self>,
@@ -2076,10 +2008,6 @@ impl Timeline {
    ///
    /// NOTE: counted incrementally, includes ancestors. This can be a slow operation,
    /// especially if we need to download remote layers.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    pub async fn calculate_logical_size(
        &self,
        up_to_lsn: Lsn,
@@ -2195,10 +2123,6 @@ impl Timeline {
    ///
    /// This function takes the current timeline's locked LayerMap as an argument,
    /// so callers can avoid potential race conditions.
-    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
    async fn get_reconstruct_data(
        &self,
        key: Key,
@@ -2447,9 +2371,6 @@ impl Timeline {
        }
    }

-    /// # Cancel-safety
-    ///
-    /// This method is cancellation-safe.
    async fn lookup_cached_page(
        &self,
        key: &Key,
@@ -2484,10 +2405,6 @@ impl Timeline {
        Ok(Arc::clone(ancestor))
    }

-    pub(crate) fn get_shard_identity(&self) -> &ShardIdentity {
-        &self.shard_identity
-    }
-
    ///
    /// Get a handle to the latest layer for appending.
    ///
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -21,6 +21,7 @@ use crate::{
        },
        CreateTimelineCause, DeleteTimelineError, Tenant,
    },
+    InitializationOrder,
 };

 use super::{Timeline, TimelineResources};
@@ -406,6 +407,7 @@ impl DeleteTimelineFlow {
        local_metadata: &TimelineMetadata,
        remote_client: Option<RemoteTimelineClient>,
        deletion_queue_client: DeletionQueueClient,
+        init_order: Option<&InitializationOrder>,
    ) -> anyhow::Result<()> {
        // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
        // RemoteTimelineClient is the only functioning part.
@@ -418,6 +420,7 @@ impl DeleteTimelineFlow {
                    remote_client,
                    deletion_queue_client,
                },
+                init_order,
                // Important. We dont pass ancestor above because it can be missing.
                // Thus we need to skip the validation here.
                CreateTimelineCause::Delete,
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -1,10 +1,11 @@
 use anyhow::Context;
-
 use once_cell::sync::OnceCell;
-use tokio_util::sync::CancellationToken;
+
+use tokio::sync::Semaphore;
 use utils::lsn::Lsn;

 use std::sync::atomic::{AtomicBool, AtomicI64, Ordering as AtomicOrdering};
+use std::sync::Arc;

 /// Internal structure to hold all data needed for logical size calculation.
 ///
@@ -27,12 +28,8 @@ pub(super) struct LogicalSize {
        crate::metrics::initial_logical_size::FinishedCalculationGuard,
    )>,

-    /// Cancellation for the best-effort logical size calculation.
-    ///
-    /// The token is kept in a once-cell so that we can error out if a higher priority
-    /// request comes in *before* we have started the normal logical size calculation.
-    pub(crate) cancel_wait_for_background_loop_concurrency_limit_semaphore:
-        OnceCell<CancellationToken>,
+    /// Semaphore to track ongoing calculation of `initial_logical_size`.
+    pub initial_size_computation: Arc<tokio::sync::Semaphore>,

    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
    pub initial_part_end: Option<Lsn>,
@@ -75,7 +72,7 @@ pub(crate) enum CurrentLogicalSize {
    Exact(Exact),
 }

-#[derive(Debug, Copy, Clone, PartialEq, Eq)]
+#[derive(Debug, Copy, Clone)]
 pub(crate) enum Accuracy {
    Approximate,
    Exact,
@@ -118,10 +115,11 @@ impl LogicalSize {
        Self {
            initial_logical_size: OnceCell::with_value((0, {
                crate::metrics::initial_logical_size::START_CALCULATION
-                    .first(crate::metrics::initial_logical_size::StartCircumstances::EmptyInitial)
+                    .first(None)
                    .calculation_result_saved()
            })),
-            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
+            //  initial_logical_size already computed, so, don't admit any calculations
+            initial_size_computation: Arc::new(Semaphore::new(0)),
            initial_part_end: None,
            size_added_after_initial: AtomicI64::new(0),
            did_return_approximate_to_walreceiver: AtomicBool::new(false),
@@ -131,7 +129,7 @@ impl LogicalSize {
    pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
        Self {
            initial_logical_size: OnceCell::new(),
-            cancel_wait_for_background_loop_concurrency_limit_semaphore: OnceCell::new(),
+            initial_size_computation: Arc::new(Semaphore::new(1)),
            initial_part_end: Some(compute_to),
            size_added_after_initial: AtomicI64::new(0),
            did_return_approximate_to_walreceiver: AtomicBool::new(false),
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -397,10 +397,7 @@ pub(super) async fn handle_walreceiver_connection(
            // Send the replication feedback message.
            // Regular standby_status_update fields are put into this message.
            let current_timeline_size = timeline
-                .get_current_logical_size(
-                    crate::tenant::timeline::GetLogicalSizePriority::User,
-                    &ctx,
-                )
+                .get_current_logical_size(&ctx)
                // FIXME: https://github.com/neondatabase/neon/issues/5963
                .size_dont_care_about_accuracy();
            let status_update = PageserverFeedback {
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -34,6 +34,7 @@ use std::process::{Child, ChildStdin, ChildStdout, Command};
 use std::sync::{Arc, Mutex, MutexGuard, RwLock};
 use std::time::Duration;
 use std::time::Instant;
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::{bin_ser::BeSer, id::TenantId, lsn::Lsn, nonblock::set_nonblock};

@@ -123,9 +124,7 @@ impl PostgresRedoManager {
    /// The WAL redo is handled by a separate thread, so this just sends a request
    /// to the thread and waits for response.
    ///
-    /// # Cancel-Safety
-    ///
-    /// This method is cancellation-safe.
+    /// CANCEL SAFETY: NOT CANCEL SAFE.
    pub async fn request_redo(
        &self,
        key: Key,
@@ -158,6 +157,7 @@ impl PostgresRedoManager {
                        self.conf.wal_redo_timeout,
                        pg_version,
                    )
+                    .await
                };
                img = Some(result?);

@@ -178,6 +178,7 @@ impl PostgresRedoManager {
                self.conf.wal_redo_timeout,
                pg_version,
            )
+            .await
        }
    }
 }
@@ -215,7 +216,7 @@ impl PostgresRedoManager {
    /// Process one request for WAL redo using wal-redo postgres
    ///
    #[allow(clippy::too_many_arguments)]
-    fn apply_batch_postgres(
+    async fn apply_batch_postgres(
        &self,
        key: Key,
        lsn: Lsn,
@@ -331,7 +332,12 @@ impl PostgresRedoManager {
                // than we can SIGKILL & `wait` for them to exit. By doing it the way we do here,
                // we limit this risk of run-away to at most $num_runtimes * $num_executor_threads.
                // This probably needs revisiting at some later point.
+                let mut wait_done = proc.stderr_logger_task_done.clone();
                drop(proc);
+                wait_done
+                    .wait_for(|v| *v)
+                    .await
+                    .expect("we use scopeguard to ensure we always send `true` to the channel before dropping the sender");
            } else if n_attempts != 0 {
                info!(n_attempts, "retried walredo succeeded");
            }
@@ -643,6 +649,8 @@ struct WalRedoProcess {
    child: Option<NoLeakChild>,
    stdout: Mutex<ProcessOutput>,
    stdin: Mutex<ProcessInput>,
+    stderr_logger_cancel: CancellationToken,
+    stderr_logger_task_done: tokio::sync::watch::Receiver<bool>,
    /// Counter to separate same sized walredo inputs failing at the same millisecond.
    #[cfg(feature = "testing")]
    dump_sequence: AtomicUsize,
@@ -691,8 +699,6 @@ impl WalRedoProcess {
        let stdin = child.stdin.take().unwrap();
        let stdout = child.stdout.take().unwrap();
        let stderr = child.stderr.take().unwrap();
-        let stderr = tokio::process::ChildStderr::from_std(stderr)
-            .context("convert to tokio::ChildStderr")?;
        macro_rules! set_nonblock_or_log_err {
            ($file:ident) => {{
                let res = set_nonblock($file.as_raw_fd());
@@ -704,45 +710,69 @@ impl WalRedoProcess {
        }
        set_nonblock_or_log_err!(stdin)?;
        set_nonblock_or_log_err!(stdout)?;
+        set_nonblock_or_log_err!(stderr)?;
+
+        let mut stderr = tokio::io::unix::AsyncFd::new(stderr).context("AsyncFd::with_interest")?;

        // all fallible operations post-spawn are complete, so get rid of the guard
        let child = scopeguard::ScopeGuard::into_inner(child);

-        tokio::spawn(
+        let stderr_logger_cancel = CancellationToken::new();
+        let (stderr_logger_task_done_tx, stderr_logger_task_done_rx) =
+            tokio::sync::watch::channel(false);
+        tokio::spawn({
+            let stderr_logger_cancel = stderr_logger_cancel.clone();
            async move {
                scopeguard::defer! {
                    debug!("wal-redo-postgres stderr_logger_task finished");
-                    crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_finished.inc();
+                    let _ = stderr_logger_task_done_tx.send(true);
                }
                debug!("wal-redo-postgres stderr_logger_task started");
-                crate::metrics::WAL_REDO_PROCESS_COUNTERS.active_stderr_logger_tasks_started.inc();
-
-                use tokio::io::AsyncBufReadExt;
-                let mut stderr_lines = tokio::io::BufReader::new(stderr);
-                let mut buf = Vec::new();
-                let res = loop {
-                    buf.clear();
-                    // TODO we don't trust the process to cap its stderr length.
-                    // Currently it can do unbounded Vec allocation.
-                    match stderr_lines.read_until(b'\n', &mut buf).await {
-                        Ok(0) => break Ok(()), // eof
-                        Ok(num_bytes) => {
-                            let output = String::from_utf8_lossy(&buf[..num_bytes]);
-                            error!(%output, "received output");
+                loop {
+                    // NB: we purposefully don't do a select! for the cancellation here.
+                    // The cancellation would likely cause us to miss stderr messages.
+                    // We can rely on this to return from .await because when we SIGKILL
+                    // the child, the writing end of the stderr pipe gets closed.
+                    match stderr.readable_mut().await {
+                        Ok(mut guard) => {
+                            let mut errbuf = [0; 16384];
+                            let res = guard.try_io(|fd| {
+                                use std::io::Read;
+                                fd.get_mut().read(&mut errbuf)
+                            });
+                            match res {
+                                Ok(Ok(0)) => {
+                                    // it closed the stderr pipe
+                                    break;
+                                }
+                                Ok(Ok(n)) => {
+                                    // The message might not be split correctly into lines here. But this is
+                                    // good enough, the important thing is to get the message to the log.
+                                    let output = String::from_utf8_lossy(&errbuf[0..n]).to_string();
+                                    error!(output, "received output");
+                                },
+                                Ok(Err(e)) => {
+                                    error!(error = ?e, "read() error, waiting for cancellation");
+                                    stderr_logger_cancel.cancelled().await;
+                                    error!(error = ?e, "read() error, cancellation complete");
+                                    break;
+                                }
+                                Err(e) => {
+                                    let _e: tokio::io::unix::TryIoError = e;
+                                    // the read() returned WouldBlock, that's expected
+                                }
+                            }
                        }
                        Err(e) => {
-                            break Err(e);
+                            error!(error = ?e, "read() error, waiting for cancellation");
+                            stderr_logger_cancel.cancelled().await;
+                            error!(error = ?e, "read() error, cancellation complete");
+                            break;
                        }
                    }
-                };
-                match res {
-                    Ok(()) => (),
-                    Err(e) => {
-                        error!(error=?e, "failed to read from walredo stderr");
-                    }
                }
            }.instrument(tracing::info_span!(parent: None, "wal-redo-postgres-stderr", pid = child.id(), tenant_id = %tenant_id, %pg_version))
-        );
+        });

        Ok(Self {
            conf,
@@ -757,6 +787,8 @@ impl WalRedoProcess {
                pending_responses: VecDeque::new(),
                n_processed_responses: 0,
            }),
+            stderr_logger_cancel,
+            stderr_logger_task_done: stderr_logger_task_done_rx,
            #[cfg(feature = "testing")]
            dump_sequence: AtomicUsize::default(),
        })
@@ -997,6 +1029,7 @@ impl Drop for WalRedoProcess {
            .take()
            .expect("we only do this once")
            .kill_and_wait(WalRedoKillCause::WalRedoProcessDrop);
+        self.stderr_logger_cancel.cancel();
        // no way to wait for stderr_logger_task from Drop because that is async only
    }
 }
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -19,6 +19,7 @@
 #include "access/xlog.h"
 #include "access/xlogutils.h"
 #include "storage/buf_internals.h"
+#include "storage/lwlock.h"
 #include "storage/ipc.h"
 #include "storage/pg_shmem.h"
 #include "c.h"
@@ -36,8 +37,6 @@
 #include "walproposer.h"
 #include "neon_utils.h"

-#include <pthread.h>
-
 #define PageStoreTrace DEBUG5

 #define RECONNECT_INTERVAL_USEC 1000000
@@ -70,7 +69,7 @@ int			max_reconnect_attempts = 60;

 typedef struct
 {
-    pthread_rwlock_t lock;
+    LWLockId lock;
    pg_atomic_uint64 update_counter;
    char pageserver_connstring[MAX_PAGESERVER_CONNSTRING_SIZE];
 } PagestoreShmemState;
@@ -106,10 +105,10 @@ AssignPageserverConnstring(const char *newval, void *extra)
 {
    if(!PagestoreShmemIsValid())
        return;
-    pthread_rwlock_wrlock(&pagestore_shared->lock);
+    LWLockAcquire(pagestore_shared->lock, LW_EXCLUSIVE);
    strlcpy(pagestore_shared->pageserver_connstring, newval, MAX_PAGESERVER_CONNSTRING_SIZE);
    pg_atomic_fetch_add_u64(&pagestore_shared->update_counter, 1);
-    pthread_rwlock_unlock(&pagestore_shared->lock);
+    LWLockRelease(pagestore_shared->lock);
 }

 static bool
@@ -120,24 +119,15 @@ CheckConnstringUpdated()
    return pagestore_local_counter < pg_atomic_read_u64(&pagestore_shared->update_counter);
 }

-/* Returns true if the connstring has changed and false if not */
-static bool
+static void
 ReloadConnstring()
 {
    if(!PagestoreShmemIsValid())
-        return false;
-    pthread_rwlock_rdlock(&pagestore_shared->lock);
-
-    if(strcmp(local_pageserver_connstring, pagestore_shared->pageserver_connstring) == 0)
-    {
-        pthread_rwlock_unlock(&pagestore_shared->lock);
-        return false;
-    }
-
+        return;
+    LWLockAcquire(pagestore_shared->lock, LW_SHARED);
    strlcpy(local_pageserver_connstring, pagestore_shared->pageserver_connstring, sizeof(local_pageserver_connstring));
    pagestore_local_counter = pg_atomic_read_u64(&pagestore_shared->update_counter);
-    pthread_rwlock_unlock(&pagestore_shared->lock);
-    return true;
+    LWLockRelease(pagestore_shared->lock);
 }

 static bool
@@ -300,6 +290,7 @@ pageserver_disconnect(void)
 	 */
 	if (connected)
 	{
+		neon_log(LOG, "dropping connection to page server due to error");
 		PQfinish(pageserver_conn);
 		pageserver_conn = NULL;
 		connected = false;
@@ -320,12 +311,8 @@ pageserver_send(NeonRequest * request)

        if(CheckConnstringUpdated())
        {
-            bool should_disconnect = ReloadConnstring();
-            if(should_disconnect)
-            {
-                neon_log(LOG, "pageserver_send disconnect because connstring changed");
-                pageserver_disconnect();
-            }
+            pageserver_disconnect();
+            ReloadConnstring();
        }

 	/* If the connection was lost for some reason, reconnect */
@@ -497,12 +484,7 @@ PagestoreShmemInit(void)
                                       &found);
    if(!found)
    {
-        pthread_rwlockattr_t attr;
-        pthread_rwlockattr_init(&attr);
-        pthread_rwlockattr_setpshared(&attr, PTHREAD_PROCESS_SHARED); 
-        pthread_rwlock_init(&pagestore_shared->lock, &attr);
-        pthread_rwlockattr_destroy(&attr);
-
+        pagestore_shared->lock = &(GetNamedLWLockTranche("neon_libpagestore")->lock);
        pg_atomic_init_u64(&pagestore_shared->update_counter, 0);
        AssignPageserverConnstring(page_server_connstring, NULL);
    }
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -59,7 +59,6 @@
 #include "replication/walsender.h"
 #include "storage/bufmgr.h"
 #include "storage/buf_internals.h"
-#include "storage/fsm_internals.h"
 #include "storage/smgr.h"
 #include "storage/md.h"
 #include "pgstat.h"
@@ -2723,86 +2722,6 @@ smgr_init_neon(void)
 }


-static void
-neon_extend_rel_size(NRelFileInfo rinfo, ForkNumber forknum, BlockNumber blkno, XLogRecPtr end_recptr)
-{
-	BlockNumber relsize;
-	/* Extend the relation if we know its size */
-	if (get_cached_relsize(rinfo, forknum, &relsize))
-	{
-		if (relsize < blkno + 1)
-		{
-			update_cached_relsize(rinfo, forknum, blkno + 1);
-			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-		}
-	}
-	else
-	{
-		/*
-		 * Size was not cached. We populate the cache now, with the size of the
-		 * relation measured after this WAL record is applied.
-		 *
-		 * This length is later reused when we open the smgr to read the block,
-		 * which is fine and expected.
-		 */
-
-		NeonResponse *response;
-		NeonNblocksResponse *nbresponse;
-		NeonNblocksRequest request = {
-			.req = (NeonRequest) {
-				.lsn = end_recptr,
-				.latest = false,
-				.tag = T_NeonNblocksRequest,
-			},
-			.rinfo = rinfo,
-			.forknum = forknum,
-		};
-
-		response = page_server_request(&request);
-
-		Assert(response->tag == T_NeonNblocksResponse);
-		nbresponse = (NeonNblocksResponse *) response;
-
-		relsize = Max(nbresponse->n_blocks, blkno+1);
-
-		set_cached_relsize(rinfo, forknum, relsize);
-		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
-
-		elog(SmgrTrace, "Set length to %d", relsize);
-	}
-}
-
-#define FSM_TREE_DEPTH	((SlotsPerFSMPage >= 1626) ? 3 : 4)
-
-/*
- * TODO: May be it is better to make correspondent fgunctio from freespace.c public?
- */
-static BlockNumber
-get_fsm_physical_block(BlockNumber heapblk)
-{
-	BlockNumber pages;
-	int			leafno;
-	int			l;
-
-	/*
-	 * Calculate the logical page number of the first leaf page below the
-	 * given page.
-	 */
-	leafno = heapblk / SlotsPerFSMPage;
-
-	/* Count upper level nodes required to address the leaf page */
-	pages = 0;
-	for (l = 0; l < FSM_TREE_DEPTH; l++)
-	{
-		pages += leafno + 1;
-		leafno /= SlotsPerFSMPage;
-	}
-
-	/* Turn the page count into 0-based block number */
-	return pages - 1;
-}
-
-
 /*
 * Return whether we can skip the redo for this block.
 * 
@@ -2850,6 +2769,7 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)
 	LWLock	   *partitionLock;
 	Buffer		buffer;
 	bool		no_redo_needed;
+	BlockNumber relsize;

 	if (old_redo_read_buffer_filter && old_redo_read_buffer_filter(record, block_id))
 		return true;
@@ -2899,10 +2819,49 @@ neon_redo_read_buffer_filter(XLogReaderState *record, uint8 block_id)

 	LWLockRelease(partitionLock);

-	neon_extend_rel_size(rinfo, forknum, blkno, end_recptr);
-	if (forknum == MAIN_FORKNUM)
+	/* Extend the relation if we know its size */
+	if (get_cached_relsize(rinfo, forknum, &relsize))
 	{
-		neon_extend_rel_size(rinfo, FSM_FORKNUM, get_fsm_physical_block(blkno), end_recptr);
+		if (relsize < blkno + 1)
+		{
+			update_cached_relsize(rinfo, forknum, blkno + 1);
+			SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+		}
 	}
+	else
+	{
+		/*
+		 * Size was not cached. We populate the cache now, with the size of the
+		 * relation measured after this WAL record is applied.
+		 *
+		 * This length is later reused when we open the smgr to read the block,
+		 * which is fine and expected.
+		 */
+
+		NeonResponse *response;
+		NeonNblocksResponse *nbresponse;
+		NeonNblocksRequest request = {
+			.req = (NeonRequest) {
+				.lsn = end_recptr,
+				.latest = false,
+				.tag = T_NeonNblocksRequest,
+			},
+			.rinfo = rinfo,
+			.forknum = forknum,
+		};
+
+		response = page_server_request(&request);
+
+		Assert(response->tag == T_NeonNblocksResponse);
+		nbresponse = (NeonNblocksResponse *) response;
+
+		Assert(nbresponse->n_blocks > blkno);
+
+		set_cached_relsize(rinfo, forknum, nbresponse->n_blocks);
+		SetLastWrittenLSNForRelation(end_recptr, rinfo, forknum);
+
+		elog(SmgrTrace, "Set length to %d", nbresponse->n_blocks);
+	}
+
 	return no_redo_needed;
 }
--- a/proxy/src/bin/proxy.rs
+++ b/proxy/src/bin/proxy.rs
@@ -103,7 +103,7 @@ struct ProxyCliArgs {
    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    require_client_ip: bool,
    /// Disable dynamic rate limiter and store the metrics to ensure its production behaviour.
-    #[clap(long, default_value_t = false, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
+    #[clap(long, default_value_t = true, value_parser = clap::builder::BoolishValueParser::new(), action = clap::ArgAction::Set)]
    disable_dynamic_rate_limiter: bool,
    /// Rate limit algorithm. Makes sense only if `disable_rate_limiter` is `false`.
    #[clap(value_enum, long, default_value_t = proxy::rate_limiter::RateLimitAlgorithm::Aimd)]
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -134,9 +134,9 @@ pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {

 pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!(
-        "proxy_control_plane_token_acquire_seconds",
+        "semaphore_control_plane_token_acquire_seconds",
        "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.05ms = 2.15s
+        // largest bucket = 3^16 * 0.00005ms = 2.15s
        exponential_buckets(0.00005, 3.0, 16).unwrap(),
    )
    .unwrap()
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -914,14 +914,9 @@ where
        Ok(())
    }

-    /// Persist in-memory state of control file to disk.
-    //
-    // TODO: passing inmem_remote_consistent_lsn everywhere is ugly, better
-    // separate state completely and give Arc to all those who need it.
-    pub async fn persist_inmem(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
-        let mut state = self.state.clone();
-        state.remote_consistent_lsn = inmem_remote_consistent_lsn;
-        self.persist_control_file(state).await
+    /// Persist control file to disk, called only after timeline creation (bootstrap).
+    pub async fn persist(&mut self) -> Result<()> {
+        self.persist_control_file(self.state.clone()).await
    }

    /// Persist in-memory state to the disk, taking other data from state.
@@ -935,7 +930,7 @@ where

    /// Persist control file if there is something to save and enough time
    /// passed after the last save.
-    pub async fn maybe_persist_inmem_control_file(
+    pub async fn maybe_persist_control_file(
        &mut self,
        inmem_remote_consistent_lsn: Lsn,
    ) -> Result<()> {
@@ -948,7 +943,9 @@ where
            || self.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
            || inmem_remote_consistent_lsn > self.state.remote_consistent_lsn;
        if need_persist {
-            self.persist_inmem(inmem_remote_consistent_lsn).await?;
+            let mut state = self.state.clone();
+            state.remote_consistent_lsn = inmem_remote_consistent_lsn;
+            self.persist_control_file(state).await?;
            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
        }
        Ok(())
@@ -1067,6 +1064,8 @@ where

        if sync_control_file {
            let mut state = self.state.clone();
+            // Note: we could make remote_consistent_lsn update in cf common by
+            // storing Arc to walsenders in Safekeeper.
            state.remote_consistent_lsn = new_remote_consistent_lsn;
            self.persist_control_file(state).await?;
        }
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -182,9 +182,8 @@ impl SharedState {
    }

    /// Mark timeline active/inactive and return whether s3 offloading requires
-    /// start/stop action. If timeline is deactivated, control file is persisted
-    /// as maintenance task does that only for active timelines.
-    async fn update_status(
+    /// start/stop action.
+    fn update_status(
        &mut self,
        num_computes: usize,
        remote_consistent_lsn: Lsn,
@@ -192,15 +191,7 @@ impl SharedState {
    ) -> bool {
        let is_active = self.is_active(num_computes, remote_consistent_lsn);
        if self.active != is_active {
-            info!(
-                "timeline {} active={} now, remote_consistent_lsn={}, commit_lsn={}",
-                ttid, is_active, remote_consistent_lsn, self.sk.inmem.commit_lsn
-            );
-            if !is_active {
-                if let Err(e) = self.sk.persist_inmem(remote_consistent_lsn).await {
-                    warn!("control file save in update_status failed: {:?}", e);
-                }
-            }
+            info!("timeline {} active={} now", ttid, is_active);
        }
        self.active = is_active;
        self.is_wal_backup_action_pending(num_computes)
@@ -447,7 +438,7 @@ impl Timeline {
        fs::create_dir_all(&self.timeline_dir).await?;

        // Write timeline to disk and start background tasks.
-        if let Err(e) = shared_state.sk.persist_inmem(Lsn::INVALID).await {
+        if let Err(e) = shared_state.sk.persist().await {
            // Bootstrap failed, cancel timeline and remove timeline directory.
            self.cancel(shared_state);

@@ -520,14 +511,12 @@ impl Timeline {
        self.mutex.lock().await
    }

-    async fn update_status(&self, shared_state: &mut SharedState) -> bool {
-        shared_state
-            .update_status(
-                self.walreceivers.get_num(),
-                self.get_walsenders().get_remote_consistent_lsn(),
-                self.ttid,
-            )
-            .await
+    fn update_status(&self, shared_state: &mut SharedState) -> bool {
+        shared_state.update_status(
+            self.walreceivers.get_num(),
+            self.get_walsenders().get_remote_consistent_lsn(),
+            self.ttid,
+        )
    }

    /// Update timeline status and kick wal backup launcher to stop/start offloading if needed.
@@ -537,7 +526,7 @@ impl Timeline {
        }
        let is_wal_backup_action_pending: bool = {
            let mut shared_state = self.write_shared_state().await;
-            self.update_status(&mut shared_state).await
+            self.update_status(&mut shared_state)
        };
        if is_wal_backup_action_pending {
            // Can fail only if channel to a static thread got closed, which is not normal at all.
@@ -694,7 +683,7 @@ impl Timeline {
            shared_state.sk.record_safekeeper_info(&sk_info).await?;
            let peer_info = PeerInfo::from_sk_info(&sk_info, Instant::now());
            shared_state.peers_info.upsert(&peer_info);
-            is_wal_backup_action_pending = self.update_status(&mut shared_state).await;
+            is_wal_backup_action_pending = self.update_status(&mut shared_state);
            commit_lsn = shared_state.sk.inmem.commit_lsn;
        }
        self.commit_lsn_watch_tx.send(commit_lsn)?;
@@ -839,7 +828,7 @@ impl Timeline {
        self.write_shared_state()
            .await
            .sk
-            .maybe_persist_inmem_control_file(remote_consistent_lsn)
+            .maybe_persist_control_file(remote_consistent_lsn)
            .await
    }

--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -260,14 +260,6 @@ class PageserverHttpClient(requests.Session):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
        self.verbose_error(res)

-    def tenant_reset(self, tenant_id: TenantId, drop_cache: bool):
-        params = {}
-        if drop_cache:
-            params["drop_cache"] = "true"
-
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/reset", params=params)
-        self.verbose_error(res)
-
    def tenant_delete(self, tenant_id: TenantId):
        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
        self.verbose_error(res)
--- a/test_runner/pg_clients/test_pg_clients.py
+++ b/test_runner/pg_clients/test_pg_clients.py
@@ -48,6 +48,6 @@ def test_pg_clients(test_output_dir: Path, remote_pg: RemotePostgres, client: st
    subprocess_capture(test_output_dir, build_cmd, check=True)

    run_cmd = [docker_bin, "run", "--rm", "--env-file", env_file, image_tag]
-    _, output, _ = subprocess_capture(test_output_dir, run_cmd, check=True, capture_stdout=True)
+    basepath, _, _ = subprocess_capture(test_output_dir, run_cmd, check=True)

-    assert str(output).strip() == "1"
+    assert Path(f"{basepath}.stdout").read_text().strip() == "1"
--- a/test_runner/regress/test_change_pageserver.py
+++ b/test_runner/regress/test_change_pageserver.py
@@ -60,10 +60,7 @@ def test_change_pageserver(neon_env_builder: NeonEnvBuilder):
    execute("SELECT count(*) FROM foo")
    assert fetchone() == (100000,)

-    # Reconfigure it using the same connstring just to make sure nothing breaks
-    # as we have special handling for if the connstring doesn't change
-    for _ in range(5):
-        endpoint.reconfigure(pageserver_id=alt_pageserver_id)
+    endpoint.reconfigure(pageserver_id=alt_pageserver_id)

    # Verify that the neon.pageserver_connstring GUC is set to the correct thing
    execute("SELECT setting FROM pg_settings WHERE name='neon.pageserver_connstring'")
--- a/test_runner/regress/test_layers_from_future.py
+++ b/test_runner/regress/test_layers_from_future.py
@@ -49,7 +49,7 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
        "compaction_period": "0s",  # we want to control when compaction runs
        "checkpoint_timeout": "24h",  # something we won't reach
        "checkpoint_distance": f"{50 * (1024**2)}",  # something we won't reach, we checkpoint manually
-        "image_creation_threshold": "100",  # we want to control when image is created
+        "image_creation_threshold": f"{image_creation_threshold}",
        "compaction_threshold": f"{l0_l1_threshold}",
        "compaction_target_size": f"{128 * (1024**3)}",  # make it so that we only have 1 partition => image coverage for delta layers => enables gc of delta layers
    }
@@ -124,10 +124,6 @@ def test_issue_5878(neon_env_builder: NeonEnvBuilder):
    ), "sanity check for what above loop is supposed to do"

    # create the image layer from the future
-    ps_http.patch_tenant_config_client_side(
-        tenant_id, {"image_creation_threshold": image_creation_threshold}, None
-    )
-    assert ps_http.tenant_config(tenant_id).effective_config["image_creation_threshold"] == 1
    ps_http.timeline_compact(tenant_id, timeline_id, force_repartition=True)
    assert (
        len(
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -384,7 +384,7 @@ def test_download_remote_layers_api(
    env.pageserver.allowed_errors.extend(
        [
            ".*download failed: downloading evicted layer file failed.*",
-            f".*initial_size_calculation.*{tenant_id}.*{timeline_id}.*initial size calculation failed: downloading evicted layer file failed",
+            f".*initial size calculation.*{tenant_id}.*{timeline_id}.*Failed to calculate logical size",
        ]
    )

--- a/test_runner/regress/test_pageserver_restart.py
+++ b/test_runner/regress/test_pageserver_restart.py
@@ -106,6 +106,7 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder, generations: bool)
        # Initial tenant load should reflect the delay we injected
        ("initial_tenant_load", lambda t, p: t >= (tenant_load_delay_ms / 1000.0) and t >= p),
        # Subsequent steps should occur in expected order
+        ("initial_logical_sizes", lambda t, p: t > 0 and t >= p),
        ("background_jobs_can_start", lambda t, p: t > 0 and t >= p),
        ("complete", lambda t, p: t > 0 and t >= p),
    ]
--- a/test_runner/regress/test_physical_replication.py
+++ b/test_runner/regress/test_physical_replication.py
@@ -1,29 +0,0 @@
-import random
-import time
-
-from fixtures.neon_fixtures import NeonEnv
-
-
-def test_physical_replication(neon_simple_env: NeonEnv):
-    env = neon_simple_env
-    n_records = 100000
-    with env.endpoints.create_start(
-        branch_name="main",
-        endpoint_id="primary",
-    ) as primary:
-        with primary.connect() as p_con:
-            with p_con.cursor() as p_cur:
-                p_cur.execute(
-                    "CREATE TABLE t(pk bigint primary key, payload text default repeat('?',200))"
-                )
-        time.sleep(1)
-        with env.endpoints.new_replica_start(origin=primary, endpoint_id="secondary") as secondary:
-            with primary.connect() as p_con:
-                with p_con.cursor() as p_cur:
-                    with secondary.connect() as s_con:
-                        with s_con.cursor() as s_cur:
-                            for pk in range(n_records):
-                                p_cur.execute("insert into t (pk) values (%s)", (pk,))
-                                s_cur.execute(
-                                    "select * from t where pk=%s", (random.randrange(1, n_records),)
-                                )
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -1,5 +1,4 @@
 import asyncio
-import enum
 import random
 import time
 from threading import Thread
@@ -52,20 +51,11 @@ def do_gc_target(
        log.info("gc http thread returning")


-class ReattachMode(str, enum.Enum):
-    REATTACH_EXPLICIT = "explicit"
-    REATTACH_RESET = "reset"
-    REATTACH_RESET_DROP = "reset"
-
-
 # Basic detach and re-attach test
@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
-@pytest.mark.parametrize(
-    "mode",
-    [ReattachMode.REATTACH_EXPLICIT, ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP],
-)
 def test_tenant_reattach(
-    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind, mode: str
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
 ):
    neon_env_builder.enable_pageserver_remote_storage(remote_storage_kind)

@@ -110,15 +100,8 @@ def test_tenant_reattach(
        ps_metrics.query_one("pageserver_last_record_lsn", filter=tenant_metric_filter).value
    )

-    if mode == ReattachMode.REATTACH_EXPLICIT:
-        # Explicitly detach then attach the tenant as two separate API calls
-        pageserver_http.tenant_detach(tenant_id)
-        pageserver_http.tenant_attach(tenant_id)
-    elif mode in (ReattachMode.REATTACH_RESET, ReattachMode.REATTACH_RESET_DROP):
-        # Use the reset API to detach/attach in one shot
-        pageserver_http.tenant_reset(tenant_id, mode == ReattachMode.REATTACH_RESET_DROP)
-    else:
-        raise NotImplementedError(mode)
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)

    time.sleep(1)  # for metrics propagation

--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -30,7 +30,6 @@ from fixtures.neon_fixtures import (
    Safekeeper,
    SafekeeperHttpClient,
    SafekeeperPort,
-    last_flush_lsn_upload,
 )
 from fixtures.pageserver.utils import (
    timeline_delete_wait_completed,
@@ -287,43 +286,29 @@ def test_broker(neon_env_builder: NeonEnvBuilder):
    # wait until remote_consistent_lsn gets advanced on all safekeepers
    clients = [sk.http_client() for sk in env.safekeepers]
    stat_before = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
-    log.info(f"statuses before insert: {stat_before}")
+    log.info(f"statuses is {stat_before}")

    endpoint.safe_psql("INSERT INTO t SELECT generate_series(1,100), 'payload'")

-    # wait for remote_consistent_lsn to reach flush_lsn, forcing it with checkpoint
-    new_rcl = last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
-    log.info(f"new_rcl: {new_rcl}")
-    endpoint.stop()
+    # force checkpoint in pageserver to advance remote_consistent_lsn
+    wait_lsn_force_checkpoint(tenant_id, timeline_id, endpoint, env.pageserver)

    # and wait till remote_consistent_lsn propagates to all safekeepers
-    #
-    # TODO: this executes long as timeline on safekeeper is immediately
-    # deactivated once rcl reaches pageserver one, and thus we generally wait
-    # till pageserver reconnects to all safekeepers one by one here. Timeline
-    # status on safekeeper should take into account peers state as well.
    started_at = time.time()
    while True:
        stat_after = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
-        if all([s_after.remote_consistent_lsn >= new_rcl for s_after in stat_after]):
+        if all(
+            s_after.remote_consistent_lsn > s_before.remote_consistent_lsn
+            for s_after, s_before in zip(stat_after, stat_before)
+        ):
            break
        elapsed = time.time() - started_at
-        if elapsed > 30:
+        if elapsed > 20:
            raise RuntimeError(
                f"timed out waiting {elapsed:.0f}s for remote_consistent_lsn propagation: status before {stat_before}, status current {stat_after}"
            )
        time.sleep(1)

-    # Ensure that safekeepers don't lose remote_consistent_lsn on restart.
-    # Control file is persisted each 5s. TODO: do that on shutdown and remove sleep.
-    time.sleep(6)
-    for sk in env.safekeepers:
-        sk.stop()
-        sk.start()
-    stat_after_restart = [cli.timeline_status(tenant_id, timeline_id) for cli in clients]
-    log.info(f"statuses after {stat_after_restart}")
-    assert all([s.remote_consistent_lsn >= new_rcl for s in stat_after_restart])
-

 # Test that old WAL consumed by peers and pageserver is removed from safekeepers.
@pytest.mark.parametrize("auth_enabled", [False, True])