Merge pull request #4633 from neondatabase/releases/2023-07-05

Release 2023-07-05
Merge branch 'release' into releases/2023-07-05
2026-03-04 08:50:38 +00:00 · 2023-07-05 15:10:47 +02:00 · 2023-07-05 12:41:48 +02:00 · 2023-06-27 12:55:33 +02:00 · 2023-06-27 12:22:12 +02:00 · 2023-06-23 15:38:50 -04:00
36 changed files with 598 additions and 796 deletions
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -515,25 +515,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control

-#########################################################################################
-#
-# Layer "pg-embedding-pg-build"
-# compile pg_embedding extension
-#
-#########################################################################################
-FROM build-deps AS pg-embedding-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
-# There is no release tag yet
-RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
-    echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
-
 #########################################################################################
 #
 # Layer "pg-anon-pg-build"
@@ -690,7 +671,6 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -189,7 +189,7 @@ services:
      - "/bin/bash"
      - "-c"
    command:
-      - "until pg_isready -h compute -p 55433 -U cloud_admin ; do
+      - "until pg_isready -h compute -p 55433 ; do
            echo 'Waiting to start compute...' && sleep 1;
         done"
    depends_on:
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -48,7 +48,6 @@ Creating docker-compose_storage_broker_1       ... done
 2. connect compute node
 ```
 $ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
-$ chmod 600 ~/.pgpass
 $ psql -h localhost -p 55433 -U cloud_admin
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
--- a/libs/postgres_ffi/src/nonrelfile_utils.rs
+++ b/libs/postgres_ffi/src/nonrelfile_utils.rs
@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
 // Multixact utils

 pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
-    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
-        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
-        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
+    ((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
+        % pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
+        * pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
 }

 pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -234,18 +234,14 @@ pub async fn collect_metrics_iteration(
        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
        let tenant_synthetic_size = tenant.get_cached_synthetic_size();
-
-        if tenant_synthetic_size != 0 {
-            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push((
-                PageserverConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: None,
-                    metric: SYNTHETIC_STORAGE_SIZE,
-                },
-                tenant_synthetic_size,
-            ));
-        }
+        current_metrics.push((
+            PageserverConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: SYNTHETIC_STORAGE_SIZE,
+            },
+            tenant_synthetic_size,
+        ));
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -110,6 +110,7 @@ pub fn launch_disk_usage_global_eviction_task(

            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
                .await;
+            info!("disk usage based eviction task finishing");
            Ok(())
        },
    );
@@ -125,16 +126,13 @@ async fn disk_usage_eviction_task(
    tenants_dir: &Path,
    cancel: CancellationToken,
 ) {
-    scopeguard::defer! {
-        info!("disk usage based eviction task finishing");
-    };
-
    use crate::tenant::tasks::random_init_delay;
    {
        if random_init_delay(task_config.period, &cancel)
            .await
            .is_err()
        {
+            info!("shutting down");
            return;
        }
    }
@@ -169,6 +167,7 @@ async fn disk_usage_eviction_task(
        tokio::select! {
            _ = tokio::time::sleep_until(sleep_until) => {},
            _ = cancel.cancelled() => {
+                info!("shutting down");
                break
            }
        }
@@ -315,7 +314,7 @@ pub async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
            partition,
            candidate.layer.get_tenant_id(),
            candidate.layer.get_timeline_id(),
-            candidate.layer,
+            candidate.layer.filename().file_name(),
        );
    }

--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -1,9 +1,9 @@
 use metrics::metric_vec_duration::DurationResultObserver;
 use metrics::{
    register_counter_vec, register_histogram, register_histogram_vec, register_int_counter,
-    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge,
-    register_uint_gauge_vec, Counter, CounterVec, Histogram, HistogramVec, IntCounter,
-    IntCounterVec, IntGauge, IntGaugeVec, UIntGauge, UIntGaugeVec,
+    register_int_counter_vec, register_int_gauge, register_int_gauge_vec, register_uint_gauge_vec,
+    Counter, CounterVec, Histogram, HistogramVec, IntCounter, IntCounterVec, IntGauge, IntGaugeVec,
+    UIntGauge, UIntGaugeVec,
 };
 use once_cell::sync::Lazy;
 use pageserver_api::models::TenantState;
@@ -203,49 +203,6 @@ pub static PAGE_CACHE: Lazy<PageCacheMetrics> = Lazy::new(|| PageCacheMetrics {
    },
 });

-pub struct PageCacheSizeMetrics {
-    pub max_bytes: UIntGauge,
-
-    pub current_bytes_ephemeral: UIntGauge,
-    pub current_bytes_immutable: UIntGauge,
-    pub current_bytes_materialized_page: UIntGauge,
-}
-
-static PAGE_CACHE_SIZE_CURRENT_BYTES: Lazy<UIntGaugeVec> = Lazy::new(|| {
-    register_uint_gauge_vec!(
-        "pageserver_page_cache_size_current_bytes",
-        "Current size of the page cache in bytes, by key kind",
-        &["key_kind"]
-    )
-    .expect("failed to define a metric")
-});
-
-pub static PAGE_CACHE_SIZE: Lazy<PageCacheSizeMetrics> = Lazy::new(|| PageCacheSizeMetrics {
-    max_bytes: {
-        register_uint_gauge!(
-            "pageserver_page_cache_size_max_bytes",
-            "Maximum size of the page cache in bytes"
-        )
-        .expect("failed to define a metric")
-    },
-
-    current_bytes_ephemeral: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["ephemeral"])
-            .unwrap()
-    },
-    current_bytes_immutable: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["immutable"])
-            .unwrap()
-    },
-    current_bytes_materialized_page: {
-        PAGE_CACHE_SIZE_CURRENT_BYTES
-            .get_metric_with_label_values(&["materialized_page"])
-            .unwrap()
-    },
-});
-
 static WAIT_LSN_TIME: Lazy<HistogramVec> = Lazy::new(|| {
    register_histogram_vec!(
        "pageserver_wait_lsn_seconds",
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -53,8 +53,8 @@ use utils::{
    lsn::Lsn,
 };

+use crate::repository::Key;
 use crate::tenant::writeback_ephemeral_file;
-use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
 const TEST_PAGE_CACHE_SIZE: usize = 50;
@@ -187,8 +187,6 @@ pub struct PageCache {
    /// Index of the next candidate to evict, for the Clock replacement algorithm.
    /// This is interpreted modulo the page cache size.
    next_evict_slot: AtomicUsize,
-
-    size_metrics: &'static PageCacheSizeMetrics,
 }

 ///
@@ -720,9 +718,6 @@ impl PageCache {

                    if let Ok(version_idx) = versions.binary_search_by_key(old_lsn, |v| v.lsn) {
                        versions.remove(version_idx);
-                        self.size_metrics
-                            .current_bytes_materialized_page
-                            .sub_page_sz(1);
                        if versions.is_empty() {
                            old_entry.remove_entry();
                        }
@@ -735,13 +730,11 @@ impl PageCache {
                let mut map = self.ephemeral_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
                    .expect("could not find old key in mapping");
-                self.size_metrics.current_bytes_immutable.sub_page_sz(1);
            }
        }
    }
@@ -769,9 +762,6 @@ impl PageCache {
                                slot_idx,
                            },
                        );
-                        self.size_metrics
-                            .current_bytes_materialized_page
-                            .add_page_sz(1);
                        None
                    }
                }
@@ -782,7 +772,6 @@ impl PageCache {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
                        None
                    }
                }
@@ -793,7 +782,6 @@ impl PageCache {
                    Entry::Occupied(entry) => Some(*entry.get()),
                    Entry::Vacant(entry) => {
                        entry.insert(slot_idx);
-                        self.size_metrics.current_bytes_immutable.add_page_sz(1);
                        None
                    }
                }
@@ -893,12 +881,6 @@ impl PageCache {

        let page_buffer = Box::leak(vec![0u8; num_pages * PAGE_SZ].into_boxed_slice());

-        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
-        size_metrics.max_bytes.set_page_sz(num_pages);
-        size_metrics.current_bytes_ephemeral.set_page_sz(0);
-        size_metrics.current_bytes_immutable.set_page_sz(0);
-        size_metrics.current_bytes_materialized_page.set_page_sz(0);
-
        let slots = page_buffer
            .chunks_exact_mut(PAGE_SZ)
            .map(|chunk| {
@@ -921,30 +903,6 @@ impl PageCache {
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
-            size_metrics,
        }
    }
 }
-
-trait PageSzBytesMetric {
-    fn set_page_sz(&self, count: usize);
-    fn add_page_sz(&self, count: usize);
-    fn sub_page_sz(&self, count: usize);
-}
-
-#[inline(always)]
-fn count_times_page_sz(count: usize) -> u64 {
-    u64::try_from(count).unwrap() * u64::try_from(PAGE_SZ).unwrap()
-}
-
-impl PageSzBytesMetric for metrics::UIntGauge {
-    fn set_page_sz(&self, count: usize) {
-        self.set(count_times_page_sz(count));
-    }
-    fn add_page_sz(&self, count: usize) {
-        self.add(count_times_page_sz(count));
-    }
-    fn sub_page_sz(&self, count: usize) {
-        self.sub(count_times_page_sz(count));
-    }
-}
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -11,7 +11,7 @@
 //! parent timeline, and the last LSN that has been written to disk.
 //!

-use anyhow::{bail, Context};
+use anyhow::{bail, ensure, Context};
 use futures::FutureExt;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
@@ -49,8 +49,6 @@ use std::time::{Duration, Instant};
 use self::config::TenantConf;
 use self::metadata::TimelineMetadata;
 use self::remote_timeline_client::RemoteTimelineClient;
-use self::timeline::uninit::TimelineUninitMark;
-use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -70,7 +68,6 @@ use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
 use crate::InitializationOrder;

-use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
 use crate::walredo::WalRedoManager;
@@ -90,7 +87,6 @@ pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
 pub mod manifest;
-mod span;

 pub mod metadata;
 mod par_fsync;
@@ -106,7 +102,7 @@ mod timeline;

 pub mod size;

-pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
+pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 pub use timeline::{
    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
 };
@@ -165,6 +161,200 @@ pub struct Tenant {
    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
 }

+/// A timeline with some of its files on disk, being initialized.
+/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
+/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
+/// to be removed on next restart.
+///
+/// The caller is responsible for proper timeline data filling before the final init.
+#[must_use]
+pub struct UninitializedTimeline<'t> {
+    owning_tenant: &'t Tenant,
+    timeline_id: TimelineId,
+    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
+}
+
+/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
+/// or gets removed eventually.
+///
+/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
+#[must_use]
+struct TimelineUninitMark {
+    uninit_mark_deleted: bool,
+    uninit_mark_path: PathBuf,
+    timeline_path: PathBuf,
+}
+
+impl UninitializedTimeline<'_> {
+    /// Finish timeline creation: insert it into the Tenant's timelines map and remove the
+    /// uninit mark file.
+    ///
+    /// This function launches the flush loop if not already done.
+    ///
+    /// The caller is responsible for activating the timeline (function `.activate()`).
+    fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
+        let timeline_id = self.timeline_id;
+        let tenant_id = self.owning_tenant.tenant_id;
+
+        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
+            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
+        })?;
+
+        // Check that the caller initialized disk_consistent_lsn
+        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
+        ensure!(
+            new_disk_consistent_lsn.is_valid(),
+            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
+        );
+
+        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
+        match timelines.entry(timeline_id) {
+            Entry::Occupied(_) => anyhow::bail!(
+                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
+            ),
+            Entry::Vacant(v) => {
+                uninit_mark.remove_uninit_mark().with_context(|| {
+                    format!(
+                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
+                    )
+                })?;
+                v.insert(Arc::clone(&new_timeline));
+
+                new_timeline.maybe_spawn_flush_loop();
+            }
+        }
+
+        Ok(new_timeline)
+    }
+
+    /// Prepares timeline data by loading it from the basebackup archive.
+    pub async fn import_basebackup_from_tar(
+        self,
+        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
+        base_lsn: Lsn,
+        broker_client: storage_broker::BrokerClientChannel,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let raw_timeline = self.raw_timeline()?;
+
+        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
+            .await
+            .context("Failed to import basebackup")?;
+
+        // Flush the new layer files to disk, before we make the timeline as available to
+        // the outside world.
+        //
+        // Flush loop needs to be spawned in order to be able to flush.
+        raw_timeline.maybe_spawn_flush_loop();
+
+        fail::fail_point!("before-checkpoint-new-timeline", |_| {
+            bail!("failpoint before-checkpoint-new-timeline");
+        });
+
+        raw_timeline
+            .freeze_and_flush()
+            .await
+            .context("Failed to flush after basebackup import")?;
+
+        // All the data has been imported. Insert the Timeline into the tenant's timelines
+        // map and remove the uninit mark file.
+        let tl = self.finish_creation()?;
+        tl.activate(broker_client, None, ctx);
+        Ok(tl)
+    }
+
+    fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
+        Ok(&self
+            .raw_timeline
+            .as_ref()
+            .with_context(|| {
+                format!(
+                    "No raw timeline {}/{} found",
+                    self.owning_tenant.tenant_id, self.timeline_id
+                )
+            })?
+            .0)
+    }
+}
+
+impl Drop for UninitializedTimeline<'_> {
+    fn drop(&mut self) {
+        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
+            let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
+            error!("Timeline got dropped without initializing, cleaning its files");
+            cleanup_timeline_directory(uninit_mark);
+        }
+    }
+}
+
+fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
+    let timeline_path = &uninit_mark.timeline_path;
+    match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
+        Ok(()) => {
+            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
+        }
+        Err(e) => {
+            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
+        }
+    }
+    drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
+}
+
+impl TimelineUninitMark {
+    fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
+        Self {
+            uninit_mark_deleted: false,
+            uninit_mark_path,
+            timeline_path,
+        }
+    }
+
+    fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
+        if !self.uninit_mark_deleted {
+            self.delete_mark_file_if_present()?;
+        }
+
+        Ok(())
+    }
+
+    fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
+        let uninit_mark_file = &self.uninit_mark_path;
+        let uninit_mark_parent = uninit_mark_file
+            .parent()
+            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
+        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
+            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
+        })?;
+        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
+        self.uninit_mark_deleted = true;
+
+        Ok(())
+    }
+}
+
+impl Drop for TimelineUninitMark {
+    fn drop(&mut self) {
+        if !self.uninit_mark_deleted {
+            if self.timeline_path.exists() {
+                error!(
+                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
+                    self.uninit_mark_path.display(),
+                    self.timeline_path.display()
+                )
+            } else {
+                // unblock later timeline creation attempts
+                warn!(
+                    "Removing intermediate uninit mark file {}",
+                    self.uninit_mark_path.display()
+                );
+                if let Err(e) = self.delete_mark_file_if_present() {
+                    error!("Failed to remove the uninit mark file: {e}")
+                }
+            }
+        }
+    }
+}
+
 // We should not blindly overwrite local metadata with remote one.
 // For example, consider the following case:
 //     Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that
@@ -505,7 +695,7 @@ impl Tenant {
    /// No background tasks are started as part of this routine.
    ///
    async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
        if !tokio::fs::try_exists(&marker_file)
@@ -643,7 +833,7 @@ impl Tenant {
        remote_client: RemoteTimelineClient,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        info!("downloading index file for timeline {}", timeline_id);
        tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
@@ -722,7 +912,7 @@ impl Tenant {
        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
            Ok(conf) => conf,
@@ -908,7 +1098,7 @@ impl Tenant {
        init_order: Option<&InitializationOrder>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        debug!("loading tenant task");

@@ -954,7 +1144,7 @@ impl Tenant {
        init_order: Option<&InitializationOrder>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
            RemoteTimelineClient::new(
@@ -1545,7 +1735,7 @@ impl Tenant {
        timeline_id: TimelineId,
        _ctx: &RequestContext,
    ) -> Result<(), DeleteTimelineError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        timeline::debug_assert_current_span_has_tenant_and_timeline_id();

        // Transition the timeline into TimelineState::Stopping.
        // This should prevent new operations from starting.
@@ -1709,7 +1899,7 @@ impl Tenant {
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
    ) {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        let mut activating = false;
        self.state.send_modify(|current_state| {
@@ -1780,7 +1970,7 @@ impl Tenant {
    ///
    /// This will attempt to shutdown even if tenant is broken.
    pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();
        // Set tenant (and its timlines) to Stoppping state.
        //
        // Since we can only transition into Stopping state after activation is complete,
@@ -2822,11 +3012,11 @@ impl Tenant {

        debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}");

-        Ok(UninitializedTimeline::new(
-            self,
-            new_timeline_id,
-            Some((timeline_struct, uninit_mark)),
-        ))
+        Ok(UninitializedTimeline {
+            owning_tenant: self,
+            timeline_id: new_timeline_id,
+            raw_timeline: Some((timeline_struct, uninit_mark)),
+        })
    }

    fn create_timeline_files(
@@ -4381,3 +4571,28 @@ mod tests {
        Ok(())
    }
 }
+
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {}
+
+#[cfg(debug_assertions)]
+pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
+    utils::tracing_span_assert::MultiNameExtractor<2>,
+> = once_cell::sync::Lazy::new(|| {
+    utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
+});
+
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {
+    use utils::tracing_span_assert;
+
+    match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) {
+        Ok(()) => (),
+        Err(missing) => panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        ),
+    }
+}
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -608,7 +608,10 @@ impl RemoteTimelineClient {
        self.calls_unfinished_metric_begin(&op);
        upload_queue.queued_operations.push_back(op);

-        info!("scheduled layer file upload {layer_file_name}");
+        info!(
+            "scheduled layer file upload {}",
+            layer_file_name.file_name()
+        );

        // Launch the task immediately, if possible
        self.launch_queued_tasks(upload_queue);
@@ -661,7 +664,7 @@ impl RemoteTimelineClient {
                });
                self.calls_unfinished_metric_begin(&op);
                upload_queue.queued_operations.push_back(op);
-                info!("scheduled layer file deletion {name}");
+                info!("scheduled layer file deletion {}", name.file_name());
            }

            // Launch the tasks immediately, if possible
@@ -825,7 +828,7 @@ impl RemoteTimelineClient {
                    .queued_operations
                    .push_back(op);

-                info!("scheduled layer file deletion {name}");
+                info!("scheduled layer file deletion {}", name.file_name());
                deletions_queued += 1;
            }

--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -16,7 +16,7 @@ use tracing::{info, warn};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
-use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
--- a/pageserver/src/tenant/span.rs
+++ b/pageserver/src/tenant/span.rs
@@ -1,20 +0,0 @@
-#[cfg(debug_assertions)]
-use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
-
-#[cfg(not(debug_assertions))]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {}
-
-#[cfg(debug_assertions)]
-pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
-    once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]));
-
-#[cfg(debug_assertions)]
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
-        panic!(
-            "missing extractors: {:?}",
-            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        )
-    }
-}
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -335,7 +335,7 @@ impl LayerAccessStats {
 /// All layers should implement a minimal `std::fmt::Debug` without tenant or
 /// timeline names, because those are known in the context of which the layers
 /// are used in (timeline).
-pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
+pub trait Layer: std::fmt::Debug + Send + Sync {
    /// Range of keys that this layer covers
    fn get_key_range(&self) -> Range<Key>;

@@ -373,6 +373,9 @@ pub trait Layer: std::fmt::Debug + std::fmt::Display + Send + Sync {
        ctx: &RequestContext,
    ) -> Result<ValueReconstructResult>;

+    /// A short ID string that uniquely identifies the given layer within a [`LayerMap`].
+    fn short_id(&self) -> String;
+
    /// Dump summary of the contents of the layer to stdout
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()>;
 }
@@ -509,12 +512,10 @@ pub mod tests {
        fn is_incremental(&self) -> bool {
            self.layer_desc().is_incremental
        }
-    }

-    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-    impl std::fmt::Display for LayerDescriptor {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{}", self.layer_desc().short_id())
+        /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+        fn short_id(&self) -> String {
+            self.layer_desc().short_id()
        }
    }

--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -222,14 +222,13 @@ impl Layer for DeltaLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
+            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.file_size,
+            self.desc.lsn_range.end
        );

        if !verbose {
@@ -395,11 +394,10 @@ impl Layer for DeltaLayer {
    fn is_incremental(&self) -> bool {
        self.layer_desc().is_incremental
    }
-}
-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for DeltaLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
+
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
    }
 }

--- a/pageserver/src/tenant/storage_layer/filename.rs
+++ b/pageserver/src/tenant/storage_layer/filename.rs
@@ -210,15 +210,9 @@ pub enum LayerFileName {

 impl LayerFileName {
    pub fn file_name(&self) -> String {
-        self.to_string()
-    }
-}
-
-impl fmt::Display for LayerFileName {
-    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self {
-            Self::Image(fname) => write!(f, "{fname}"),
-            Self::Delta(fname) => write!(f, "{fname}"),
+            Self::Image(fname) => fname.to_string(),
+            Self::Delta(fname) => fname.to_string(),
        }
    }
 }
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -153,14 +153,12 @@ impl Layer for ImageLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+            "----- image layer for ten {} tli {} key {}-{} at {} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
-            self.lsn,
-            self.desc.is_incremental,
-            self.desc.file_size
+            self.lsn
        );

        if !verbose {
@@ -232,12 +230,10 @@ impl Layer for ImageLayer {
    fn is_incremental(&self) -> bool {
        self.layer_desc().is_incremental
    }
-}

-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for ImageLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
    }
 }

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -131,6 +131,13 @@ impl Layer for InMemoryLayer {
        true
    }

+    fn short_id(&self) -> String {
+        let inner = self.inner.read().unwrap();
+
+        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
+        format!("inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
+    }
+
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().unwrap();
@@ -233,15 +240,6 @@ impl Layer for InMemoryLayer {
    }
 }

-impl std::fmt::Display for InMemoryLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let inner = self.inner.read().unwrap();
-
-        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
-        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
-    }
-}
-
 impl InMemoryLayer {
    ///
    /// Get layer size on the disk
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,5 +1,4 @@
 use anyhow::Result;
-use core::fmt::Display;
 use std::ops::Range;
 use utils::{
    id::{TenantId, TimelineId},
@@ -49,8 +48,8 @@ impl PersistentLayerDesc {
        }
    }

-    pub fn short_id(&self) -> impl Display {
-        self.filename()
+    pub fn short_id(&self) -> String {
+        self.filename().file_name()
    }

    #[cfg(test)]
@@ -174,16 +173,13 @@ impl PersistentLayerDesc {

    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
+            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
            self.tenant_id,
            self.timeline_id,
            self.key_range.start,
            self.key_range.end,
            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta,
-            self.is_incremental,
-            self.file_size,
+            self.lsn_range.end
        );

        Ok(())
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -71,22 +71,22 @@ impl Layer for RemoteLayer {
        _reconstruct_state: &mut ValueReconstructState,
        _ctx: &RequestContext,
    ) -> Result<ValueReconstructResult> {
-        bail!("layer {self} needs to be downloaded");
+        bail!(
+            "layer {} needs to be downloaded",
+            self.filename().file_name()
+        );
    }

    /// debugging function to print out the contents of the layer
    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
+            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.is_delta,
-            self.desc.is_incremental,
-            self.desc.file_size,
+            self.desc.lsn_range.end
        );

        Ok(())
@@ -106,12 +106,10 @@ impl Layer for RemoteLayer {
    fn is_incremental(&self) -> bool {
        self.layer_desc().is_incremental
    }
-}

-/// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
-impl std::fmt::Display for RemoteLayer {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        write!(f, "{}", self.layer_desc().short_id())
+    /// Boilerplate to implement the Layer trait, always use layer_desc for persistent layers.
+    fn short_id(&self) -> String {
+        self.layer_desc().short_id()
    }
 }

--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,9 +1,6 @@
 //!

 mod eviction_task;
-mod logical_size;
-pub mod span;
-pub mod uninit;
 mod walreceiver;

 use anyhow::{anyhow, bail, ensure, Context, Result};
@@ -11,6 +8,7 @@ use bytes::Bytes;
 use fail::fail_point;
 use futures::StreamExt;
 use itertools::Itertools;
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
    DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus,
@@ -19,7 +17,7 @@ use pageserver_api::models::{
 use remote_storage::GenericRemoteStorage;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
-use tokio::sync::{oneshot, watch, TryAcquireError};
+use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TenantTimelineId;
@@ -30,7 +28,7 @@ use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
 use std::pin::pin;
-use std::sync::atomic::Ordering as AtomicOrdering;
+use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

@@ -40,7 +38,6 @@ use crate::tenant::storage_layer::{
    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
    LayerAccessStats, LayerFileName, RemoteLayer,
 };
-use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    ephemeral_file::is_ephemeral_file,
    layer_map::{LayerMap, SearchResult},
@@ -82,7 +79,6 @@ use crate::{is_temporary, task_mgr};

 pub(super) use self::eviction_task::EvictionTaskTenantState;
 use self::eviction_task::EvictionTaskTimelineState;
-use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::config::TenantConf;
@@ -132,7 +128,7 @@ impl LayerFileManager {
        // A layer's descriptor is present in the LayerMap => the LayerFileManager contains a layer for the descriptor.
        self.0
            .get(&desc.key())
-            .with_context(|| format!("get layer from desc: {}", desc.filename()))
+            .with_context(|| format!("get layer from desc: {}", desc.filename().file_name()))
            .expect("not found")
            .clone()
    }
@@ -369,6 +365,126 @@ pub struct Timeline {
    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
 }

+/// Internal structure to hold all data needed for logical size calculation.
+///
+/// Calculation consists of two stages:
+///
+/// 1. Initial size calculation. That might take a long time, because it requires
+/// reading all layers containing relation sizes at `initial_part_end`.
+///
+/// 2. Collecting an incremental part and adding that to the initial size.
+/// Increments are appended on walreceiver writing new timeline data,
+/// which result in increase or decrease of the logical size.
+struct LogicalSize {
+    /// Size, potentially slow to compute. Calculating this might require reading multiple
+    /// layers, and even ancestor's layers.
+    ///
+    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
+    /// the initial size at a different LSN.
+    initial_logical_size: OnceCell<u64>,
+
+    /// Semaphore to track ongoing calculation of `initial_logical_size`.
+    initial_size_computation: Arc<tokio::sync::Semaphore>,
+
+    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
+    initial_part_end: Option<Lsn>,
+
+    /// All other size changes after startup, combined together.
+    ///
+    /// Size shouldn't ever be negative, but this is signed for two reasons:
+    ///
+    /// 1. If we initialized the "baseline" size lazily, while we already
+    /// process incoming WAL, the incoming WAL records could decrement the
+    /// variable and temporarily make it negative. (This is just future-proofing;
+    /// the initialization is currently not done lazily.)
+    ///
+    /// 2. If there is a bug and we e.g. forget to increment it in some cases
+    /// when size grows, but remember to decrement it when it shrinks again, the
+    /// variable could go negative. In that case, it seems better to at least
+    /// try to keep tracking it, rather than clamp or overflow it. Note that
+    /// get_current_logical_size() will clamp the returned value to zero if it's
+    /// negative, and log an error. Could set it permanently to zero or some
+    /// special value to indicate "broken" instead, but this will do for now.
+    ///
+    /// Note that we also expose a copy of this value as a prometheus metric,
+    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
+    /// to modify this, it will also keep the prometheus metric in sync.
+    size_added_after_initial: AtomicI64,
+}
+
+/// Normalized current size, that the data in pageserver occupies.
+#[derive(Debug, Clone, Copy)]
+enum CurrentLogicalSize {
+    /// The size is not yet calculated to the end, this is an intermediate result,
+    /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
+    /// yet total logical size cannot be below 0.
+    Approximate(u64),
+    // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
+    // available for observation without any calculations.
+    Exact(u64),
+}
+
+impl CurrentLogicalSize {
+    fn size(&self) -> u64 {
+        *match self {
+            Self::Approximate(size) => size,
+            Self::Exact(size) => size,
+        }
+    }
+}
+
+impl LogicalSize {
+    fn empty_initial() -> Self {
+        Self {
+            initial_logical_size: OnceCell::with_value(0),
+            //  initial_logical_size already computed, so, don't admit any calculations
+            initial_size_computation: Arc::new(Semaphore::new(0)),
+            initial_part_end: None,
+            size_added_after_initial: AtomicI64::new(0),
+        }
+    }
+
+    fn deferred_initial(compute_to: Lsn) -> Self {
+        Self {
+            initial_logical_size: OnceCell::new(),
+            initial_size_computation: Arc::new(Semaphore::new(1)),
+            initial_part_end: Some(compute_to),
+            size_added_after_initial: AtomicI64::new(0),
+        }
+    }
+
+    fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
+        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
+        //                  ^^^ keep this type explicit so that the casts in this function break if
+        //                  we change the type.
+        match self.initial_logical_size.get() {
+            Some(initial_size) => {
+                initial_size.checked_add_signed(size_increment)
+                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
+                    .map(CurrentLogicalSize::Exact)
+            }
+            None => {
+                let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
+                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
+            }
+        }
+    }
+
+    fn increment_size(&self, delta: i64) {
+        self.size_added_after_initial
+            .fetch_add(delta, AtomicOrdering::SeqCst);
+    }
+
+    /// Make the value computed by initial logical size computation
+    /// available for re-use. This doesn't contain the incremental part.
+    fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
+        match self.initial_part_end {
+            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
+            _ => None,
+        }
+    }
+}
+
 pub struct WalReceiverInfo {
    pub wal_source_connconf: PgConnectionConfig,
    pub last_received_msg_lsn: Lsn,
@@ -1265,9 +1381,9 @@ impl Timeline {
                        .read()
                        .unwrap()
                        .observe(delta);
-                    info!(layer=%local_layer, residence_millis=delta.as_millis(), "evicted layer after known residence period");
+                    info!(layer=%local_layer.short_id(), residence_millis=delta.as_millis(), "evicted layer after known residence period");
                } else {
-                    info!(layer=%local_layer, "evicted layer after unknown residence period");
+                    info!(layer=%local_layer.short_id(), "evicted layer after unknown residence period");
                }

                true
@@ -2123,7 +2239,7 @@ impl Timeline {
        ctx: &RequestContext,
        cancel: CancellationToken,
    ) -> Result<u64, CalculateLogicalSizeError> {
-        span::debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();

        let mut timeline_state_updates = self.subscribe_for_state_updates();
        let self_calculation = Arc::clone(self);
@@ -2346,7 +2462,11 @@ impl TraversalLayerExt for Arc<dyn PersistentLayer> {
                format!("{}", local_path.display())
            }
            None => {
-                format!("remote {}/{self}", self.get_timeline_id())
+                format!(
+                    "remote {}/{}",
+                    self.get_timeline_id(),
+                    self.filename().file_name()
+                )
            }
        }
    }
@@ -2354,7 +2474,11 @@ impl TraversalLayerExt for Arc<dyn PersistentLayer> {

 impl TraversalLayerExt for Arc<InMemoryLayer> {
    fn traversal_id(&self) -> TraversalId {
-        format!("timeline {} in-memory {self}", self.get_timeline_id())
+        format!(
+            "timeline {} in-memory {}",
+            self.get_timeline_id(),
+            self.short_id()
+        )
    }
 }

@@ -2874,7 +2998,7 @@ impl Timeline {
    }

    /// Flush one frozen in-memory layer to disk, as a new delta layer.
-    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer))]
+    #[instrument(skip_all, fields(tenant_id=%self.tenant_id, timeline_id=%self.timeline_id, layer=%frozen_layer.short_id()))]
    async fn flush_frozen_layer(
        self: &Arc<Self>,
        frozen_layer: Arc<InMemoryLayer>,
@@ -3553,7 +3677,7 @@ impl Timeline {
        let remotes = deltas_to_compact
            .iter()
            .filter(|l| l.is_remote_layer())
-            .inspect(|l| info!("compact requires download of {l}"))
+            .inspect(|l| info!("compact requires download of {}", l.filename().file_name()))
            .map(|l| {
                l.clone()
                    .downcast_remote_layer()
@@ -3577,7 +3701,7 @@ impl Timeline {
        );

        for l in deltas_to_compact.iter() {
-            info!("compact includes {l}");
+            info!("compact includes {}", l.filename().file_name());
        }

        // We don't need the original list of layers anymore. Drop it so that
@@ -4192,8 +4316,8 @@ impl Timeline {
            if l.get_lsn_range().end > horizon_cutoff {
                debug!(
                    "keeping {} because it's newer than horizon_cutoff {}",
-                    l.filename(),
-                    horizon_cutoff,
+                    l.filename().file_name(),
+                    horizon_cutoff
                );
                result.layers_needed_by_cutoff += 1;
                continue 'outer;
@@ -4203,8 +4327,8 @@ impl Timeline {
            if l.get_lsn_range().end > pitr_cutoff {
                debug!(
                    "keeping {} because it's newer than pitr_cutoff {}",
-                    l.filename(),
-                    pitr_cutoff,
+                    l.filename().file_name(),
+                    pitr_cutoff
                );
                result.layers_needed_by_pitr += 1;
                continue 'outer;
@@ -4222,7 +4346,7 @@ impl Timeline {
                if &l.get_lsn_range().start <= retain_lsn {
                    debug!(
                        "keeping {} because it's still might be referenced by child branch forked at {} is_dropped: xx is_incremental: {}",
-                        l.filename(),
+                        l.filename().file_name(),
                        retain_lsn,
                        l.is_incremental(),
                    );
@@ -4253,7 +4377,10 @@ impl Timeline {
            if !layers
                .image_layer_exists(&l.get_key_range(), &(l.get_lsn_range().end..new_gc_cutoff))?
            {
-                debug!("keeping {} because it is the latest layer", l.filename());
+                debug!(
+                    "keeping {} because it is the latest layer",
+                    l.filename().file_name()
+                );
                // Collect delta key ranges that need image layers to allow garbage
                // collecting the layers.
                // It is not so obvious whether we need to propagate information only about
@@ -4270,7 +4397,7 @@ impl Timeline {
            // We didn't find any reason to keep this file, so remove it.
            debug!(
                "garbage collecting {} is_dropped: xx is_incremental: {}",
-                l.filename(),
+                l.filename().file_name(),
                l.is_incremental(),
            );
            layers_to_remove.push(Arc::clone(&l));
@@ -4424,12 +4551,12 @@ impl Timeline {
    /// If the caller has a deadline or needs a timeout, they can simply stop polling:
    /// we're **cancellation-safe** because the download happens in a separate task_mgr task.
    /// So, the current download attempt will run to completion even if we stop polling.
-    #[instrument(skip_all, fields(layer=%remote_layer))]
+    #[instrument(skip_all, fields(layer=%remote_layer.short_id()))]
    pub async fn download_remote_layer(
        &self,
        remote_layer: Arc<RemoteLayer>,
    ) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();

        use std::sync::atomic::Ordering::Relaxed;

@@ -4462,7 +4589,7 @@ impl Timeline {
            TaskKind::RemoteDownloadTask,
            Some(self.tenant_id),
            Some(self.timeline_id),
-            &format!("download layer {}", remote_layer),
+            &format!("download layer {}", remote_layer.short_id()),
            false,
            async move {
                let remote_client = self_clone.remote_client.as_ref().unwrap();
@@ -4738,12 +4865,15 @@ impl Timeline {
                continue;
            }

-            let last_activity_ts = l.access_stats().latest_activity().unwrap_or_else(|| {
-                // We only use this fallback if there's an implementation error.
-                // `latest_activity` already does rate-limited warn!() log.
-                debug!(layer=%l, "last_activity returns None, using SystemTime::now");
-                SystemTime::now()
-            });
+            let last_activity_ts = l
+                .access_stats()
+                .latest_activity()
+                .unwrap_or_else(|| {
+                    // We only use this fallback if there's an implementation error.
+                    // `latest_activity` already does rate-limited warn!() log.
+                    debug!(layer=%l.filename().file_name(), "last_activity returns None, using SystemTime::now");
+                    SystemTime::now()
+                });

            resident_layers.push(LocalLayerInfoForDiskUsageEviction {
                layer: l,
@@ -4863,6 +4993,33 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
    bail!("couldn't find an unused backup number for {:?}", path)
 }

+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
+
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
+    use utils::tracing_span_assert;
+
+    pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
+        tracing_span_assert::MultiNameExtractor<2>,
+    > = once_cell::sync::Lazy::new(|| {
+        tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
+    });
+
+    match tracing_span_assert::check_fields_present([
+        &*super::TENANT_ID_EXTRACTOR,
+        &*TIMELINE_ID_EXTRACTOR,
+    ]) {
+        Ok(()) => (),
+        Err(missing) => panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        ),
+    }
+}
+
 /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
 ///
 /// Returns `true` if the two `Arc` point to the same layer, false otherwise.
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -70,6 +70,7 @@ impl Timeline {
                };

                self_clone.eviction_task(cancel).await;
+                info!("eviction task finishing");
                Ok(())
            },
        );
@@ -77,9 +78,6 @@ impl Timeline {

    #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
    async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
-        scopeguard::defer! {
-            info!("eviction task finishing");
-        }
        use crate::tenant::tasks::random_init_delay;
        {
            let policy = self.get_eviction_policy();
@@ -88,6 +86,7 @@ impl Timeline {
                EvictionPolicy::NoEviction => Duration::from_secs(10),
            };
            if random_init_delay(period, &cancel).await.is_err() {
+                info!("shutting down");
                return;
            }
        }
@@ -102,6 +101,7 @@ impl Timeline {
                ControlFlow::Continue(sleep_until) => {
                    tokio::select! {
                        _ = cancel.cancelled() => {
+                            info!("shutting down");
                            break;
                        }
                        _ = tokio::time::sleep_until(sleep_until) => { }
@@ -209,7 +209,7 @@ impl Timeline {
                let last_activity_ts = hist_layer.access_stats().latest_activity().unwrap_or_else(|| {
                    // We only use this fallback if there's an implementation error.
                    // `latest_activity` already does rate-limited warn!() log.
-                    debug!(layer=%hist_layer, "last_activity returns None, using SystemTime::now");
+                    debug!(layer=%hist_layer.filename().file_name(), "last_activity returns None, using SystemTime::now");
                    SystemTime::now()
                });

--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -1,128 +0,0 @@
-use anyhow::Context;
-use once_cell::sync::OnceCell;
-
-use tokio::sync::Semaphore;
-use utils::lsn::Lsn;
-
-use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
-use std::sync::Arc;
-
-/// Internal structure to hold all data needed for logical size calculation.
-///
-/// Calculation consists of two stages:
-///
-/// 1. Initial size calculation. That might take a long time, because it requires
-/// reading all layers containing relation sizes at `initial_part_end`.
-///
-/// 2. Collecting an incremental part and adding that to the initial size.
-/// Increments are appended on walreceiver writing new timeline data,
-/// which result in increase or decrease of the logical size.
-pub(super) struct LogicalSize {
-    /// Size, potentially slow to compute. Calculating this might require reading multiple
-    /// layers, and even ancestor's layers.
-    ///
-    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
-    /// the initial size at a different LSN.
-    pub initial_logical_size: OnceCell<u64>,
-
-    /// Semaphore to track ongoing calculation of `initial_logical_size`.
-    pub initial_size_computation: Arc<tokio::sync::Semaphore>,
-
-    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
-    pub initial_part_end: Option<Lsn>,
-
-    /// All other size changes after startup, combined together.
-    ///
-    /// Size shouldn't ever be negative, but this is signed for two reasons:
-    ///
-    /// 1. If we initialized the "baseline" size lazily, while we already
-    /// process incoming WAL, the incoming WAL records could decrement the
-    /// variable and temporarily make it negative. (This is just future-proofing;
-    /// the initialization is currently not done lazily.)
-    ///
-    /// 2. If there is a bug and we e.g. forget to increment it in some cases
-    /// when size grows, but remember to decrement it when it shrinks again, the
-    /// variable could go negative. In that case, it seems better to at least
-    /// try to keep tracking it, rather than clamp or overflow it. Note that
-    /// get_current_logical_size() will clamp the returned value to zero if it's
-    /// negative, and log an error. Could set it permanently to zero or some
-    /// special value to indicate "broken" instead, but this will do for now.
-    ///
-    /// Note that we also expose a copy of this value as a prometheus metric,
-    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
-    /// to modify this, it will also keep the prometheus metric in sync.
-    pub size_added_after_initial: AtomicI64,
-}
-
-/// Normalized current size, that the data in pageserver occupies.
-#[derive(Debug, Clone, Copy)]
-pub(super) enum CurrentLogicalSize {
-    /// The size is not yet calculated to the end, this is an intermediate result,
-    /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
-    /// yet total logical size cannot be below 0.
-    Approximate(u64),
-    // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
-    // available for observation without any calculations.
-    Exact(u64),
-}
-
-impl CurrentLogicalSize {
-    pub(super) fn size(&self) -> u64 {
-        *match self {
-            Self::Approximate(size) => size,
-            Self::Exact(size) => size,
-        }
-    }
-}
-
-impl LogicalSize {
-    pub(super) fn empty_initial() -> Self {
-        Self {
-            initial_logical_size: OnceCell::with_value(0),
-            //  initial_logical_size already computed, so, don't admit any calculations
-            initial_size_computation: Arc::new(Semaphore::new(0)),
-            initial_part_end: None,
-            size_added_after_initial: AtomicI64::new(0),
-        }
-    }
-
-    pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
-        Self {
-            initial_logical_size: OnceCell::new(),
-            initial_size_computation: Arc::new(Semaphore::new(1)),
-            initial_part_end: Some(compute_to),
-            size_added_after_initial: AtomicI64::new(0),
-        }
-    }
-
-    pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
-        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
-        //                  ^^^ keep this type explicit so that the casts in this function break if
-        //                  we change the type.
-        match self.initial_logical_size.get() {
-            Some(initial_size) => {
-                initial_size.checked_add_signed(size_increment)
-                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .map(CurrentLogicalSize::Exact)
-            }
-            None => {
-                let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
-            }
-        }
-    }
-
-    pub(super) fn increment_size(&self, delta: i64) {
-        self.size_added_after_initial
-            .fetch_add(delta, AtomicOrdering::SeqCst);
-    }
-
-    /// Make the value computed by initial logical size computation
-    /// available for re-use. This doesn't contain the incremental part.
-    pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
-        match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
-            _ => None,
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/span.rs
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -1,25 +0,0 @@
-#[cfg(debug_assertions)]
-use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};
-
-#[cfg(not(debug_assertions))]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
-
-#[cfg(debug_assertions)]
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
-    static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
-        once_cell::sync::Lazy::new(|| {
-            MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
-        });
-
-    let fields: [&dyn Extractor; 2] = [
-        &*crate::tenant::span::TENANT_ID_EXTRACTOR,
-        &*TIMELINE_ID_EXTRACTOR,
-    ];
-    if let Err(missing) = check_fields_present(fields) {
-        panic!(
-            "missing extractors: {:?}",
-            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        )
-    }
-}
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -1,219 +0,0 @@
-use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
-
-use anyhow::Context;
-use tracing::{error, info, info_span, warn};
-use utils::{crashsafe, id::TimelineId, lsn::Lsn};
-
-use crate::{
-    context::RequestContext,
-    import_datadir,
-    tenant::{ignore_absent_files, Tenant},
-};
-
-use super::Timeline;
-
-/// A timeline with some of its files on disk, being initialized.
-/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
-/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
-/// to be removed on next restart.
-///
-/// The caller is responsible for proper timeline data filling before the final init.
-#[must_use]
-pub struct UninitializedTimeline<'t> {
-    pub(crate) owning_tenant: &'t Tenant,
-    timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
-}
-
-impl<'t> UninitializedTimeline<'t> {
-    pub(crate) fn new(
-        owning_tenant: &'t Tenant,
-        timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
-    ) -> Self {
-        Self {
-            owning_tenant,
-            timeline_id,
-            raw_timeline,
-        }
-    }
-
-    /// Finish timeline creation: insert it into the Tenant's timelines map and remove the
-    /// uninit mark file.
-    ///
-    /// This function launches the flush loop if not already done.
-    ///
-    /// The caller is responsible for activating the timeline (function `.activate()`).
-    pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
-        let timeline_id = self.timeline_id;
-        let tenant_id = self.owning_tenant.tenant_id;
-
-        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
-            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
-        })?;
-
-        // Check that the caller initialized disk_consistent_lsn
-        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
-        anyhow::ensure!(
-            new_disk_consistent_lsn.is_valid(),
-            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
-        );
-
-        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
-        match timelines.entry(timeline_id) {
-            Entry::Occupied(_) => anyhow::bail!(
-                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
-            ),
-            Entry::Vacant(v) => {
-                uninit_mark.remove_uninit_mark().with_context(|| {
-                    format!(
-                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
-                    )
-                })?;
-                v.insert(Arc::clone(&new_timeline));
-
-                new_timeline.maybe_spawn_flush_loop();
-            }
-        }
-
-        Ok(new_timeline)
-    }
-
-    /// Prepares timeline data by loading it from the basebackup archive.
-    pub(crate) async fn import_basebackup_from_tar(
-        self,
-        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
-        base_lsn: Lsn,
-        broker_client: storage_broker::BrokerClientChannel,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<Timeline>> {
-        let raw_timeline = self.raw_timeline()?;
-
-        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
-            .await
-            .context("Failed to import basebackup")?;
-
-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        raw_timeline.maybe_spawn_flush_loop();
-
-        fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            anyhow::bail!("failpoint before-checkpoint-new-timeline");
-        });
-
-        raw_timeline
-            .freeze_and_flush()
-            .await
-            .context("Failed to flush after basebackup import")?;
-
-        // All the data has been imported. Insert the Timeline into the tenant's timelines
-        // map and remove the uninit mark file.
-        let tl = self.finish_creation()?;
-        tl.activate(broker_client, None, ctx);
-        Ok(tl)
-    }
-
-    pub(crate) fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
-        Ok(&self
-            .raw_timeline
-            .as_ref()
-            .with_context(|| {
-                format!(
-                    "No raw timeline {}/{} found",
-                    self.owning_tenant.tenant_id, self.timeline_id
-                )
-            })?
-            .0)
-    }
-}
-
-impl Drop for UninitializedTimeline<'_> {
-    fn drop(&mut self) {
-        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
-            let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
-            error!("Timeline got dropped without initializing, cleaning its files");
-            cleanup_timeline_directory(uninit_mark);
-        }
-    }
-}
-
-pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
-    let timeline_path = &uninit_mark.timeline_path;
-    match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
-        Ok(()) => {
-            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
-        }
-        Err(e) => {
-            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
-        }
-    }
-    drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
-}
-
-/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
-/// or gets removed eventually.
-///
-/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
-#[must_use]
-pub(crate) struct TimelineUninitMark {
-    uninit_mark_deleted: bool,
-    uninit_mark_path: PathBuf,
-    pub(crate) timeline_path: PathBuf,
-}
-
-impl TimelineUninitMark {
-    pub(crate) fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
-        Self {
-            uninit_mark_deleted: false,
-            uninit_mark_path,
-            timeline_path,
-        }
-    }
-
-    fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
-        if !self.uninit_mark_deleted {
-            self.delete_mark_file_if_present()?;
-        }
-
-        Ok(())
-    }
-
-    fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
-        let uninit_mark_file = &self.uninit_mark_path;
-        let uninit_mark_parent = uninit_mark_file
-            .parent()
-            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
-            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
-        })?;
-        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
-        self.uninit_mark_deleted = true;
-
-        Ok(())
-    }
-}
-
-impl Drop for TimelineUninitMark {
-    fn drop(&mut self) {
-        if !self.uninit_mark_deleted {
-            if self.timeline_path.exists() {
-                error!(
-                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
-                    self.uninit_mark_path.display(),
-                    self.timeline_path.display()
-                )
-            } else {
-                // unblock later timeline creation attempts
-                warn!(
-                    "Removing intermediate uninit mark file {}",
-                    self.uninit_mark_path.display()
-                );
-                if let Err(e) = self.delete_mark_file_if_present() {
-                    error!("Failed to remove the uninit mark file: {e}")
-                }
-            }
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/walreceiver_connection.rs
@@ -71,8 +71,6 @@ pub(super) async fn handle_walreceiver_connection(
    ctx: RequestContext,
    node: NodeId,
 ) -> anyhow::Result<()> {
-    debug_assert_current_span_has_tenant_and_timeline_id();
-
    WALRECEIVER_STARTED_CONNECTIONS.inc();

    // Connect to the database in replication mode.
@@ -142,9 +140,6 @@ pub(super) async fn handle_walreceiver_connection(
            }
            Ok(())
        }
-        // Enrich the log lines emitted by this closure with meaningful context.
-        // TODO: technically, this task outlives the surrounding function, so, the
-        // spans won't be properly nested.
        .instrument(tracing::info_span!("poller")),
    );

--- a/pgxn/hnsw/hnsw.c
+++ b/pgxn/hnsw/hnsw.c
@@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested)
 	struct sysinfo si;
 	Size total;
 	if (sysinfo(&si) < 0)
-		elog(ERROR, "Failed to get amount of RAM: %m");
+		elog(ERROR, "Failed to get amount of RAM: %n");

 	total = si.totalram*si.mem_unit;
 	if ((Size)NBuffers*BLCKSZ + requested >= total)
@@ -580,7 +580,6 @@ l2_distance(PG_FUNCTION_ARGS)
 				 errmsg("different array dimensions %d and %d", a_dim, b_dim)));
 	}

-	#pragma clang loop vectorize(enable)
 	for (int i = 0; i < a_dim; i++)
 	{
 		diff = ax[i] - bx[i];
--- a/pgxn/hnsw/hnswalg.cpp
+++ b/pgxn/hnsw/hnswalg.cpp
@@ -223,7 +223,6 @@ dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
 {
    dist_t 	distance = 0.0;

-    #pragma clang loop vectorize(enable)
    for (size_t i = 0; i < n; i++)
    {
        dist_t diff = x[i] - y[i];
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -34,6 +34,7 @@

 #define PageStoreTrace DEBUG5

+#define MAX_RECONNECT_ATTEMPTS 5
 #define RECONNECT_INTERVAL_USEC 1000000

 bool		connected = false;
@@ -54,15 +55,13 @@ int32		max_cluster_size;
 char	   *page_server_connstring;
 char	   *neon_auth_token;

-int			readahead_buffer_size = 128;
+int			n_unflushed_requests = 0;
 int			flush_every_n_requests = 8;
-
-int			n_reconnect_attempts = 0;
-int			max_reconnect_attempts = 60;
+int			readahead_buffer_size = 128;

 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

-static bool pageserver_flush(void);
+static void pageserver_flush(void);

 static bool
 pageserver_connect(int elevel)
@@ -233,17 +232,16 @@ pageserver_disconnect(void)
 	}
 }

-static bool
+static void
 pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;
+	int n_reconnect_attempts = 0;

 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
-	{
-		neon_log(LOG, "pageserver_send disconnect bad connection");
 		pageserver_disconnect();
-	}
+

 	req_buff = nm_pack_request(request);

@@ -254,36 +252,53 @@ pageserver_send(NeonRequest * request)
 	 * See https://github.com/neondatabase/neon/issues/1138
 	 * So try to reestablish connection in case of failure.
 	 */
-	if (!connected)
+	while (true)
 	{
-		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		if (!connected)
 		{
-			n_reconnect_attempts += 1;
-			pg_usleep(RECONNECT_INTERVAL_USEC);
+			if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? LOG : ERROR))
+			{
+				n_reconnect_attempts += 1;
+				pg_usleep(RECONNECT_INTERVAL_USEC);
+				continue;
+			}
 		}
-		n_reconnect_attempts = 0;
-	}

-	/*
-	 * Send request.
-	 *
-	 * In principle, this could block if the output buffer is full, and we
-	 * should use async mode and check for interrupts while waiting. In
-	 * practice, our requests are small enough to always fit in the output and
-	 * TCP buffer.
-	 */
-	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
-	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-		pageserver_disconnect();
-		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
-		pfree(msg);
-		pfree(req_buff.data);
-		return false;
+		/*
+		 * Send request.
+		 *
+		 * In principle, this could block if the output buffer is full, and we
+		 * should use async mode and check for interrupts while waiting. In
+		 * practice, our requests are small enough to always fit in the output and
+		 * TCP buffer.
+		 */
+		if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
+		{
+			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+			if (n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS)
+			{
+				neon_log(LOG, "failed to send page request (try to reconnect): %s", msg);
+				if (n_reconnect_attempts != 0) /* do not sleep before first reconnect attempt, assuming that pageserver is already restarted */
+					pg_usleep(RECONNECT_INTERVAL_USEC);
+				n_reconnect_attempts += 1;
+				continue;
+			}
+			else
+			{
+				pageserver_disconnect();
+				neon_log(ERROR, "failed to send page request: %s", msg);
+			}
+		}
+		break;
 	}

 	pfree(req_buff.data);

+	n_unflushed_requests++;
+
+	if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests)
+		pageserver_flush();
+
 	if (message_level_is_interesting(PageStoreTrace))
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);
@@ -291,7 +306,6 @@ pageserver_send(NeonRequest * request)
 		neon_log(PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
-	return true;
 }

 static NeonResponse *
@@ -326,25 +340,16 @@ pageserver_receive(void)
 		}
 		else if (rc == -1)
 		{
-			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
 			pageserver_disconnect();
 			resp = NULL;
 		}
 		else if (rc == -2)
-		{
-			char* msg = pchomp(PQerrorMessage(pageserver_conn));
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
-		}
+			neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		else
-		{
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
-		}
+			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 	}
 	PG_CATCH();
 	{
-		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
 		pageserver_disconnect();
 		PG_RE_THROW();
 	}
@@ -354,25 +359,21 @@ pageserver_receive(void)
 }


-static bool
+static void
 pageserver_flush(void)
 {
 	if (!connected)
 	{
 		neon_log(WARNING, "Tried to flush while disconnected");
 	}
-	else
+	else if (PQflush(pageserver_conn))
 	{
-		if (PQflush(pageserver_conn))
-		{
-			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-			pageserver_disconnect();
-			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
-			pfree(msg);
-			return false;
-		}
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
+		pageserver_disconnect();
+		neon_log(ERROR, "failed to flush page requests: %s", msg);
 	}
-	return true;
+	n_unflushed_requests = 0;
 }

 page_server_api api = {
@@ -438,14 +439,6 @@ pg_init_libpagestore(void)
 							PGC_USERSET,
 							0,	/* no flags required */
 							NULL, NULL, NULL);
-	DefineCustomIntVariable("neon.max_reconnect_attempts",
-							"Maximal attempts to reconnect to pages server (with 1 second timeout)",
-							NULL,
-							&max_reconnect_attempts,
-							10, 0, INT_MAX,
-							PGC_USERSET,
-							0,
-							NULL, NULL, NULL);
 	DefineCustomIntVariable("neon.readahead_buffer_size",
 							"number of prefetches to buffer",
 							"This buffer is used to hold and manage prefetched "
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -145,9 +145,9 @@ extern char *nm_to_string(NeonMessage * msg);

 typedef struct
 {
-	bool		(*send) (NeonRequest * request);
+	void		(*send) (NeonRequest * request);
 	NeonResponse *(*receive) (void);
-	bool		(*flush) (void);
+	void		(*flush) (void);
 }			page_server_api;

 extern void prefetch_on_ps_disconnect(void);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -489,8 +489,7 @@ prefetch_wait_for(uint64 ring_index)
 	if (MyPState->ring_flush <= ring_index &&
 		MyPState->ring_unused > MyPState->ring_flush)
 	{
-		if (!page_server->flush())
-			return false;
+		page_server->flush();
 		MyPState->ring_flush = MyPState->ring_unused;
 	}

@@ -667,7 +666,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 		 * smaller than the current WAL insert/redo pointer, which is already
 		 * larger than this prefetch_lsn. So in any case, that would
 		 * invalidate this cache.
-		 *
+		 * 
 		 * The best LSN to use for effective_request_lsn would be
 		 * XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
 		 */
@@ -678,8 +677,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force

 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
-
-	while (!page_server->send((NeonRequest *) &request));
+	page_server->send((NeonRequest *) &request);

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
@@ -689,7 +687,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	/* update slot state */
 	slot->status = PRFS_REQUESTED;

-
 	prfh_insert(MyPState->prf_hash, slot, &found);
 	Assert(!found);
 }
@@ -746,7 +743,6 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 				}
-
 			}
 			/* if we don't want the latest version, only accept requests with the exact same LSN */
 			else
@@ -760,23 +756,20 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 			}
 		}

-		if (entry != NULL)
+		/*
+		 * We received a prefetch for a page that was recently read and
+		 * removed from the buffers. Remove that request from the buffers.
+		 */
+		else if (slot->status == PRFS_TAG_REMAINS)
 		{
-			/*
-			 * We received a prefetch for a page that was recently read and
-			 * removed from the buffers. Remove that request from the buffers.
-			 */
-			if (slot->status == PRFS_TAG_REMAINS)
-			{
-				prefetch_set_unused(ring_index);
-				entry = NULL;
-			}
-			else
-			{
-				/* The buffered request is good enough, return that index */
-				pgBufferUsage.prefetch.duplicates++;
-				return ring_index;
-			}
+			prefetch_set_unused(ring_index);
+			entry = NULL;
+		}
+		else
+		{
+			/* The buffered request is good enough, return that index */
+			pgBufferUsage.prefetch.duplicates++;
+			return ring_index;
 		}
 	}

@@ -866,7 +859,8 @@ page_server_request(void const *req)
 {
 	NeonResponse* resp;
 	do {
-		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
+		page_server->send((NeonRequest *) req);
+		page_server->flush();
 		MyPState->ring_flush = MyPState->ring_unused;
 		consume_prefetch_responses();
 		resp = page_server->receive();
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -191,12 +191,6 @@ impl Storage for FileStorage {
                control_partial_path.display()
            )
        })?;
-        control_partial.flush().await.with_context(|| {
-            format!(
-                "failed to flush safekeeper state into control file at: {}",
-                control_partial_path.display()
-            )
-        })?;

        // fsync the file
        if !self.conf.no_sync {
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -188,7 +188,6 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
        let mut response = client.get(&http_url).send().await?;
        while let Some(chunk) = response.chunk().await? {
            file.write_all(&chunk).await?;
-            file.flush().await?;
        }
    }

--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -403,18 +403,16 @@ impl SafekeeperPostgresHandler {
        };

        // take the latest commit_lsn if don't have stop_pos
-        let end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
+        let mut end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());

        if end_pos < start_pos {
-            warn!(
-                "requested start_pos {} is ahead of available WAL end_pos {}",
-                start_pos, end_pos
-            );
+            warn!("start_pos {} is ahead of end_pos {}", start_pos, end_pos);
+            end_pos = start_pos;
        }

        info!(
-            "starting streaming from {:?} till {:?}, available WAL ends at {}",
-            start_pos, stop_pos, end_pos
+            "starting streaming from {:?} till {:?}",
+            start_pos, stop_pos
        );

        // switch to copy
@@ -549,14 +547,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
            self.end_pos = *self.commit_lsn_watch_rx.borrow();
            if self.end_pos > self.start_pos {
                // We have something to send.
-                trace!("got end_pos {:?}, streaming", self.end_pos);
                return Ok(());
            }

            // Wait for WAL to appear, now self.end_pos == self.start_pos.
            if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
                self.end_pos = lsn;
-                trace!("got end_pos {:?}, streaming", self.end_pos);
                return Ok(());
            }

--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -248,10 +248,6 @@ impl PhysicalStorage {
        };

        file.write_all(buf).await?;
-        // Note: flush just ensures write above reaches the OS (this is not
-        // needed in case of sync IO as Write::write there calls directly write
-        // syscall, but needed in case of async). It does *not* fsyncs the file.
-        file.flush().await?;

        if xlogoff + buf.len() == self.wal_seg_size {
            // If we reached the end of a WAL segment, flush and close it.
@@ -720,7 +716,6 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
        count -= XLOG_BLCKSZ;
    }
    file.write_all(&ZERO_BLOCK[0..count]).await?;
-    file.flush().await?;
    Ok(())
 }

--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -61,8 +61,6 @@ PAGESERVER_GLOBAL_METRICS: Tuple[str, ...] = (
    "pageserver_materialized_cache_hits_direct_total",
    "pageserver_page_cache_read_hits_total",
    "pageserver_page_cache_read_accesses_total",
-    "pageserver_page_cache_size_current_bytes",
-    "pageserver_page_cache_size_max_bytes",
    "pageserver_getpage_reconstruct_seconds_bucket",
    "pageserver_getpage_reconstruct_seconds_count",
    "pageserver_getpage_reconstruct_seconds_sum",
--- a/test_runner/regress/test_multixact.py
+++ b/test_runner/regress/test_multixact.py
@@ -1,7 +1,3 @@
-import random
-import threading
-from threading import Thread
-
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
 from fixtures.utils import query_scalar
@@ -19,17 +15,11 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
    endpoint = env.endpoints.create_start("test_multixact")

    log.info("postgres is running on 'test_multixact' branch")
-
-    n_records = 100
-    n_threads =   5
-    n_iters =  1000
-    n_restarts = 10
-
    cur = endpoint.connect().cursor()
    cur.execute(
-        f"""
-        CREATE TABLE t1(pk int primary key, val integer);
-        INSERT INTO t1 values (generate_series(1, {n_records}), 0);
+        """
+        CREATE TABLE t1(i int primary key);
+        INSERT INTO t1 select * from generate_series(1, 100);
    """
    )

@@ -38,32 +28,26 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
    )

    # Lock entries using parallel connections in a round-robin fashion.
-    def do_updates():
+    nclients = 20
+    connections = []
+    for i in range(nclients):
+        # Do not turn on autocommit. We want to hold the key-share locks.
        conn = endpoint.connect(autocommit=False)
-        for i in range(n_iters):
-            pk = random.randrange(1, n_records)
-            conn.cursor().execute(f"update t1 set val=val+1 where pk={pk}")
-            conn.cursor().execute("select * from t1 for key share")
-            conn.commit()
-        conn.close()
+        connections.append(conn)

-    for iter in range(n_restarts):
-        threads: List[threading.Thread] = []
-        for i in range(n_threads):
-            threads.append(threading.Thread(target=do_updates, args=(), daemon=False))
-            threads[-1].start()
+    # On each iteration, we commit the previous transaction on a connection,
+    # and issue antoher select. Each SELECT generates a new multixact that
+    # includes the new XID, and the XIDs of all the other parallel transactions.
+    # This generates enough traffic on both multixact offsets and members SLRUs
+    # to cross page boundaries.
+    for i in range(5000):
+        conn = connections[i % nclients]
+        conn.commit()
+        conn.cursor().execute("select * from t1 for key share")

-        for thread in threads:
-            thread.join()
-
-        # Restart endpoint
-        endpoint.stop()
-        endpoint.start()
-
-        conn = endpoint.connect()
-        cur = conn.cursor()
-        cur.execute("select count(*) from t1")
-        assert cur.fetchone() == (n_records,)
+    # We have multixacts now. We can close the connections.
+    for c in connections:
+        c.close()

    # force wal flush
    cur.execute("checkpoint")
@@ -90,3 +74,6 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):

    # Check that we restored pg_controlfile correctly
    assert next_multixact_id_new == next_multixact_id
+
+    # Check that we can restore the content of the datadir correctly
+    check_restored_datadir_content(test_output_dir, env, endpoint)