Compare commits

...

14 Commits

Author SHA1 Message Date
Konstantin Knizhnik
10774a5132 Modify test_multixact to reproduce the problem with incorrect offset calculation 2023-07-21 22:16:42 +03:00
Konstantin Knizhnik
bb18c9958a Fix bug in mx_offset_to_flags_offset 2023-07-21 18:02:33 +03:00
Konstantin Knizhnik
e764fbf9f9 Fix error message 2023-07-07 21:59:20 +03:00
Konstantin Knizhnik
d37f7a0dd2 Vectorize distance calculation on macOS 2023-07-07 09:42:44 +03:00
Dmitry Rodionov
20137d9588 Polish tracing helpers (#4651)
Context: comments here: https://github.com/neondatabase/neon/pull/4645
2023-07-06 19:49:14 +03:00
Arseny Sher
634be4f4e0 Fix async write in safekeepers.
The general semantics of Rust's Write trait (as well as its async counterpart) is that a write
is only guaranteed to have happened after Write::flush(). This wasn't needed in the sync case, where
write calls the syscall directly, but it is required in async.

Also fix setting the initial end_pos in walsender; sometimes it was from the future.

fixes https://github.com/neondatabase/neon/issues/4518
2023-07-06 19:56:28 +04:00
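
The fix hinges on the Write/AsyncWrite contract mentioned above. A minimal sync sketch with std::io::BufWriter (the output file name is hypothetical): with a raw File the write call issues the syscall directly, which is why sync code got away without flushing, but any buffered or async writer only guarantees the bytes are out after flush().

```rust
use std::fs::File;
use std::io::{BufWriter, Write};

fn main() -> std::io::Result<()> {
    // Hypothetical output file, purely for illustration.
    let mut w = BufWriter::new(File::create("out.bin")?);
    w.write_all(b"payload")?; // may only land in the in-process buffer
    w.flush()?; // the Write contract guarantees the data is written only now
    Ok(())
}
```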
Alex Chi Z
d340cf3721 dump more info in layer map (#4567)
A simple commit extracted from
https://github.com/neondatabase/neon/pull/4539

This PR adds more info for layer dumps (is_delta, is_incremental, size).

---------

Signed-off-by: Alex Chi Z <chi@neon.tech>
2023-07-06 18:21:45 +03:00
Konstantin Knizhnik
1741edf933 Pageserver reconnect v2 (#4519)
## Problem

Compute is not always able to reconnect to the page server.
First of all, this is caused by the long restart time of the pageserver,
so the number of reconnect attempts is increased from 5 (hardcoded) to 60 (GUC).
Also, we do not perform a flush after each command, to increase performance
(this is especially critical for prefetch).
Unfortunately, such a pending flush makes it impossible to transparently
reconnect to a restarted pageserver.
What we can do is try to minimize the probability of that happening.
Most likely, a broken connection will be detected by the first send command
after some idle period.
This is why the max_flush_delay parameter is added, which forces a flush to be
performed on the first request after some idle period.

See #4497

## Summary of changes

Add neon.max_reconnect_attempts and neon.max_flush_delay GUCs, which
control when a flush has to be done
and when it is possible to try to reconnect to the page server.
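
Sketched below (in Rust, for illustration; the actual extension is C) is the retry policy described above. try_connect is a hypothetical stand-in for the libpq-based reconnect, and the 1-second interval mirrors RECONNECT_INTERVAL_USEC from the diff.

```rust
use std::{thread, time::Duration};

// Hypothetical stand-in for the libpq-based pageserver_connect().
fn try_connect() -> bool {
    false // pretend the pageserver is still restarting
}

// Retry up to `max_reconnect_attempts` (the new GUC), sleeping 1s between
// attempts, then give up with an error, as pageserver_send() does.
fn connect_with_retries(max_reconnect_attempts: u32) -> Result<(), &'static str> {
    for attempt in 1..=max_reconnect_attempts {
        if try_connect() {
            return Ok(());
        }
        eprintln!("reconnect attempt {attempt} failed, retrying in 1s");
        thread::sleep(Duration::from_secs(1));
    }
    Err("failed to reconnect to page server")
}

fn main() {
    if let Err(e) = connect_with_retries(3) {
        eprintln!("{e}");
    }
}
```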


2023-07-06 12:47:43 +03:00
Joonas Koivunen
269e20aeab fix: filter out zero synthetic sizes (#4639)
Apparently, sending synthetic_size == 0 causes problems, so filter out
zeros instead of sending them.

Slack discussion:
https://neondb.slack.com/archives/C03F5SM1N02/p1688574285718989?thread_ts=1687874910.681049&cid=C03F5SM1N02
2023-07-06 12:32:34 +03:00
Tomoka Hayashi
91435006bd Fix docker-compose file and document (#4621)
## Problem

- Running the commands according to docker.md gives a warning and an error.
- Warning `permissions should be u=rw (0600) or less` is output when
executing `psql -h localhost -p 55433 -U cloud_admin`.
- `FATAL: password authentication failed for user "root"` is output in
compute logs.

## Summary of changes

- Add `$ chmod 600 ~/.pgpass` in docker.md to avoid warning.
- Add username (cloud_admin) to pg_isready command in docker-compose.yml
to avoid error.

---------

Co-authored-by: Tomoka Hayashi <tomoka.hayashi@ntt.com>
2023-07-06 10:11:24 +01:00
Dmitry Rodionov
b263510866 move some logical size bits to separate logical_size.rs 2023-07-06 11:58:41 +03:00
Dmitry Rodionov
e418fc6dc3 move some tracing related assertions to separate module for tenant and timeline 2023-07-06 11:58:41 +03:00
Dmitry Rodionov
434eaadbe3 Move uninitialized timeline from tenant.rs to timeline/uninit.rs 2023-07-06 11:58:41 +03:00
Alexander Bayandin
6fb7edf494 Compile pg_embedding extension (#4634)
```
CREATE EXTENSION embedding;
CREATE TABLE t (val real[]);
INSERT INTO t (val) VALUES ('{0,0,0}'), ('{1,2,3}'), ('{1,1,1}'), (NULL);
CREATE INDEX ON t USING hnsw (val) WITH (maxelements = 10, dims=3, m=3);
INSERT INTO t (val) VALUES (array[1,2,4]);

SELECT * FROM t ORDER BY val <-> array[3,3,3];
   val   
---------
 {1,2,3}
 {1,2,4}
 {1,1,1}
 {0,0,0}
 
(5 rows)
```
2023-07-05 18:40:25 +01:00
26 changed files with 623 additions and 511 deletions

View File

@@ -515,6 +515,25 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control
#########################################################################################
#
# Layer "pg-embedding-pg-build"
# compile pg_embedding extension
#
#########################################################################################
FROM build-deps AS pg-embedding-pg-build
COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
ENV PATH "/usr/local/pgsql/bin/:$PATH"
# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
# There is no release tag yet
RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
make -j $(getconf _NPROCESSORS_ONLN) && \
make -j $(getconf _NPROCESSORS_ONLN) install && \
echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control
#########################################################################################
#
# Layer "pg-anon-pg-build"
@@ -671,6 +690,7 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
COPY pgxn/ pgxn/
RUN make -j $(getconf _NPROCESSORS_ONLN) \

View File

@@ -189,7 +189,7 @@ services:
- "/bin/bash"
- "-c"
command:
- "until pg_isready -h compute -p 55433 ; do
- "until pg_isready -h compute -p 55433 -U cloud_admin ; do
echo 'Waiting to start compute...' && sleep 1;
done"
depends_on:

View File

@@ -48,6 +48,7 @@ Creating docker-compose_storage_broker_1 ... done
2. connect compute node
```
$ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
$ chmod 600 ~/.pgpass
$ psql -h localhost -p 55433 -U cloud_admin
postgres=# CREATE TABLE t(key int primary key, value text);
CREATE TABLE

View File

@@ -57,9 +57,9 @@ pub fn slru_may_delete_clogsegment(segpage: u32, cutoff_page: u32) -> bool {
// Multixact utils
pub fn mx_offset_to_flags_offset(xid: MultiXactId) -> usize {
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE) as usize
((xid / pg_constants::MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
% pg_constants::MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
* pg_constants::MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
}
pub fn mx_offset_to_flags_bitshift(xid: MultiXactId) -> u16 {
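
The removed `as u16` cast is the bug: once `xid / MULTIXACT_MEMBERS_PER_MEMBERGROUP` exceeds u16::MAX, the intermediate value wraps and the flags offset is computed for the wrong member group. A self-contained sketch, assuming the stock Postgres constants for BLCKSZ = 8192 (which pg_constants mirrors):

```rust
// Assumed values, matching upstream Postgres multixact.c for BLCKSZ = 8192.
const MULTIXACT_MEMBERS_PER_MEMBERGROUP: u16 = 4;
const MULTIXACT_MEMBERGROUPS_PER_PAGE: u16 = 409; // 8192 / 20
const MULTIXACT_MEMBERGROUP_SIZE: u16 = 20;

// Old version: the u16 cast truncates the group index before the modulo.
fn flags_offset_old(xid: u32) -> usize {
    ((xid / MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32) as u16
        % MULTIXACT_MEMBERGROUPS_PER_PAGE
        * MULTIXACT_MEMBERGROUP_SIZE) as usize
}

// Fixed version: stay in u32 until the final cast.
fn flags_offset_new(xid: u32) -> usize {
    ((xid / MULTIXACT_MEMBERS_PER_MEMBERGROUP as u32)
        % MULTIXACT_MEMBERGROUPS_PER_PAGE as u32
        * MULTIXACT_MEMBERGROUP_SIZE as u32) as usize
}

fn main() {
    // 70_000_000 / 4 = 17_500_000 > u16::MAX, so the old cast wraps.
    let xid = 70_000_000u32;
    assert_ne!(flags_offset_old(xid), flags_offset_new(xid));
    println!("old = {}, new = {}", flags_offset_old(xid), flags_offset_new(xid));
}
```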

View File

@@ -234,14 +234,18 @@ pub async fn collect_metrics_iteration(
// Note that this metric is calculated in a separate bgworker
// Here we only use the cached value, which may lag behind the real latest one
let tenant_synthetic_size = tenant.get_cached_synthetic_size();
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: SYNTHETIC_STORAGE_SIZE,
},
tenant_synthetic_size,
));
if tenant_synthetic_size != 0 {
// only send non-zeroes because otherwise these show up as errors in logs
current_metrics.push((
PageserverConsumptionMetricsKey {
tenant_id,
timeline_id: None,
metric: SYNTHETIC_STORAGE_SIZE,
},
tenant_synthetic_size,
));
}
}
// Filter metrics, unless we want to send all metrics, including cached ones.

View File

@@ -11,7 +11,7 @@
//! parent timeline, and the last LSN that has been written to disk.
//!
use anyhow::{bail, ensure, Context};
use anyhow::{bail, Context};
use futures::FutureExt;
use pageserver_api::models::TimelineState;
use remote_storage::DownloadError;
@@ -49,6 +49,8 @@ use std::time::{Duration, Instant};
use self::config::TenantConf;
use self::metadata::TimelineMetadata;
use self::remote_timeline_client::RemoteTimelineClient;
use self::timeline::uninit::TimelineUninitMark;
use self::timeline::uninit::UninitializedTimeline;
use self::timeline::EvictionTaskTenantState;
use crate::config::PageServerConf;
use crate::context::{DownloadBehavior, RequestContext};
@@ -68,6 +70,7 @@ use crate::tenant::storage_layer::ImageLayer;
use crate::tenant::storage_layer::Layer;
use crate::InitializationOrder;
use crate::tenant::timeline::uninit::cleanup_timeline_directory;
use crate::virtual_file::VirtualFile;
use crate::walredo::PostgresRedoManager;
use crate::walredo::WalRedoManager;
@@ -87,6 +90,7 @@ pub mod disk_btree;
pub(crate) mod ephemeral_file;
pub mod layer_map;
pub mod manifest;
mod span;
pub mod metadata;
mod par_fsync;
@@ -102,7 +106,7 @@ mod timeline;
pub mod size;
pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
pub use timeline::{
LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
};
@@ -161,200 +165,6 @@ pub struct Tenant {
eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
}
/// A timeline with some of its files on disk, being initialized.
/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
/// to be removed on next restart.
///
/// The caller is responsible for proper timeline data filling before the final init.
#[must_use]
pub struct UninitializedTimeline<'t> {
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
}
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
/// or gets removed eventually.
///
/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
#[must_use]
struct TimelineUninitMark {
uninit_mark_deleted: bool,
uninit_mark_path: PathBuf,
timeline_path: PathBuf,
}
impl UninitializedTimeline<'_> {
/// Finish timeline creation: insert it into the Tenant's timelines map and remove the
/// uninit mark file.
///
/// This function launches the flush loop if not already done.
///
/// The caller is responsible for activating the timeline (function `.activate()`).
fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
let timeline_id = self.timeline_id;
let tenant_id = self.owning_tenant.tenant_id;
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
})?;
// Check that the caller initialized disk_consistent_lsn
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
ensure!(
new_disk_consistent_lsn.is_valid(),
"new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
);
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
match timelines.entry(timeline_id) {
Entry::Occupied(_) => anyhow::bail!(
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
),
Entry::Vacant(v) => {
uninit_mark.remove_uninit_mark().with_context(|| {
format!(
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
)
})?;
v.insert(Arc::clone(&new_timeline));
new_timeline.maybe_spawn_flush_loop();
}
}
Ok(new_timeline)
}
/// Prepares timeline data by loading it from the basebackup archive.
pub async fn import_basebackup_from_tar(
self,
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
base_lsn: Lsn,
broker_client: storage_broker::BrokerClientChannel,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let raw_timeline = self.raw_timeline()?;
import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
.await
.context("Failed to import basebackup")?;
// Flush the new layer files to disk, before we make the timeline available to
// the outside world.
//
// Flush loop needs to be spawned in order to be able to flush.
raw_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
bail!("failpoint before-checkpoint-new-timeline");
});
raw_timeline
.freeze_and_flush()
.await
.context("Failed to flush after basebackup import")?;
// All the data has been imported. Insert the Timeline into the tenant's timelines
// map and remove the uninit mark file.
let tl = self.finish_creation()?;
tl.activate(broker_client, None, ctx);
Ok(tl)
}
fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
Ok(&self
.raw_timeline
.as_ref()
.with_context(|| {
format!(
"No raw timeline {}/{} found",
self.owning_tenant.tenant_id, self.timeline_id
)
})?
.0)
}
}
impl Drop for UninitializedTimeline<'_> {
fn drop(&mut self) {
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
error!("Timeline got dropped without initializing, cleaning its files");
cleanup_timeline_directory(uninit_mark);
}
}
}
fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
let timeline_path = &uninit_mark.timeline_path;
match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
Ok(()) => {
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
}
Err(e) => {
error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
}
}
drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
}
impl TimelineUninitMark {
fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
Self {
uninit_mark_deleted: false,
uninit_mark_path,
timeline_path,
}
}
fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
if !self.uninit_mark_deleted {
self.delete_mark_file_if_present()?;
}
Ok(())
}
fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
let uninit_mark_file = &self.uninit_mark_path;
let uninit_mark_parent = uninit_mark_file
.parent()
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
})?;
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
self.uninit_mark_deleted = true;
Ok(())
}
}
impl Drop for TimelineUninitMark {
fn drop(&mut self) {
if !self.uninit_mark_deleted {
if self.timeline_path.exists() {
error!(
"Uninit mark {} is not removed, timeline {} stays uninitialized",
self.uninit_mark_path.display(),
self.timeline_path.display()
)
} else {
// unblock later timeline creation attempts
warn!(
"Removing intermediate uninit mark file {}",
self.uninit_mark_path.display()
);
if let Err(e) = self.delete_mark_file_if_present() {
error!("Failed to remove the uninit mark file: {e}")
}
}
}
}
}
// We should not blindly overwrite local metadata with remote one.
// For example, consider the following case:
// Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that
@@ -695,7 +505,7 @@ impl Tenant {
/// No background tasks are started as part of this routine.
///
async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_id();
span::debug_assert_current_span_has_tenant_id();
let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
if !tokio::fs::try_exists(&marker_file)
@@ -833,7 +643,7 @@ impl Tenant {
remote_client: RemoteTimelineClient,
ctx: &RequestContext,
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_id();
span::debug_assert_current_span_has_tenant_id();
info!("downloading index file for timeline {}", timeline_id);
tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
@@ -912,7 +722,7 @@ impl Tenant {
init_order: Option<InitializationOrder>,
ctx: &RequestContext,
) -> Arc<Tenant> {
debug_assert_current_span_has_tenant_id();
span::debug_assert_current_span_has_tenant_id();
let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
Ok(conf) => conf,
@@ -1098,7 +908,7 @@ impl Tenant {
init_order: Option<&InitializationOrder>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_id();
span::debug_assert_current_span_has_tenant_id();
debug!("loading tenant task");
@@ -1144,7 +954,7 @@ impl Tenant {
init_order: Option<&InitializationOrder>,
ctx: &RequestContext,
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_id();
span::debug_assert_current_span_has_tenant_id();
let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
RemoteTimelineClient::new(
@@ -1735,7 +1545,7 @@ impl Tenant {
timeline_id: TimelineId,
_ctx: &RequestContext,
) -> Result<(), DeleteTimelineError> {
timeline::debug_assert_current_span_has_tenant_and_timeline_id();
debug_assert_current_span_has_tenant_and_timeline_id();
// Transition the timeline into TimelineState::Stopping.
// This should prevent new operations from starting.
@@ -1899,7 +1709,7 @@ impl Tenant {
background_jobs_can_start: Option<&completion::Barrier>,
ctx: &RequestContext,
) {
debug_assert_current_span_has_tenant_id();
span::debug_assert_current_span_has_tenant_id();
let mut activating = false;
self.state.send_modify(|current_state| {
@@ -1970,7 +1780,7 @@ impl Tenant {
///
/// This will attempt to shutdown even if tenant is broken.
pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
debug_assert_current_span_has_tenant_id();
span::debug_assert_current_span_has_tenant_id();
// Set tenant (and its timelines) to Stopping state.
//
// Since we can only transition into Stopping state after activation is complete,
@@ -3012,11 +2822,11 @@ impl Tenant {
debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}");
Ok(UninitializedTimeline {
owning_tenant: self,
timeline_id: new_timeline_id,
raw_timeline: Some((timeline_struct, uninit_mark)),
})
Ok(UninitializedTimeline::new(
self,
new_timeline_id,
Some((timeline_struct, uninit_mark)),
))
}
fn create_timeline_files(
@@ -4571,28 +4381,3 @@ mod tests {
Ok(())
}
}
#[cfg(not(debug_assertions))]
#[inline]
pub(crate) fn debug_assert_current_span_has_tenant_id() {}
#[cfg(debug_assertions)]
pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
utils::tracing_span_assert::MultiNameExtractor<2>,
> = once_cell::sync::Lazy::new(|| {
utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
});
#[cfg(debug_assertions)]
#[inline]
pub(crate) fn debug_assert_current_span_has_tenant_id() {
use utils::tracing_span_assert;
match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) {
Ok(()) => (),
Err(missing) => panic!(
"missing extractors: {:?}",
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
),
}
}

View File

@@ -16,7 +16,7 @@ use tracing::{info, warn};
use crate::config::PageServerConf;
use crate::tenant::storage_layer::LayerFileName;
use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
use remote_storage::{DownloadError, GenericRemoteStorage};
use utils::crashsafe::path_with_suffix_extension;

View File

@@ -0,0 +1,20 @@
#[cfg(debug_assertions)]
use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
#[cfg(not(debug_assertions))]
pub(crate) fn debug_assert_current_span_has_tenant_id() {}
#[cfg(debug_assertions)]
pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]));
#[cfg(debug_assertions)]
#[track_caller]
pub(crate) fn debug_assert_current_span_has_tenant_id() {
if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
panic!(
"missing extractors: {:?}",
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
)
}
}

View File

@@ -222,13 +222,14 @@ impl Layer for DeltaLayer {
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
"----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.desc.lsn_range.start,
self.desc.lsn_range.end
self.desc.lsn_range.end,
self.desc.file_size,
);
if !verbose {

View File

@@ -153,12 +153,14 @@ impl Layer for ImageLayer {
/// debugging function to print out the contents of the layer
fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
println!(
"----- image layer for ten {} tli {} key {}-{} at {} ----",
"----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.lsn
self.lsn,
self.desc.is_incremental,
self.desc.file_size
);
if !verbose {

View File

@@ -174,13 +174,16 @@ impl PersistentLayerDesc {
pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
"----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
self.tenant_id,
self.timeline_id,
self.key_range.start,
self.key_range.end,
self.lsn_range.start,
self.lsn_range.end
self.lsn_range.end,
self.is_delta,
self.is_incremental,
self.file_size,
);
Ok(())

View File

@@ -77,13 +77,16 @@ impl Layer for RemoteLayer {
/// debugging function to print out the contents of the layer
fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
println!(
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
"----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
self.desc.tenant_id,
self.desc.timeline_id,
self.desc.key_range.start,
self.desc.key_range.end,
self.desc.lsn_range.start,
self.desc.lsn_range.end
self.desc.lsn_range.end,
self.desc.is_delta,
self.desc.is_incremental,
self.desc.file_size,
);
Ok(())

View File

@@ -1,6 +1,9 @@
//!
mod eviction_task;
mod logical_size;
pub mod span;
pub mod uninit;
mod walreceiver;
use anyhow::{anyhow, bail, ensure, Context, Result};
@@ -8,7 +11,6 @@ use bytes::Bytes;
use fail::fail_point;
use futures::StreamExt;
use itertools::Itertools;
use once_cell::sync::OnceCell;
use pageserver_api::models::{
DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus,
@@ -17,7 +19,7 @@ use pageserver_api::models::{
use remote_storage::GenericRemoteStorage;
use serde_with::serde_as;
use storage_broker::BrokerClientChannel;
use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
use tokio::sync::{oneshot, watch, TryAcquireError};
use tokio_util::sync::CancellationToken;
use tracing::*;
use utils::id::TenantTimelineId;
@@ -28,7 +30,7 @@ use std::fs;
use std::ops::{Deref, Range};
use std::path::{Path, PathBuf};
use std::pin::pin;
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
use std::sync::atomic::Ordering as AtomicOrdering;
use std::sync::{Arc, Mutex, RwLock, Weak};
use std::time::{Duration, Instant, SystemTime};
@@ -38,6 +40,7 @@ use crate::tenant::storage_layer::{
DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
LayerAccessStats, LayerFileName, RemoteLayer,
};
use crate::tenant::timeline::logical_size::CurrentLogicalSize;
use crate::tenant::{
ephemeral_file::is_ephemeral_file,
layer_map::{LayerMap, SearchResult},
@@ -79,6 +82,7 @@ use crate::{is_temporary, task_mgr};
pub(super) use self::eviction_task::EvictionTaskTenantState;
use self::eviction_task::EvictionTaskTimelineState;
use self::logical_size::LogicalSize;
use self::walreceiver::{WalReceiver, WalReceiverConf};
use super::config::TenantConf;
@@ -365,126 +369,6 @@ pub struct Timeline {
initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
}
/// Internal structure to hold all data needed for logical size calculation.
///
/// Calculation consists of two stages:
///
/// 1. Initial size calculation. That might take a long time, because it requires
/// reading all layers containing relation sizes at `initial_part_end`.
///
/// 2. Collecting an incremental part and adding that to the initial size.
/// Increments are appended on walreceiver writing new timeline data,
/// which result in increase or decrease of the logical size.
struct LogicalSize {
/// Size, potentially slow to compute. Calculating this might require reading multiple
/// layers, and even ancestor's layers.
///
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
/// the initial size at a different LSN.
initial_logical_size: OnceCell<u64>,
/// Semaphore to track ongoing calculation of `initial_logical_size`.
initial_size_computation: Arc<tokio::sync::Semaphore>,
/// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
initial_part_end: Option<Lsn>,
/// All other size changes after startup, combined together.
///
/// Size shouldn't ever be negative, but this is signed for two reasons:
///
/// 1. If we initialized the "baseline" size lazily, while we already
/// process incoming WAL, the incoming WAL records could decrement the
/// variable and temporarily make it negative. (This is just future-proofing;
/// the initialization is currently not done lazily.)
///
/// 2. If there is a bug and we e.g. forget to increment it in some cases
/// when size grows, but remember to decrement it when it shrinks again, the
/// variable could go negative. In that case, it seems better to at least
/// try to keep tracking it, rather than clamp or overflow it. Note that
/// get_current_logical_size() will clamp the returned value to zero if it's
/// negative, and log an error. Could set it permanently to zero or some
/// special value to indicate "broken" instead, but this will do for now.
///
/// Note that we also expose a copy of this value as a prometheus metric,
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
/// to modify this, it will also keep the prometheus metric in sync.
size_added_after_initial: AtomicI64,
}
/// Normalized current size, that the data in pageserver occupies.
#[derive(Debug, Clone, Copy)]
enum CurrentLogicalSize {
/// The size is not yet calculated to the end, this is an intermediate result,
/// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
/// yet total logical size cannot be below 0.
Approximate(u64),
// Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
// available for observation without any calculations.
Exact(u64),
}
impl CurrentLogicalSize {
fn size(&self) -> u64 {
*match self {
Self::Approximate(size) => size,
Self::Exact(size) => size,
}
}
}
impl LogicalSize {
fn empty_initial() -> Self {
Self {
initial_logical_size: OnceCell::with_value(0),
// initial_logical_size already computed, so don't admit any calculations
initial_size_computation: Arc::new(Semaphore::new(0)),
initial_part_end: None,
size_added_after_initial: AtomicI64::new(0),
}
}
fn deferred_initial(compute_to: Lsn) -> Self {
Self {
initial_logical_size: OnceCell::new(),
initial_size_computation: Arc::new(Semaphore::new(1)),
initial_part_end: Some(compute_to),
size_added_after_initial: AtomicI64::new(0),
}
}
fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
// ^^^ keep this type explicit so that the casts in this function break if
// we change the type.
match self.initial_logical_size.get() {
Some(initial_size) => {
initial_size.checked_add_signed(size_increment)
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
.map(CurrentLogicalSize::Exact)
}
None => {
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
}
}
}
fn increment_size(&self, delta: i64) {
self.size_added_after_initial
.fetch_add(delta, AtomicOrdering::SeqCst);
}
/// Make the value computed by initial logical size computation
/// available for re-use. This doesn't contain the incremental part.
fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
match self.initial_part_end {
Some(v) if v == lsn => self.initial_logical_size.get().copied(),
_ => None,
}
}
}
pub struct WalReceiverInfo {
pub wal_source_connconf: PgConnectionConfig,
pub last_received_msg_lsn: Lsn,
@@ -2239,7 +2123,7 @@ impl Timeline {
ctx: &RequestContext,
cancel: CancellationToken,
) -> Result<u64, CalculateLogicalSizeError> {
debug_assert_current_span_has_tenant_and_timeline_id();
span::debug_assert_current_span_has_tenant_and_timeline_id();
let mut timeline_state_updates = self.subscribe_for_state_updates();
let self_calculation = Arc::clone(self);
@@ -4545,7 +4429,7 @@ impl Timeline {
&self,
remote_layer: Arc<RemoteLayer>,
) -> anyhow::Result<()> {
debug_assert_current_span_has_tenant_and_timeline_id();
span::debug_assert_current_span_has_tenant_and_timeline_id();
use std::sync::atomic::Ordering::Relaxed;
@@ -4979,33 +4863,6 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
bail!("couldn't find an unused backup number for {:?}", path)
}
#[cfg(not(debug_assertions))]
#[inline]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
#[cfg(debug_assertions)]
#[inline]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
use utils::tracing_span_assert;
pub static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<
tracing_span_assert::MultiNameExtractor<2>,
> = once_cell::sync::Lazy::new(|| {
tracing_span_assert::MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
});
match tracing_span_assert::check_fields_present([
&*super::TENANT_ID_EXTRACTOR,
&*TIMELINE_ID_EXTRACTOR,
]) {
Ok(()) => (),
Err(missing) => panic!(
"missing extractors: {:?}",
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
),
}
}
/// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
///
/// Returns `true` if the two `Arc` point to the same layer, false otherwise.

View File

@@ -0,0 +1,128 @@
use anyhow::Context;
use once_cell::sync::OnceCell;
use tokio::sync::Semaphore;
use utils::lsn::Lsn;
use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
use std::sync::Arc;
/// Internal structure to hold all data needed for logical size calculation.
///
/// Calculation consists of two stages:
///
/// 1. Initial size calculation. That might take a long time, because it requires
/// reading all layers containing relation sizes at `initial_part_end`.
///
/// 2. Collecting an incremental part and adding that to the initial size.
/// Increments are appended on walreceiver writing new timeline data,
/// which result in increase or decrease of the logical size.
pub(super) struct LogicalSize {
/// Size, potentially slow to compute. Calculating this might require reading multiple
/// layers, and even ancestor's layers.
///
/// NOTE: size at a given LSN is constant, but after a restart we will calculate
/// the initial size at a different LSN.
pub initial_logical_size: OnceCell<u64>,
/// Semaphore to track ongoing calculation of `initial_logical_size`.
pub initial_size_computation: Arc<tokio::sync::Semaphore>,
/// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
pub initial_part_end: Option<Lsn>,
/// All other size changes after startup, combined together.
///
/// Size shouldn't ever be negative, but this is signed for two reasons:
///
/// 1. If we initialized the "baseline" size lazily, while we already
/// process incoming WAL, the incoming WAL records could decrement the
/// variable and temporarily make it negative. (This is just future-proofing;
/// the initialization is currently not done lazily.)
///
/// 2. If there is a bug and we e.g. forget to increment it in some cases
/// when size grows, but remember to decrement it when it shrinks again, the
/// variable could go negative. In that case, it seems better to at least
/// try to keep tracking it, rather than clamp or overflow it. Note that
/// get_current_logical_size() will clamp the returned value to zero if it's
/// negative, and log an error. Could set it permanently to zero or some
/// special value to indicate "broken" instead, but this will do for now.
///
/// Note that we also expose a copy of this value as a prometheus metric,
/// see `current_logical_size_gauge`. Use the `update_current_logical_size`
/// to modify this, it will also keep the prometheus metric in sync.
pub size_added_after_initial: AtomicI64,
}
/// Normalized current size, that the data in pageserver occupies.
#[derive(Debug, Clone, Copy)]
pub(super) enum CurrentLogicalSize {
/// The size is not yet calculated to the end, this is an intermediate result,
/// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
/// yet total logical size cannot be below 0.
Approximate(u64),
// Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
// available for observation without any calculations.
Exact(u64),
}
impl CurrentLogicalSize {
pub(super) fn size(&self) -> u64 {
*match self {
Self::Approximate(size) => size,
Self::Exact(size) => size,
}
}
}
impl LogicalSize {
pub(super) fn empty_initial() -> Self {
Self {
initial_logical_size: OnceCell::with_value(0),
// initial_logical_size already computed, so don't admit any calculations
initial_size_computation: Arc::new(Semaphore::new(0)),
initial_part_end: None,
size_added_after_initial: AtomicI64::new(0),
}
}
pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
Self {
initial_logical_size: OnceCell::new(),
initial_size_computation: Arc::new(Semaphore::new(1)),
initial_part_end: Some(compute_to),
size_added_after_initial: AtomicI64::new(0),
}
}
pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
// ^^^ keep this type explicit so that the casts in this function break if
// we change the type.
match self.initial_logical_size.get() {
Some(initial_size) => {
initial_size.checked_add_signed(size_increment)
.with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
.map(CurrentLogicalSize::Exact)
}
None => {
let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
}
}
}
pub(super) fn increment_size(&self, delta: i64) {
self.size_added_after_initial
.fetch_add(delta, AtomicOrdering::SeqCst);
}
/// Make the value computed by initial logical size computation
/// available for re-use. This doesn't contain the incremental part.
pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
match self.initial_part_end {
Some(v) if v == lsn => self.initial_logical_size.get().copied(),
_ => None,
}
}
}
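
For orientation, a hypothetical use of this API (LogicalSize and CurrentLogicalSize are pub(super), so this sketch only compiles from inside the timeline module):

```rust
// Sketch only: assumes LogicalSize/CurrentLogicalSize are in scope.
fn logical_size_example() -> anyhow::Result<()> {
    // empty_initial() starts with an exact initial size of 0.
    let size = LogicalSize::empty_initial();
    size.increment_size(8192); // walreceiver ingested a page worth of data
    size.increment_size(-4096); // and some data was deleted
    match size.current_size()? {
        CurrentLogicalSize::Exact(v) => assert_eq!(v, 4096),
        CurrentLogicalSize::Approximate(v) => {
            // initial part still being computed; v covers only the increments
            println!("approximate logical size: {v}");
        }
    }
    Ok(())
}
```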

View File

@@ -0,0 +1,25 @@
#[cfg(debug_assertions)]
use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};
#[cfg(not(debug_assertions))]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
#[cfg(debug_assertions)]
#[track_caller]
pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
once_cell::sync::Lazy::new(|| {
MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
});
let fields: [&dyn Extractor; 2] = [
&*crate::tenant::span::TENANT_ID_EXTRACTOR,
&*TIMELINE_ID_EXTRACTOR,
];
if let Err(missing) = check_fields_present(fields) {
panic!(
"missing extractors: {:?}",
missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
)
}
}
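
To see what the assertion enforces, a minimal sketch (the span setup is hypothetical; the field names come from the extractors above):

```rust
use tracing::info_span;

fn main() {
    // The assertion only passes when the current span (or an ancestor)
    // records both fields; entering a span like this satisfies it.
    let span = info_span!("delete_timeline", tenant_id = %"t-123", timeline_id = %"tl-456");
    let _entered = span.enter();
    // debug_assert_current_span_has_tenant_and_timeline_id(); // would pass here
}
```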

View File

@@ -0,0 +1,219 @@
use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
use anyhow::Context;
use tracing::{error, info, info_span, warn};
use utils::{crashsafe, id::TimelineId, lsn::Lsn};
use crate::{
context::RequestContext,
import_datadir,
tenant::{ignore_absent_files, Tenant},
};
use super::Timeline;
/// A timeline with some of its files on disk, being initialized.
/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
/// to be removed on next restart.
///
/// The caller is responsible for proper timeline data filling before the final init.
#[must_use]
pub struct UninitializedTimeline<'t> {
pub(crate) owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
}
impl<'t> UninitializedTimeline<'t> {
pub(crate) fn new(
owning_tenant: &'t Tenant,
timeline_id: TimelineId,
raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
) -> Self {
Self {
owning_tenant,
timeline_id,
raw_timeline,
}
}
/// Finish timeline creation: insert it into the Tenant's timelines map and remove the
/// uninit mark file.
///
/// This function launches the flush loop if not already done.
///
/// The caller is responsible for activating the timeline (function `.activate()`).
pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
let timeline_id = self.timeline_id;
let tenant_id = self.owning_tenant.tenant_id;
let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
})?;
// Check that the caller initialized disk_consistent_lsn
let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
anyhow::ensure!(
new_disk_consistent_lsn.is_valid(),
"new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
);
let mut timelines = self.owning_tenant.timelines.lock().unwrap();
match timelines.entry(timeline_id) {
Entry::Occupied(_) => anyhow::bail!(
"Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
),
Entry::Vacant(v) => {
uninit_mark.remove_uninit_mark().with_context(|| {
format!(
"Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
)
})?;
v.insert(Arc::clone(&new_timeline));
new_timeline.maybe_spawn_flush_loop();
}
}
Ok(new_timeline)
}
/// Prepares timeline data by loading it from the basebackup archive.
pub(crate) async fn import_basebackup_from_tar(
self,
copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
base_lsn: Lsn,
broker_client: storage_broker::BrokerClientChannel,
ctx: &RequestContext,
) -> anyhow::Result<Arc<Timeline>> {
let raw_timeline = self.raw_timeline()?;
import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
.await
.context("Failed to import basebackup")?;
// Flush the new layer files to disk, before we make the timeline available to
// the outside world.
//
// Flush loop needs to be spawned in order to be able to flush.
raw_timeline.maybe_spawn_flush_loop();
fail::fail_point!("before-checkpoint-new-timeline", |_| {
anyhow::bail!("failpoint before-checkpoint-new-timeline");
});
raw_timeline
.freeze_and_flush()
.await
.context("Failed to flush after basebackup import")?;
// All the data has been imported. Insert the Timeline into the tenant's timelines
// map and remove the uninit mark file.
let tl = self.finish_creation()?;
tl.activate(broker_client, None, ctx);
Ok(tl)
}
pub(crate) fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
Ok(&self
.raw_timeline
.as_ref()
.with_context(|| {
format!(
"No raw timeline {}/{} found",
self.owning_tenant.tenant_id, self.timeline_id
)
})?
.0)
}
}
impl Drop for UninitializedTimeline<'_> {
fn drop(&mut self) {
if let Some((_, uninit_mark)) = self.raw_timeline.take() {
let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
error!("Timeline got dropped without initializing, cleaning its files");
cleanup_timeline_directory(uninit_mark);
}
}
}
pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
let timeline_path = &uninit_mark.timeline_path;
match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
Ok(()) => {
info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
}
Err(e) => {
error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
}
}
drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
}
/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
/// or gets removed eventually.
///
/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
#[must_use]
pub(crate) struct TimelineUninitMark {
uninit_mark_deleted: bool,
uninit_mark_path: PathBuf,
pub(crate) timeline_path: PathBuf,
}
impl TimelineUninitMark {
pub(crate) fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
Self {
uninit_mark_deleted: false,
uninit_mark_path,
timeline_path,
}
}
fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
if !self.uninit_mark_deleted {
self.delete_mark_file_if_present()?;
}
Ok(())
}
fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
let uninit_mark_file = &self.uninit_mark_path;
let uninit_mark_parent = uninit_mark_file
.parent()
.with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
})?;
crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
self.uninit_mark_deleted = true;
Ok(())
}
}
impl Drop for TimelineUninitMark {
fn drop(&mut self) {
if !self.uninit_mark_deleted {
if self.timeline_path.exists() {
error!(
"Uninit mark {} is not removed, timeline {} stays uninitialized",
self.uninit_mark_path.display(),
self.timeline_path.display()
)
} else {
// unblock later timeline creation attempts
warn!(
"Removing intermediate uninit mark file {}",
self.uninit_mark_path.display()
);
if let Err(e) = self.delete_mark_file_if_present() {
error!("Failed to remove the uninit mark file: {e}")
}
}
}
}
}
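
The module is built around a drop-guard pattern: #[must_use] values whose Drop impl rolls back a half-finished init unless the value is explicitly consumed. A distilled sketch with simplified, hypothetical types:

```rust
use std::path::PathBuf;

// Stand-in for UninitializedTimeline/TimelineUninitMark: dropping the guard
// without finishing rolls the partial state back.
#[must_use]
struct InitGuard {
    finished: bool,
    path: PathBuf,
}

impl InitGuard {
    fn new(path: PathBuf) -> Self {
        Self { finished: false, path }
    }

    // Consuming the guard commits the initialization (cf. finish_creation()).
    fn finish_creation(mut self) {
        self.finished = true;
        println!("init of {:?} committed, uninit mark removed", self.path);
    }
}

impl Drop for InitGuard {
    fn drop(&mut self) {
        if !self.finished {
            // Mirrors cleanup_timeline_directory(): undo the partial init.
            eprintln!("cleaning up partially initialized {:?}", self.path);
        }
    }
}

fn main() {
    let guard = InitGuard::new(PathBuf::from("timelines/abc"));
    // Any early return or panic before this point triggers the cleanup path.
    guard.finish_creation();
}
```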

View File

@@ -149,7 +149,7 @@ hnsw_check_available_memory(Size requested)
struct sysinfo si;
Size total;
if (sysinfo(&si) < 0)
elog(ERROR, "Failed to get amount of RAM: %n");
elog(ERROR, "Failed to get amount of RAM: %m");
total = si.totalram*si.mem_unit;
if ((Size)NBuffers*BLCKSZ + requested >= total)
@@ -580,6 +580,7 @@ l2_distance(PG_FUNCTION_ARGS)
errmsg("different array dimensions %d and %d", a_dim, b_dim)));
}
#pragma clang loop vectorize(enable)
for (int i = 0; i < a_dim; i++)
{
diff = ax[i] - bx[i];

View File

@@ -223,6 +223,7 @@ dist_t fstdistfunc_scalar(const coord_t *x, const coord_t *y, size_t n)
{
dist_t distance = 0.0;
#pragma clang loop vectorize(enable)
for (size_t i = 0; i < n; i++)
{
dist_t diff = x[i] - y[i];

View File

@@ -34,7 +34,6 @@
#define PageStoreTrace DEBUG5
#define MAX_RECONNECT_ATTEMPTS 5
#define RECONNECT_INTERVAL_USEC 1000000
bool connected = false;
@@ -55,13 +54,15 @@ int32 max_cluster_size;
char *page_server_connstring;
char *neon_auth_token;
int n_unflushed_requests = 0;
int flush_every_n_requests = 8;
int readahead_buffer_size = 128;
int flush_every_n_requests = 8;
int n_reconnect_attempts = 0;
int max_reconnect_attempts = 60;
bool (*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;
static void pageserver_flush(void);
static bool pageserver_flush(void);
static bool
pageserver_connect(int elevel)
@@ -232,16 +233,17 @@ pageserver_disconnect(void)
}
}
static void
static bool
pageserver_send(NeonRequest * request)
{
StringInfoData req_buff;
int n_reconnect_attempts = 0;
/* If the connection was lost for some reason, reconnect */
if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
{
neon_log(LOG, "pageserver_send disconnect bad connection");
pageserver_disconnect();
}
req_buff = nm_pack_request(request);
@@ -252,53 +254,36 @@ pageserver_send(NeonRequest * request)
* See https://github.com/neondatabase/neon/issues/1138
* So try to reestablish connection in case of failure.
*/
while (true)
if (!connected)
{
if (!connected)
while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
{
if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? LOG : ERROR))
{
n_reconnect_attempts += 1;
pg_usleep(RECONNECT_INTERVAL_USEC);
continue;
}
n_reconnect_attempts += 1;
pg_usleep(RECONNECT_INTERVAL_USEC);
}
n_reconnect_attempts = 0;
}
/*
* Send request.
*
* In principle, this could block if the output buffer is full, and we
* should use async mode and check for interrupts while waiting. In
* practice, our requests are small enough to always fit in the output and
* TCP buffer.
*/
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
if (n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS)
{
neon_log(LOG, "failed to send page request (try to reconnect): %s", msg);
if (n_reconnect_attempts != 0) /* do not sleep before first reconnect attempt, assuming that pageserver is already restarted */
pg_usleep(RECONNECT_INTERVAL_USEC);
n_reconnect_attempts += 1;
continue;
}
else
{
pageserver_disconnect();
neon_log(ERROR, "failed to send page request: %s", msg);
}
}
break;
/*
* Send request.
*
* In principle, this could block if the output buffer is full, and we
* should use async mode and check for interrupts while waiting. In
* practice, our requests are small enough to always fit in the output and
* TCP buffer.
*/
if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
pfree(msg);
pfree(req_buff.data);
return false;
}
pfree(req_buff.data);
n_unflushed_requests++;
if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests)
pageserver_flush();
if (message_level_is_interesting(PageStoreTrace))
{
char *msg = nm_to_string((NeonMessage *) request);
@@ -306,6 +291,7 @@ pageserver_send(NeonRequest * request)
neon_log(PageStoreTrace, "sent request: %s", msg);
pfree(msg);
}
return true;
}
static NeonResponse *
@@ -340,16 +326,25 @@ pageserver_receive(void)
}
else if (rc == -1)
{
neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
pageserver_disconnect();
resp = NULL;
}
else if (rc == -2)
neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn)));
{
char* msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
}
else
neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
{
pageserver_disconnect();
neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
}
}
PG_CATCH();
{
neon_log(LOG, "pageserver_receive disconnect due to caught exception");
pageserver_disconnect();
PG_RE_THROW();
}
@@ -359,21 +354,25 @@ pageserver_receive(void)
}
static void
static bool
pageserver_flush(void)
{
if (!connected)
{
neon_log(WARNING, "Tried to flush while disconnected");
}
else if (PQflush(pageserver_conn))
else
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(ERROR, "failed to flush page requests: %s", msg);
if (PQflush(pageserver_conn))
{
char *msg = pchomp(PQerrorMessage(pageserver_conn));
pageserver_disconnect();
neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
pfree(msg);
return false;
}
}
n_unflushed_requests = 0;
return true;
}
page_server_api api = {
@@ -439,6 +438,14 @@ pg_init_libpagestore(void)
PGC_USERSET,
0, /* no flags required */
NULL, NULL, NULL);
DefineCustomIntVariable("neon.max_reconnect_attempts",
"Maximal attempts to reconnect to pages server (with 1 second timeout)",
NULL,
&max_reconnect_attempts,
10, 0, INT_MAX,
PGC_USERSET,
0,
NULL, NULL, NULL);
DefineCustomIntVariable("neon.readahead_buffer_size",
"number of prefetches to buffer",
"This buffer is used to hold and manage prefetched "

View File

@@ -145,9 +145,9 @@ extern char *nm_to_string(NeonMessage * msg);
typedef struct
{
void (*send) (NeonRequest * request);
bool (*send) (NeonRequest * request);
NeonResponse *(*receive) (void);
void (*flush) (void);
bool (*flush) (void);
} page_server_api;
extern void prefetch_on_ps_disconnect(void);

View File

@@ -489,7 +489,8 @@ prefetch_wait_for(uint64 ring_index)
if (MyPState->ring_flush <= ring_index &&
MyPState->ring_unused > MyPState->ring_flush)
{
page_server->flush();
if (!page_server->flush())
return false;
MyPState->ring_flush = MyPState->ring_unused;
}
@@ -666,7 +667,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
* smaller than the current WAL insert/redo pointer, which is already
* larger than this prefetch_lsn. So in any case, that would
* invalidate this cache.
*
*
* The best LSN to use for effective_request_lsn would be
* XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
*/
@@ -677,7 +678,8 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
Assert(slot->response == NULL);
Assert(slot->my_ring_index == MyPState->ring_unused);
page_server->send((NeonRequest *) &request);
while (!page_server->send((NeonRequest *) &request));
/* update prefetch state */
MyPState->n_requests_inflight += 1;
@@ -687,6 +689,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
/* update slot state */
slot->status = PRFS_REQUESTED;
prfh_insert(MyPState->prf_hash, slot, &found);
Assert(!found);
}
@@ -743,6 +746,7 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
prefetch_set_unused(ring_index);
entry = NULL;
}
}
/* if we don't want the latest version, only accept requests with the exact same LSN */
else
@@ -756,20 +760,23 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
}
}
/*
* We received a prefetch for a page that was recently read and
* removed from the buffers. Remove that request from the buffers.
*/
else if (slot->status == PRFS_TAG_REMAINS)
if (entry != NULL)
{
prefetch_set_unused(ring_index);
entry = NULL;
}
else
{
/* The buffered request is good enough, return that index */
pgBufferUsage.prefetch.duplicates++;
return ring_index;
/*
* We received a prefetch for a page that was recently read and
* removed from the buffers. Remove that request from the buffers.
*/
if (slot->status == PRFS_TAG_REMAINS)
{
prefetch_set_unused(ring_index);
entry = NULL;
}
else
{
/* The buffered request is good enough, return that index */
pgBufferUsage.prefetch.duplicates++;
return ring_index;
}
}
}
@@ -859,8 +866,7 @@ page_server_request(void const *req)
{
NeonResponse* resp;
do {
page_server->send((NeonRequest *) req);
page_server->flush();
while (!page_server->send((NeonRequest *) req) || !page_server->flush());
MyPState->ring_flush = MyPState->ring_unused;
consume_prefetch_responses();
resp = page_server->receive();

View File

@@ -191,6 +191,12 @@ impl Storage for FileStorage {
control_partial_path.display()
)
})?;
control_partial.flush().await.with_context(|| {
format!(
"failed to flush safekeeper state into control file at: {}",
control_partial_path.display()
)
})?;
// fsync the file
if !self.conf.no_sync {

View File

@@ -188,6 +188,7 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
let mut response = client.get(&http_url).send().await?;
while let Some(chunk) = response.chunk().await? {
file.write_all(&chunk).await?;
file.flush().await?;
}
}

View File

@@ -403,16 +403,18 @@ impl SafekeeperPostgresHandler {
};
// take the latest commit_lsn if don't have stop_pos
let mut end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
let end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
if end_pos < start_pos {
warn!("start_pos {} is ahead of end_pos {}", start_pos, end_pos);
end_pos = start_pos;
warn!(
"requested start_pos {} is ahead of available WAL end_pos {}",
start_pos, end_pos
);
}
info!(
"starting streaming from {:?} till {:?}",
start_pos, stop_pos
"starting streaming from {:?} till {:?}, available WAL ends at {}",
start_pos, stop_pos, end_pos
);
// switch to copy
@@ -547,12 +549,14 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
self.end_pos = *self.commit_lsn_watch_rx.borrow();
if self.end_pos > self.start_pos {
// We have something to send.
trace!("got end_pos {:?}, streaming", self.end_pos);
return Ok(());
}
// Wait for WAL to appear, now self.end_pos == self.start_pos.
if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
self.end_pos = lsn;
trace!("got end_pos {:?}, streaming", self.end_pos);
return Ok(());
}

View File

@@ -248,6 +248,10 @@ impl PhysicalStorage {
};
file.write_all(buf).await?;
// Note: flush just ensures the write above reaches the OS (this is not
// needed with sync IO, where Write::write calls the write syscall
// directly, but it is with async). It does *not* fsync the file.
file.flush().await?;
if xlogoff + buf.len() == self.wal_seg_size {
// If we reached the end of a WAL segment, flush and close it.
@@ -716,6 +720,7 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
count -= XLOG_BLCKSZ;
}
file.write_all(&ZERO_BLOCK[0..count]).await?;
file.flush().await?;
Ok(())
}
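
The same discipline as a self-contained tokio sketch (the file name is hypothetical): flush() pushes buffered bytes to the OS but provides no durability; that still requires sync_all().

```rust
use tokio::fs::File;
use tokio::io::{AsyncWriteExt, BufWriter};

#[tokio::main]
async fn main() -> std::io::Result<()> {
    // BufWriter keeps bytes in an in-process buffer; write_all alone may
    // complete without issuing any write to the OS.
    let mut file = BufWriter::new(File::create("segment.bin").await?);
    file.write_all(b"wal record bytes").await?;
    // flush() hands the buffered bytes to the OS. It does NOT fsync;
    // durability still needs sync_all() on the underlying file.
    file.flush().await?;
    file.get_ref().sync_all().await?;
    Ok(())
}
```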

View File

@@ -1,3 +1,7 @@
import random
import threading
from threading import Thread
from fixtures.log_helper import log
from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
from fixtures.utils import query_scalar
@@ -15,11 +19,17 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
endpoint = env.endpoints.create_start("test_multixact")
log.info("postgres is running on 'test_multixact' branch")
n_records = 100
n_threads = 5
n_iters = 1000
n_restarts = 10
cur = endpoint.connect().cursor()
cur.execute(
"""
CREATE TABLE t1(i int primary key);
INSERT INTO t1 select * from generate_series(1, 100);
f"""
CREATE TABLE t1(pk int primary key, val integer);
INSERT INTO t1 values (generate_series(1, {n_records}), 0);
"""
)
@@ -28,26 +38,32 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
)
# Lock entries using parallel connections in a round-robin fashion.
nclients = 20
connections = []
for i in range(nclients):
# Do not turn on autocommit. We want to hold the key-share locks.
def do_updates():
conn = endpoint.connect(autocommit=False)
connections.append(conn)
for i in range(n_iters):
pk = random.randrange(1, n_records)
conn.cursor().execute(f"update t1 set val=val+1 where pk={pk}")
conn.cursor().execute("select * from t1 for key share")
conn.commit()
conn.close()
# On each iteration, we commit the previous transaction on a connection,
# and issue another select. Each SELECT generates a new multixact that
# includes the new XID, and the XIDs of all the other parallel transactions.
# This generates enough traffic on both multixact offsets and members SLRUs
# to cross page boundaries.
for i in range(5000):
conn = connections[i % nclients]
conn.commit()
conn.cursor().execute("select * from t1 for key share")
for iter in range(n_restarts):
threads: List[threading.Thread] = []
for i in range(n_threads):
threads.append(threading.Thread(target=do_updates, args=(), daemon=False))
threads[-1].start()
# We have multixacts now. We can close the connections.
for c in connections:
c.close()
for thread in threads:
thread.join()
# Restart endpoint
endpoint.stop()
endpoint.start()
conn = endpoint.connect()
cur = conn.cursor()
cur.execute("select count(*) from t1")
assert cur.fetchone() == (n_records,)
# force wal flush
cur.execute("checkpoint")
@@ -74,6 +90,3 @@ def test_multixact(neon_simple_env: NeonEnv, test_output_dir):
# Check that we restored pg_controlfile correctly
assert next_multixact_id_new == next_multixact_id
# Check that we can restore the content of the datadir correctly
check_restored_datadir_content(test_output_dir, env, endpoint)