Do vacuum freeze before copying data

Use cursor to copy data
Fix mapping of TOAST tables
2026-03-18 07:40:37 +00:00 · 2023-07-05 22:17:06 +03:00 · 2023-07-05 21:49:55 +03:00 · 2023-07-05 19:26:29 +03:00 · 2023-07-05 15:47:13 +03:00
29 changed files with 814 additions and 646 deletions
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,5 +21,4 @@
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
-!scripts/combine_control_files.py
 !vm-cgconfig.conf
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -917,9 +917,9 @@ jobs:
        run: rm -rf ~/.ecr

  upload-postgres-extensions-to-s3:
-    # if: |
-    #   (github.ref_name == 'main' || github.ref_name == 'release') &&
-    #    github.event_name != 'workflow_dispatch'
+    if: |
+      (github.ref_name == 'main' || github.ref_name == 'release') &&
+       github.event_name != 'workflow_dispatch'
    runs-on: ${{ github.ref_name == 'release' && fromJSON('["self-hosted", "prod", "x64"]') || fromJSON('["self-hosted", "gen3", "small"]') }}
    needs: [ tag, promote-images ]
    strategy:
@@ -929,7 +929,7 @@ jobs:

    env:
      # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image.
-      # Later all the extensions will be moved to extensions image. (unless that is too slow)
+      # Later all the extensions will be moved to extensions image.
      EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest
      COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest
      AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }}
@@ -956,23 +956,33 @@ jobs:

      - name: Extract postgres-extensions from container
        run: |
-          rm -rf ./extensions ./control_files # Just in case
-          mkdir extensions
-          mkdir control_files
+          rm -rf ./extensions-to-upload ./custom-extensions # Just in case

-          # TODO: Delete Neon extensitons (they always present on compute-node image)
-          # rm -rf ./extensions/share/extension/neon*
-          # rm -rf ./extensions/lib/neon*
+          # The compute image has a slightly different directory layout
+          mkdir -p extensions-to-upload/share
+          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension
+          docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib             ./extensions-to-upload/lib

-          docker cp ${{ steps.create-container.outputs.EID }}:/ ./
-          # sh ./scripts/combine_control_files.sh
-          echo '{ "enabled_extensions": { "123454321": [ "anon" ], "public": [ "embedding" ] }, "control_data": { "embedding": "comment = 'hnsw index' \ndefault_version = '0.1.0' \nmodule_pathname = '$libdir/embedding' \nrelocatable = true \ntrusted = true", "anon": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" } }' > ext_index.json
+          # Delete Neon extensions (they are always present on the compute-node image)
+          rm -rf ./extensions-to-upload/share/extension/neon*
+          rm -rf ./extensions-to-upload/lib/neon*
+
+          # Delete leftovers from the extension build step
+          rm -rf ./extensions-to-upload/lib/pgxs
+          rm -rf ./extensions-to-upload/lib/pkgconfig
+
+          docker cp ${{ steps.create-container.outputs.EID }}:/extensions ./custom-extensions
+          for EXT_NAME in $(ls ./custom-extensions); do
+            mkdir -p ./extensions-to-upload/${EXT_NAME}/share
+
+            mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension
+            mv ./custom-extensions/${EXT_NAME}/lib             ./extensions-to-upload/${EXT_NAME}/lib
+          done

      - name: Upload postgres-extensions to S3
        run: |
          for BUCKET in $(echo ${S3_BUCKETS}); do
-            aws s3 cp --recursive --only-show-errors ./extensions s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
-            aws s3 cp --only-show-errors ./ext_index.json s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
+            aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }}
          done

      - name: Cleanup
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -515,31 +515,6 @@ RUN wget https://github.com/ChenHuajun/pg_roaringbitmap/archive/refs/tags/v0.5.4
    make -j $(getconf _NPROCESSORS_ONLN) install && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/roaringbitmap.control

-#########################################################################################
-#
-# Layer "pg-embedding-pg-build"
-# compile pg_embedding extension
-#
-#########################################################################################
-FROM build-deps AS pg-embedding-pg-build
-COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
-
-ENV PATH "/usr/local/pgsql/bin/:$PATH"
-# 2465f831ea1f8d49c1d74f8959adb7fc277d70cd made on 05/07/2023
-# There is no release tag yet
-RUN wget https://github.com/neondatabase/pg_embedding/archive/2465f831ea1f8d49c1d74f8959adb7fc277d70cd.tar.gz -O pg_embedding.tar.gz && \
-    echo "047af2b1f664a1e6e37867bd4eeaf5934fa27d6ba3d6c4461efa388ddf7cd1d5 pg_embedding.tar.gz" | sha256sum --check && \
-    mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \
-    find /usr/local/pgsql -type f | sort  > /before_embedding.txt && \
-    make -j $(getconf _NPROCESSORS_ONLN) && \
-    make -j $(getconf _NPROCESSORS_ONLN) install && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/embedding.control &&\
-    find /usr/local/pgsql -type f | sort  > /after_embedding.txt &&\
-    /bin/bash -c 'for from in $(comm -13 /before_embedding.txt /after_embedding.txt); do to=/extensions/embedding/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done' && \
-    tar -zcvf /extensions/embedding.tar.gz /extensions/embedding && \
-    mkdir -p /control_files &&\
-    cp /usr/local/pgsql/share/extension/embedding.control /control_files/embedding.control
-
 #########################################################################################
 #
 # Layer "pg-anon-pg-build"
@@ -558,15 +533,7 @@ RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgre
    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
    echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \
    find /usr/local/pgsql -type f | sort  > /after.txt && \
-    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done' && \
-    mkdir /control_files &&\
-    tar -zcvf /extensions/anon.tar.gz /extensions/anon && \
-    mkdir -p /control_files &&\
-    cp /usr/local/pgsql/share/extension/anon.control /control_files/anon.control
-
-# # TODO: Delete leftovers from the extension build step
-# rm -rf ./extensions/lib/pgxs
-# rm -rf ./extensions/lib/pkgconfig
+    /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done'

 #########################################################################################
 #
@@ -704,7 +671,6 @@ COPY --from=pg-pgx-ulid-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=rdkit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-uuidv7-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=pg-roaringbitmap-pg-build /usr/local/pgsql/ /usr/local/pgsql/
-COPY --from=pg-embedding-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/

 RUN make -j $(getconf _NPROCESSORS_ONLN) \
@@ -755,24 +721,19 @@ RUN rm /usr/local/pgsql/lib/lib*.a

 #########################################################################################
 #
-# Extension only
+# Extension only
 #
 #########################################################################################
 FROM scratch AS postgres-extensions
 # After the transition this layer will include all extensitons.
 # As for now, it's only for new custom ones
 #
-# extensions
-COPY --from=pg-anon-pg-build /extensions/anon.tar.gz /extensions/anon.tar.gz
-COPY --from=pg-anon-pg-build /control_files/anon.control /control_files/anon.control
-
-COPY --from=pg-embedding-pg-build /extensions/embedding.tar.gz /extensions/embedding.tar.gz
-COPY --from=pg-embedding-pg-build /control_files/embedding.control /control_files/embedding.control
-
-FROM python:3.11-slim-bookworm AS alekpython
-COPY scripts/combine_control_files.py combine_control_files.py
-COPY --from=postgres-extensions /control_files /control_files
-CMD [ "python3", "./combine_control_files.py"]
+# # Default extensions
+# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension
+# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib             /usr/local/pgsql/lib
+# Custom extensions
+COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib
+COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension

 #########################################################################################
 #
--- a/docker-compose/docker-compose.yml
+++ b/docker-compose/docker-compose.yml
@@ -189,7 +189,7 @@ services:
      - "/bin/bash"
      - "-c"
    command:
-      - "until pg_isready -h compute -p 55433 -U cloud_admin ; do
+      - "until pg_isready -h compute -p 55433 ; do
            echo 'Waiting to start compute...' && sleep 1;
         done"
    depends_on:
--- a/docs/docker.md
+++ b/docs/docker.md
@@ -48,7 +48,6 @@ Creating docker-compose_storage_broker_1       ... done
 2. connect compute node
 ```
 $ echo "localhost:55433:postgres:cloud_admin:cloud_admin" >> ~/.pgpass
-$ chmod 600 ~/.pgpass
 $ psql -h localhost -p 55433 -U cloud_admin
 postgres=# CREATE TABLE t(key int primary key, value text);
 CREATE TABLE
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -234,18 +234,14 @@ pub async fn collect_metrics_iteration(
        // Note that this metric is calculated in a separate bgworker
        // Here we only use cached value, which may lag behind the real latest one
        let tenant_synthetic_size = tenant.get_cached_synthetic_size();
-
-        if tenant_synthetic_size != 0 {
-            // only send non-zeroes because otherwise these show up as errors in logs
-            current_metrics.push((
-                PageserverConsumptionMetricsKey {
-                    tenant_id,
-                    timeline_id: None,
-                    metric: SYNTHETIC_STORAGE_SIZE,
-                },
-                tenant_synthetic_size,
-            ));
-        }
+        current_metrics.push((
+            PageserverConsumptionMetricsKey {
+                tenant_id,
+                timeline_id: None,
+                metric: SYNTHETIC_STORAGE_SIZE,
+            },
+            tenant_synthetic_size,
+        ));
    }

    // Filter metrics, unless we want to send all metrics, including cached ones.
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -11,7 +11,7 @@
 //! parent timeline, and the last LSN that has been written to disk.
 //!

-use anyhow::{bail, Context};
+use anyhow::{bail, ensure, Context};
 use futures::FutureExt;
 use pageserver_api::models::TimelineState;
 use remote_storage::DownloadError;
@@ -49,8 +49,6 @@ use std::time::{Duration, Instant};
 use self::config::TenantConf;
 use self::metadata::TimelineMetadata;
 use self::remote_timeline_client::RemoteTimelineClient;
-use self::timeline::uninit::TimelineUninitMark;
-use self::timeline::uninit::UninitializedTimeline;
 use self::timeline::EvictionTaskTenantState;
 use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
@@ -70,7 +68,6 @@ use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
 use crate::InitializationOrder;

-use crate::tenant::timeline::uninit::cleanup_timeline_directory;
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
 use crate::walredo::WalRedoManager;
@@ -90,7 +87,6 @@ pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
 pub mod manifest;
-mod span;

 pub mod metadata;
 mod par_fsync;
@@ -106,7 +102,7 @@ mod timeline;

 pub mod size;

-pub(crate) use timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
+pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 pub use timeline::{
    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
 };
@@ -165,6 +161,200 @@ pub struct Tenant {
    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
 }

+/// A timeline with some of its files on disk, being initialized.
+/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
+/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
+/// to be removed on next restart.
+///
+/// The caller is responsible for proper timeline data filling before the final init.
+#[must_use]
+pub struct UninitializedTimeline<'t> {
+    owning_tenant: &'t Tenant,
+    timeline_id: TimelineId,
+    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
+}
+
+/// An uninit mark file, created alongside the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
+/// or gets removed eventually.
+///
+/// XXX: it's important to create it next to the timeline dir, not inside it, to ensure the timeline dir gets removed first.
+#[must_use]
+struct TimelineUninitMark {
+    uninit_mark_deleted: bool,
+    uninit_mark_path: PathBuf,
+    timeline_path: PathBuf,
+}
+
+impl UninitializedTimeline<'_> {
+    /// Finish timeline creation: insert it into the Tenant's timelines map and remove the
+    /// uninit mark file.
+    ///
+    /// This function launches the flush loop if not already done.
+    ///
+    /// The caller is responsible for activating the timeline (function `.activate()`).
+    fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
+        let timeline_id = self.timeline_id;
+        let tenant_id = self.owning_tenant.tenant_id;
+
+        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
+            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
+        })?;
+
+        // Check that the caller initialized disk_consistent_lsn
+        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
+        ensure!(
+            new_disk_consistent_lsn.is_valid(),
+            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
+        );
+
+        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
+        match timelines.entry(timeline_id) {
+            Entry::Occupied(_) => anyhow::bail!(
+                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
+            ),
+            Entry::Vacant(v) => {
+                uninit_mark.remove_uninit_mark().with_context(|| {
+                    format!(
+                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
+                    )
+                })?;
+                v.insert(Arc::clone(&new_timeline));
+
+                new_timeline.maybe_spawn_flush_loop();
+            }
+        }
+
+        Ok(new_timeline)
+    }
+
+    /// Prepares timeline data by loading it from the basebackup archive.
+    pub async fn import_basebackup_from_tar(
+        self,
+        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
+        base_lsn: Lsn,
+        broker_client: storage_broker::BrokerClientChannel,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<Arc<Timeline>> {
+        let raw_timeline = self.raw_timeline()?;
+
+        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
+            .await
+            .context("Failed to import basebackup")?;
+
+        // Flush the new layer files to disk before we make the timeline available to
+        // the outside world.
+        //
+        // Flush loop needs to be spawned in order to be able to flush.
+        raw_timeline.maybe_spawn_flush_loop();
+
+        fail::fail_point!("before-checkpoint-new-timeline", |_| {
+            bail!("failpoint before-checkpoint-new-timeline");
+        });
+
+        raw_timeline
+            .freeze_and_flush()
+            .await
+            .context("Failed to flush after basebackup import")?;
+
+        // All the data has been imported. Insert the Timeline into the tenant's timelines
+        // map and remove the uninit mark file.
+        let tl = self.finish_creation()?;
+        tl.activate(broker_client, None, ctx);
+        Ok(tl)
+    }
+
+    fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
+        Ok(&self
+            .raw_timeline
+            .as_ref()
+            .with_context(|| {
+                format!(
+                    "No raw timeline {}/{} found",
+                    self.owning_tenant.tenant_id, self.timeline_id
+                )
+            })?
+            .0)
+    }
+}
+
+impl Drop for UninitializedTimeline<'_> {
+    fn drop(&mut self) {
+        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
+            let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
+            error!("Timeline got dropped without initializing, cleaning its files");
+            cleanup_timeline_directory(uninit_mark);
+        }
+    }
+}
+
+fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
+    let timeline_path = &uninit_mark.timeline_path;
+    match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
+        Ok(()) => {
+            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
+        }
+        Err(e) => {
+            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
+        }
+    }
+    drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
+}
+
+impl TimelineUninitMark {
+    fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
+        Self {
+            uninit_mark_deleted: false,
+            uninit_mark_path,
+            timeline_path,
+        }
+    }
+
+    fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
+        if !self.uninit_mark_deleted {
+            self.delete_mark_file_if_present()?;
+        }
+
+        Ok(())
+    }
+
+    fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
+        let uninit_mark_file = &self.uninit_mark_path;
+        let uninit_mark_parent = uninit_mark_file
+            .parent()
+            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
+        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
+            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
+        })?;
+        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
+        self.uninit_mark_deleted = true;
+
+        Ok(())
+    }
+}
+
+impl Drop for TimelineUninitMark {
+    fn drop(&mut self) {
+        if !self.uninit_mark_deleted {
+            if self.timeline_path.exists() {
+                error!(
+                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
+                    self.uninit_mark_path.display(),
+                    self.timeline_path.display()
+                )
+            } else {
+                // unblock later timeline creation attempts
+                warn!(
+                    "Removing intermediate uninit mark file {}",
+                    self.uninit_mark_path.display()
+                );
+                if let Err(e) = self.delete_mark_file_if_present() {
+                    error!("Failed to remove the uninit mark file: {e}")
+                }
+            }
+        }
+    }
+}
+
 // We should not blindly overwrite local metadata with remote one.
 // For example, consider the following case:
 //     Image layer is flushed to disk as a new delta layer, we update local metadata and start upload task but after that
@@ -505,7 +695,7 @@ impl Tenant {
    /// No background tasks are started as part of this routine.
    ///
    async fn attach(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
        if !tokio::fs::try_exists(&marker_file)
@@ -643,7 +833,7 @@ impl Tenant {
        remote_client: RemoteTimelineClient,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        info!("downloading index file for timeline {}", timeline_id);
        tokio::fs::create_dir_all(self.conf.timeline_path(&timeline_id, &self.tenant_id))
@@ -722,7 +912,7 @@ impl Tenant {
        init_order: Option<InitializationOrder>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        let tenant_conf = match Self::load_tenant_config(conf, tenant_id) {
            Ok(conf) => conf,
@@ -908,7 +1098,7 @@ impl Tenant {
        init_order: Option<&InitializationOrder>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        debug!("loading tenant task");

@@ -954,7 +1144,7 @@ impl Tenant {
        init_order: Option<&InitializationOrder>,
        ctx: &RequestContext,
    ) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
            RemoteTimelineClient::new(
@@ -1545,7 +1735,7 @@ impl Tenant {
        timeline_id: TimelineId,
        _ctx: &RequestContext,
    ) -> Result<(), DeleteTimelineError> {
-        debug_assert_current_span_has_tenant_and_timeline_id();
+        timeline::debug_assert_current_span_has_tenant_and_timeline_id();

        // Transition the timeline into TimelineState::Stopping.
        // This should prevent new operations from starting.
@@ -1709,7 +1899,7 @@ impl Tenant {
        background_jobs_can_start: Option<&completion::Barrier>,
        ctx: &RequestContext,
    ) {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();

        let mut activating = false;
        self.state.send_modify(|current_state| {
@@ -1780,7 +1970,7 @@ impl Tenant {
    ///
    /// This will attempt to shutdown even if tenant is broken.
    pub(crate) async fn shutdown(&self, freeze_and_flush: bool) -> Result<(), ShutdownError> {
-        span::debug_assert_current_span_has_tenant_id();
+        debug_assert_current_span_has_tenant_id();
        // Set tenant (and its timlines) to Stoppping state.
        //
        // Since we can only transition into Stopping state after activation is complete,
@@ -2822,11 +3012,11 @@ impl Tenant {

        debug!("Successfully created initial files for timeline {tenant_id}/{new_timeline_id}");

-        Ok(UninitializedTimeline::new(
-            self,
-            new_timeline_id,
-            Some((timeline_struct, uninit_mark)),
-        ))
+        Ok(UninitializedTimeline {
+            owning_tenant: self,
+            timeline_id: new_timeline_id,
+            raw_timeline: Some((timeline_struct, uninit_mark)),
+        })
    }

    fn create_timeline_files(
@@ -4381,3 +4571,28 @@ mod tests {
        Ok(())
    }
 }
+
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {}
+
+#[cfg(debug_assertions)]
+pub static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<
+    utils::tracing_span_assert::MultiNameExtractor<2>,
+> = once_cell::sync::Lazy::new(|| {
+    utils::tracing_span_assert::MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"])
+});
+
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_id() {
+    use utils::tracing_span_assert;
+
+    match tracing_span_assert::check_fields_present([&*TENANT_ID_EXTRACTOR]) {
+        Ok(()) => (),
+        Err(missing) => panic!(
+            "missing extractors: {:?}",
+            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
+        ),
+    }
+}
--- a/pageserver/src/tenant/remote_timeline_client/download.rs
+++ b/pageserver/src/tenant/remote_timeline_client/download.rs
@@ -16,7 +16,7 @@ use tracing::{info, warn};

 use crate::config::PageServerConf;
 use crate::tenant::storage_layer::LayerFileName;
-use crate::tenant::timeline::span::debug_assert_current_span_has_tenant_and_timeline_id;
+use crate::tenant::timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::{exponential_backoff, DEFAULT_BASE_BACKOFF_SECONDS, DEFAULT_MAX_BACKOFF_SECONDS};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use utils::crashsafe::path_with_suffix_extension;
--- a/pageserver/src/tenant/span.rs
+++ b/pageserver/src/tenant/span.rs
@@ -1,20 +0,0 @@
-#[cfg(debug_assertions)]
-use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
-
-#[cfg(not(debug_assertions))]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {}
-
-#[cfg(debug_assertions)]
-pub(crate) static TENANT_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
-    once_cell::sync::Lazy::new(|| MultiNameExtractor::new("TenantId", ["tenant_id", "tenant"]));
-
-#[cfg(debug_assertions)]
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_id() {
-    if let Err(missing) = check_fields_present([&*TENANT_ID_EXTRACTOR]) {
-        panic!(
-            "missing extractors: {:?}",
-            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        )
-    }
-}
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -222,14 +222,13 @@ impl Layer for DeltaLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} size {} ----",
+            "----- delta layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.file_size,
+            self.desc.lsn_range.end
        );

        if !verbose {
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -153,14 +153,12 @@ impl Layer for ImageLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, verbose: bool, ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- image layer for ten {} tli {} key {}-{} at {} is_incremental {} size {} ----",
+            "----- image layer for ten {} tli {} key {}-{} at {} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
-            self.lsn,
-            self.desc.is_incremental,
-            self.desc.file_size
+            self.lsn
        );

        if !verbose {
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -174,16 +174,13 @@ impl PersistentLayerDesc {

    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
+            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
            self.tenant_id,
            self.timeline_id,
            self.key_range.start,
            self.key_range.end,
            self.lsn_range.start,
-            self.lsn_range.end,
-            self.is_delta,
-            self.is_incremental,
-            self.file_size,
+            self.lsn_range.end
        );

        Ok(())
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -77,16 +77,13 @@ impl Layer for RemoteLayer {
    /// debugging function to print out the contents of the layer
    fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
        println!(
-            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} is_delta {} is_incremental {} size {} ----",
+            "----- remote layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
            self.desc.tenant_id,
            self.desc.timeline_id,
            self.desc.key_range.start,
            self.desc.key_range.end,
            self.desc.lsn_range.start,
-            self.desc.lsn_range.end,
-            self.desc.is_delta,
-            self.desc.is_incremental,
-            self.desc.file_size,
+            self.desc.lsn_range.end
        );

        Ok(())
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1,9 +1,6 @@
 //!

 mod eviction_task;
-mod logical_size;
-pub mod span;
-pub mod uninit;
 mod walreceiver;

 use anyhow::{anyhow, bail, ensure, Context, Result};
@@ -11,6 +8,7 @@ use bytes::Bytes;
 use fail::fail_point;
 use futures::StreamExt;
 use itertools::Itertools;
+use once_cell::sync::OnceCell;
 use pageserver_api::models::{
    DownloadRemoteLayersTaskInfo, DownloadRemoteLayersTaskSpawnRequest,
    DownloadRemoteLayersTaskState, LayerMapInfo, LayerResidenceEventReason, LayerResidenceStatus,
@@ -19,7 +17,7 @@ use pageserver_api::models::{
 use remote_storage::GenericRemoteStorage;
 use serde_with::serde_as;
 use storage_broker::BrokerClientChannel;
-use tokio::sync::{oneshot, watch, TryAcquireError};
+use tokio::sync::{oneshot, watch, Semaphore, TryAcquireError};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TenantTimelineId;
@@ -30,7 +28,7 @@ use std::fs;
 use std::ops::{Deref, Range};
 use std::path::{Path, PathBuf};
 use std::pin::pin;
-use std::sync::atomic::Ordering as AtomicOrdering;
+use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
 use std::sync::{Arc, Mutex, RwLock, Weak};
 use std::time::{Duration, Instant, SystemTime};

@@ -40,7 +38,6 @@ use crate::tenant::storage_layer::{
    DeltaFileName, DeltaLayerWriter, ImageFileName, ImageLayerWriter, InMemoryLayer,
    LayerAccessStats, LayerFileName, RemoteLayer,
 };
-use crate::tenant::timeline::logical_size::CurrentLogicalSize;
 use crate::tenant::{
    ephemeral_file::is_ephemeral_file,
    layer_map::{LayerMap, SearchResult},
@@ -82,7 +79,6 @@ use crate::{is_temporary, task_mgr};

 pub(super) use self::eviction_task::EvictionTaskTenantState;
 use self::eviction_task::EvictionTaskTimelineState;
-use self::logical_size::LogicalSize;
 use self::walreceiver::{WalReceiver, WalReceiverConf};

 use super::config::TenantConf;
@@ -369,6 +365,126 @@ pub struct Timeline {
    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
 }

+/// Internal structure to hold all data needed for logical size calculation.
+///
+/// Calculation consists of two stages:
+///
+/// 1. Initial size calculation. That might take a long time, because it requires
+/// reading all layers containing relation sizes at `initial_part_end`.
+///
+/// 2. Collecting an incremental part and adding that to the initial size.
+/// Increments are recorded as the walreceiver writes new timeline data,
+/// which results in an increase or decrease of the logical size.
+struct LogicalSize {
+    /// Size, potentially slow to compute. Calculating this might require reading multiple
+    /// layers, and even ancestors' layers.
+    ///
+    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
+    /// the initial size at a different LSN.
+    initial_logical_size: OnceCell<u64>,
+
+    /// Semaphore to track ongoing calculation of `initial_logical_size` (no permits if already known).
+    initial_size_computation: Arc<tokio::sync::Semaphore>,
+
+    /// LSN at which the initial size is (to be) calculated, absent for freshly created timelines.
+    initial_part_end: Option<Lsn>,
+
+    /// All other size changes after startup, combined together.
+    ///
+    /// Size shouldn't ever be negative, but this is signed for two reasons:
+    ///
+    /// 1. If we initialized the "baseline" size lazily, while we already
+    /// process incoming WAL, the incoming WAL records could decrement the
+    /// variable and temporarily make it negative. (This is just future-proofing;
+    /// the initialization is currently not done lazily.)
+    ///
+    /// 2. If there is a bug and we e.g. forget to increment it in some cases
+    /// when size grows, but remember to decrement it when it shrinks again, the
+    /// variable could go negative. In that case, it seems better to at least
+    /// try to keep tracking it, rather than clamp or overflow it. Note that
+    /// get_current_logical_size() will clamp the returned value to zero if it's
+    /// negative, and log an error. We could set it permanently to zero or some
+    /// special value to indicate "broken" instead, but this will do for now.
+    ///
+    /// Note that we also expose a copy of this value as a Prometheus metric,
+    /// see `current_logical_size_gauge`. Use `update_current_logical_size`
+    /// to modify this; it also keeps the Prometheus metric in sync.
+    size_added_after_initial: AtomicI64,
+}
+
+/// Normalized current size that the data in pageserver occupies.
+#[derive(Debug, Clone, Copy)]
+enum CurrentLogicalSize {
+    /// The initial size is not yet calculated; this is an intermediate result built only from
+    /// walreceiver increments. Their sum can be negative (e.g. when objects get deleted), so it
+    /// is clamped to zero, because the total logical size cannot be below 0.
+    Approximate(u64),
+    /// Fully calculated logical size: the initial size plus the increments recorded since then.
+    /// Only future walreceiver increments change it, and those need no further calculation.
+    Exact(u64),
+}
+
+impl CurrentLogicalSize {
+    /// Returns the wrapped value, no matter whether it is approximate or exact.
+    fn size(&self) -> u64 {
+        match *self {
+            Self::Approximate(value) | Self::Exact(value) => value,
+        }
+    }
+}
+
+impl LogicalSize {
+    /// Tracker for a timeline whose initial size is already known to be zero.
+    fn empty_initial() -> Self {
+        Self {
+            size_added_after_initial: AtomicI64::new(0),
+            initial_part_end: None,
+            // the initial size is already set, so no permits are handed out to compute it
+            initial_size_computation: Arc::new(Semaphore::new(0)),
+            initial_logical_size: OnceCell::with_value(0),
+        }
+    }
+
+    /// Tracker whose initial size still has to be computed, as of `compute_to`.
+    fn deferred_initial(compute_to: Lsn) -> Self {
+        Self {
+            size_added_after_initial: AtomicI64::new(0),
+            initial_part_end: Some(compute_to),
+            initial_size_computation: Arc::new(Semaphore::new(1)),
+            initial_logical_size: OnceCell::new(),
+        }
+    }
+
+    /// Exact size once the initial part is known, else the increments clamped at zero.
+    fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
+        // The `i64` annotation is deliberate: conversions below must break if the type changes.
+        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
+        if let Some(initial_size) = self.initial_logical_size.get() {
+            let total = initial_size.checked_add_signed(size_increment).with_context(|| {
+                format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}")
+            })?;
+            return Ok(CurrentLogicalSize::Exact(total));
+        }
+        let clamped_increment = u64::try_from(size_increment).unwrap_or(0);
+        Ok(CurrentLogicalSize::Approximate(clamped_increment))
+    }
+
+    /// Adds `delta` (possibly negative) to the size accumulated after the initial part.
+    fn increment_size(&self, delta: i64) {
+        self.size_added_after_initial.fetch_add(delta, AtomicOrdering::SeqCst);
+    }
+
+    /// Hands out the result of the initial logical size computation for re-use, but only
+    /// when it was computed for exactly `lsn`. The incremental part is not included.
+    fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
+        if self.initial_part_end == Some(lsn) {
+            self.initial_logical_size.get().copied()
+        } else {
+            None
+        }
+    }
+}
+
 pub struct WalReceiverInfo {
    pub wal_source_connconf: PgConnectionConfig,
    pub last_received_msg_lsn: Lsn,
@@ -2123,7 +2239,7 @@ impl Timeline {
        ctx: &RequestContext,
        cancel: CancellationToken,
    ) -> Result<u64, CalculateLogicalSizeError> {
-        span::debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();

        let mut timeline_state_updates = self.subscribe_for_state_updates();
        let self_calculation = Arc::clone(self);
@@ -4429,7 +4545,7 @@ impl Timeline {
        &self,
        remote_layer: Arc<RemoteLayer>,
    ) -> anyhow::Result<()> {
-        span::debug_assert_current_span_has_tenant_and_timeline_id();
+        debug_assert_current_span_has_tenant_and_timeline_id();

        use std::sync::atomic::Ordering::Relaxed;

@@ -4863,6 +4979,33 @@ fn rename_to_backup(path: &Path) -> anyhow::Result<()> {
    bail!("couldn't find an unused backup number for {:?}", path)
 }

+/// Variant for builds without debug assertions: does nothing.
+#[cfg(not(debug_assertions))]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
+
+/// Variant for builds with debug assertions: panics when `check_fields_present`
+/// reports that the tenant id or the timeline id extractor found nothing.
+#[cfg(debug_assertions)]
+#[inline]
+pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
+    use once_cell::sync::Lazy;
+    use utils::tracing_span_assert::{check_fields_present, MultiNameExtractor};
+
+    // Initialized on first use and shared by all later calls.
+    static TIMELINE_ID_EXTRACTOR: Lazy<MultiNameExtractor<2>> =
+        Lazy::new(|| MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"]));
+
+    let checked =
+        check_fields_present([&*super::TENANT_ID_EXTRACTOR, &*TIMELINE_ID_EXTRACTOR]);
+
+    // `Ok(())` means nothing is missing; otherwise name every extractor that failed.
+    if let Err(missing) = checked {
+        let names = missing.into_iter().map(|e| e.name()).collect::<Vec<_>>();
+        panic!("missing extractors: {:?}", names);
+    }
+}
+
 /// Similar to `Arc::ptr_eq`, but only compares the object pointers, not vtables.
 ///
 /// Returns `true` if the two `Arc` point to the same layer, false otherwise.
--- a/pageserver/src/tenant/timeline/logical_size.rs
+++ b/pageserver/src/tenant/timeline/logical_size.rs
@@ -1,128 +0,0 @@
-use anyhow::Context;
-use once_cell::sync::OnceCell;
-
-use tokio::sync::Semaphore;
-use utils::lsn::Lsn;
-
-use std::sync::atomic::{AtomicI64, Ordering as AtomicOrdering};
-use std::sync::Arc;
-
-/// Internal structure to hold all data needed for logical size calculation.
-///
-/// Calculation consists of two stages:
-///
-/// 1. Initial size calculation. That might take a long time, because it requires
-/// reading all layers containing relation sizes at `initial_part_end`.
-///
-/// 2. Collecting an incremental part and adding that to the initial size.
-/// Increments are appended on walreceiver writing new timeline data,
-/// which result in increase or decrease of the logical size.
-pub(super) struct LogicalSize {
-    /// Size, potentially slow to compute. Calculating this might require reading multiple
-    /// layers, and even ancestor's layers.
-    ///
-    /// NOTE: size at a given LSN is constant, but after a restart we will calculate
-    /// the initial size at a different LSN.
-    pub initial_logical_size: OnceCell<u64>,
-
-    /// Semaphore to track ongoing calculation of `initial_logical_size`.
-    pub initial_size_computation: Arc<tokio::sync::Semaphore>,
-
-    /// Latest Lsn that has its size uncalculated, could be absent for freshly created timelines.
-    pub initial_part_end: Option<Lsn>,
-
-    /// All other size changes after startup, combined together.
-    ///
-    /// Size shouldn't ever be negative, but this is signed for two reasons:
-    ///
-    /// 1. If we initialized the "baseline" size lazily, while we already
-    /// process incoming WAL, the incoming WAL records could decrement the
-    /// variable and temporarily make it negative. (This is just future-proofing;
-    /// the initialization is currently not done lazily.)
-    ///
-    /// 2. If there is a bug and we e.g. forget to increment it in some cases
-    /// when size grows, but remember to decrement it when it shrinks again, the
-    /// variable could go negative. In that case, it seems better to at least
-    /// try to keep tracking it, rather than clamp or overflow it. Note that
-    /// get_current_logical_size() will clamp the returned value to zero if it's
-    /// negative, and log an error. Could set it permanently to zero or some
-    /// special value to indicate "broken" instead, but this will do for now.
-    ///
-    /// Note that we also expose a copy of this value as a prometheus metric,
-    /// see `current_logical_size_gauge`. Use the `update_current_logical_size`
-    /// to modify this, it will also keep the prometheus metric in sync.
-    pub size_added_after_initial: AtomicI64,
-}
-
-/// Normalized current size, that the data in pageserver occupies.
-#[derive(Debug, Clone, Copy)]
-pub(super) enum CurrentLogicalSize {
-    /// The size is not yet calculated to the end, this is an intermediate result,
-    /// constructed from walreceiver increments and normalized: logical data could delete some objects, hence be negative,
-    /// yet total logical size cannot be below 0.
-    Approximate(u64),
-    // Fully calculated logical size, only other future walreceiver increments are changing it, and those changes are
-    // available for observation without any calculations.
-    Exact(u64),
-}
-
-impl CurrentLogicalSize {
-    pub(super) fn size(&self) -> u64 {
-        *match self {
-            Self::Approximate(size) => size,
-            Self::Exact(size) => size,
-        }
-    }
-}
-
-impl LogicalSize {
-    pub(super) fn empty_initial() -> Self {
-        Self {
-            initial_logical_size: OnceCell::with_value(0),
-            //  initial_logical_size already computed, so, don't admit any calculations
-            initial_size_computation: Arc::new(Semaphore::new(0)),
-            initial_part_end: None,
-            size_added_after_initial: AtomicI64::new(0),
-        }
-    }
-
-    pub(super) fn deferred_initial(compute_to: Lsn) -> Self {
-        Self {
-            initial_logical_size: OnceCell::new(),
-            initial_size_computation: Arc::new(Semaphore::new(1)),
-            initial_part_end: Some(compute_to),
-            size_added_after_initial: AtomicI64::new(0),
-        }
-    }
-
-    pub(super) fn current_size(&self) -> anyhow::Result<CurrentLogicalSize> {
-        let size_increment: i64 = self.size_added_after_initial.load(AtomicOrdering::Acquire);
-        //                  ^^^ keep this type explicit so that the casts in this function break if
-        //                  we change the type.
-        match self.initial_logical_size.get() {
-            Some(initial_size) => {
-                initial_size.checked_add_signed(size_increment)
-                    .with_context(|| format!("Overflow during logical size calculation, initial_size: {initial_size}, size_increment: {size_increment}"))
-                    .map(CurrentLogicalSize::Exact)
-            }
-            None => {
-                let non_negative_size_increment = u64::try_from(size_increment).unwrap_or(0);
-                Ok(CurrentLogicalSize::Approximate(non_negative_size_increment))
-            }
-        }
-    }
-
-    pub(super) fn increment_size(&self, delta: i64) {
-        self.size_added_after_initial
-            .fetch_add(delta, AtomicOrdering::SeqCst);
-    }
-
-    /// Make the value computed by initial logical size computation
-    /// available for re-use. This doesn't contain the incremental part.
-    pub(super) fn initialized_size(&self, lsn: Lsn) -> Option<u64> {
-        match self.initial_part_end {
-            Some(v) if v == lsn => self.initial_logical_size.get().copied(),
-            _ => None,
-        }
-    }
-}
--- a/pageserver/src/tenant/timeline/span.rs
+++ b/pageserver/src/tenant/timeline/span.rs
@@ -1,25 +0,0 @@
-#[cfg(debug_assertions)]
-use utils::tracing_span_assert::{check_fields_present, Extractor, MultiNameExtractor};
-
-#[cfg(not(debug_assertions))]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {}
-
-#[cfg(debug_assertions)]
-#[track_caller]
-pub(crate) fn debug_assert_current_span_has_tenant_and_timeline_id() {
-    static TIMELINE_ID_EXTRACTOR: once_cell::sync::Lazy<MultiNameExtractor<2>> =
-        once_cell::sync::Lazy::new(|| {
-            MultiNameExtractor::new("TimelineId", ["timeline_id", "timeline"])
-        });
-
-    let fields: [&dyn Extractor; 2] = [
-        &*crate::tenant::span::TENANT_ID_EXTRACTOR,
-        &*TIMELINE_ID_EXTRACTOR,
-    ];
-    if let Err(missing) = check_fields_present(fields) {
-        panic!(
-            "missing extractors: {:?}",
-            missing.into_iter().map(|e| e.name()).collect::<Vec<_>>()
-        )
-    }
-}
--- a/pageserver/src/tenant/timeline/uninit.rs
+++ b/pageserver/src/tenant/timeline/uninit.rs
@@ -1,219 +0,0 @@
-use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc};
-
-use anyhow::Context;
-use tracing::{error, info, info_span, warn};
-use utils::{crashsafe, id::TimelineId, lsn::Lsn};
-
-use crate::{
-    context::RequestContext,
-    import_datadir,
-    tenant::{ignore_absent_files, Tenant},
-};
-
-use super::Timeline;
-
-/// A timeline with some of its files on disk, being initialized.
-/// This struct ensures the atomicity of the timeline init: it's either properly created and inserted into pageserver's memory, or
-/// its local files are removed. In the worst case of a crash, an uninit mark file is left behind, which causes the directory
-/// to be removed on next restart.
-///
-/// The caller is responsible for proper timeline data filling before the final init.
-#[must_use]
-pub struct UninitializedTimeline<'t> {
-    pub(crate) owning_tenant: &'t Tenant,
-    timeline_id: TimelineId,
-    raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
-}
-
-impl<'t> UninitializedTimeline<'t> {
-    pub(crate) fn new(
-        owning_tenant: &'t Tenant,
-        timeline_id: TimelineId,
-        raw_timeline: Option<(Arc<Timeline>, TimelineUninitMark)>,
-    ) -> Self {
-        Self {
-            owning_tenant,
-            timeline_id,
-            raw_timeline,
-        }
-    }
-
-    /// Finish timeline creation: insert it into the Tenant's timelines map and remove the
-    /// uninit mark file.
-    ///
-    /// This function launches the flush loop if not already done.
-    ///
-    /// The caller is responsible for activating the timeline (function `.activate()`).
-    pub(crate) fn finish_creation(mut self) -> anyhow::Result<Arc<Timeline>> {
-        let timeline_id = self.timeline_id;
-        let tenant_id = self.owning_tenant.tenant_id;
-
-        let (new_timeline, uninit_mark) = self.raw_timeline.take().with_context(|| {
-            format!("No timeline for initalization found for {tenant_id}/{timeline_id}")
-        })?;
-
-        // Check that the caller initialized disk_consistent_lsn
-        let new_disk_consistent_lsn = new_timeline.get_disk_consistent_lsn();
-        anyhow::ensure!(
-            new_disk_consistent_lsn.is_valid(),
-            "new timeline {tenant_id}/{timeline_id} has invalid disk_consistent_lsn"
-        );
-
-        let mut timelines = self.owning_tenant.timelines.lock().unwrap();
-        match timelines.entry(timeline_id) {
-            Entry::Occupied(_) => anyhow::bail!(
-                "Found freshly initialized timeline {tenant_id}/{timeline_id} in the tenant map"
-            ),
-            Entry::Vacant(v) => {
-                uninit_mark.remove_uninit_mark().with_context(|| {
-                    format!(
-                        "Failed to remove uninit mark file for timeline {tenant_id}/{timeline_id}"
-                    )
-                })?;
-                v.insert(Arc::clone(&new_timeline));
-
-                new_timeline.maybe_spawn_flush_loop();
-            }
-        }
-
-        Ok(new_timeline)
-    }
-
-    /// Prepares timeline data by loading it from the basebackup archive.
-    pub(crate) async fn import_basebackup_from_tar(
-        self,
-        copyin_read: &mut (impl tokio::io::AsyncRead + Send + Sync + Unpin),
-        base_lsn: Lsn,
-        broker_client: storage_broker::BrokerClientChannel,
-        ctx: &RequestContext,
-    ) -> anyhow::Result<Arc<Timeline>> {
-        let raw_timeline = self.raw_timeline()?;
-
-        import_datadir::import_basebackup_from_tar(raw_timeline, copyin_read, base_lsn, ctx)
-            .await
-            .context("Failed to import basebackup")?;
-
-        // Flush the new layer files to disk, before we make the timeline as available to
-        // the outside world.
-        //
-        // Flush loop needs to be spawned in order to be able to flush.
-        raw_timeline.maybe_spawn_flush_loop();
-
-        fail::fail_point!("before-checkpoint-new-timeline", |_| {
-            anyhow::bail!("failpoint before-checkpoint-new-timeline");
-        });
-
-        raw_timeline
-            .freeze_and_flush()
-            .await
-            .context("Failed to flush after basebackup import")?;
-
-        // All the data has been imported. Insert the Timeline into the tenant's timelines
-        // map and remove the uninit mark file.
-        let tl = self.finish_creation()?;
-        tl.activate(broker_client, None, ctx);
-        Ok(tl)
-    }
-
-    pub(crate) fn raw_timeline(&self) -> anyhow::Result<&Arc<Timeline>> {
-        Ok(&self
-            .raw_timeline
-            .as_ref()
-            .with_context(|| {
-                format!(
-                    "No raw timeline {}/{} found",
-                    self.owning_tenant.tenant_id, self.timeline_id
-                )
-            })?
-            .0)
-    }
-}
-
-impl Drop for UninitializedTimeline<'_> {
-    fn drop(&mut self) {
-        if let Some((_, uninit_mark)) = self.raw_timeline.take() {
-            let _entered = info_span!("drop_uninitialized_timeline", tenant = %self.owning_tenant.tenant_id, timeline = %self.timeline_id).entered();
-            error!("Timeline got dropped without initializing, cleaning its files");
-            cleanup_timeline_directory(uninit_mark);
-        }
-    }
-}
-
-pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) {
-    let timeline_path = &uninit_mark.timeline_path;
-    match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) {
-        Ok(()) => {
-            info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark")
-        }
-        Err(e) => {
-            error!("Failed to clean up uninitialized timeline directory {timeline_path:?}: {e:?}")
-        }
-    }
-    drop(uninit_mark); // mark handles its deletion on drop, gets retained if timeline dir exists
-}
-
-/// An uninit mark file, created along the timeline dir to ensure the timeline either gets fully initialized and loaded into pageserver's memory,
-/// or gets removed eventually.
-///
-/// XXX: it's important to create it near the timeline dir, not inside it to ensure timeline dir gets removed first.
-#[must_use]
-pub(crate) struct TimelineUninitMark {
-    uninit_mark_deleted: bool,
-    uninit_mark_path: PathBuf,
-    pub(crate) timeline_path: PathBuf,
-}
-
-impl TimelineUninitMark {
-    pub(crate) fn new(uninit_mark_path: PathBuf, timeline_path: PathBuf) -> Self {
-        Self {
-            uninit_mark_deleted: false,
-            uninit_mark_path,
-            timeline_path,
-        }
-    }
-
-    fn remove_uninit_mark(mut self) -> anyhow::Result<()> {
-        if !self.uninit_mark_deleted {
-            self.delete_mark_file_if_present()?;
-        }
-
-        Ok(())
-    }
-
-    fn delete_mark_file_if_present(&mut self) -> anyhow::Result<()> {
-        let uninit_mark_file = &self.uninit_mark_path;
-        let uninit_mark_parent = uninit_mark_file
-            .parent()
-            .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?;
-        ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| {
-            format!("Failed to remove uninit mark file at path {uninit_mark_file:?}")
-        })?;
-        crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?;
-        self.uninit_mark_deleted = true;
-
-        Ok(())
-    }
-}
-
-impl Drop for TimelineUninitMark {
-    fn drop(&mut self) {
-        if !self.uninit_mark_deleted {
-            if self.timeline_path.exists() {
-                error!(
-                    "Uninit mark {} is not removed, timeline {} stays uninitialized",
-                    self.uninit_mark_path.display(),
-                    self.timeline_path.display()
-                )
-            } else {
-                // unblock later timeline creation attempts
-                warn!(
-                    "Removing intermediate uninit mark file {}",
-                    self.uninit_mark_path.display()
-                );
-                if let Err(e) = self.delete_mark_file_if_present() {
-                    error!("Failed to remove the uninit mark file: {e}")
-                }
-            }
-        }
-    }
-}
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -34,6 +34,7 @@

 #define PageStoreTrace DEBUG5

+#define MAX_RECONNECT_ATTEMPTS 5
 #define RECONNECT_INTERVAL_USEC 1000000

 bool		connected = false;
@@ -54,15 +55,13 @@ int32		max_cluster_size;
 char	   *page_server_connstring;
 char	   *neon_auth_token;

-int			readahead_buffer_size = 128;
+int			n_unflushed_requests = 0;
 int			flush_every_n_requests = 8;
-
-int			n_reconnect_attempts = 0;
-int			max_reconnect_attempts = 60;
+int			readahead_buffer_size = 128;

 bool	(*old_redo_read_buffer_filter) (XLogReaderState *record, uint8 block_id) = NULL;

-static bool pageserver_flush(void);
+static void pageserver_flush(void);

 static bool
 pageserver_connect(int elevel)
@@ -233,17 +232,16 @@ pageserver_disconnect(void)
 	}
 }

-static bool
+static void
 pageserver_send(NeonRequest * request)
 {
 	StringInfoData req_buff;
+	int n_reconnect_attempts = 0;

 	/* If the connection was lost for some reason, reconnect */
 	if (connected && PQstatus(pageserver_conn) == CONNECTION_BAD)
-	{
-		neon_log(LOG, "pageserver_send disconnect bad connection");
 		pageserver_disconnect();
-	}
+

 	req_buff = nm_pack_request(request);

@@ -254,36 +252,53 @@ pageserver_send(NeonRequest * request)
 	 * See https://github.com/neondatabase/neon/issues/1138
 	 * So try to reestablish connection in case of failure.
 	 */
-	if (!connected)
+	while (true)
 	{
-		while (!pageserver_connect(n_reconnect_attempts < max_reconnect_attempts ? LOG : ERROR))
+		if (!connected)
 		{
-			n_reconnect_attempts += 1;
-			pg_usleep(RECONNECT_INTERVAL_USEC);
+			if (!pageserver_connect(n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS ? LOG : ERROR))
+			{
+				n_reconnect_attempts += 1;
+				pg_usleep(RECONNECT_INTERVAL_USEC);
+				continue;
+			}
 		}
-		n_reconnect_attempts = 0;
-	}

-	/*
-	 * Send request.
-	 *
-	 * In principle, this could block if the output buffer is full, and we
-	 * should use async mode and check for interrupts while waiting. In
-	 * practice, our requests are small enough to always fit in the output and
-	 * TCP buffer.
-	 */
-	if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
-	{
-		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-		pageserver_disconnect();
-		neon_log(LOG, "pageserver_send disconnect because failed to send page request (try to reconnect): %s", msg);
-		pfree(msg);
-		pfree(req_buff.data);
-		return false;
+		/*
+		 * Send request.
+		 *
+		 * In principle, this could block if the output buffer is full, and we
+		 * should use async mode and check for interrupts while waiting. In
+		 * practice, our requests are small enough to always fit in the output and
+		 * TCP buffer.
+		 */
+		if (PQputCopyData(pageserver_conn, req_buff.data, req_buff.len) <= 0)
+		{
+			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+			if (n_reconnect_attempts < MAX_RECONNECT_ATTEMPTS)
+			{
+				neon_log(LOG, "failed to send page request (try to reconnect): %s", msg);
+				if (n_reconnect_attempts != 0) /* do not sleep before first reconnect attempt, assuming that pageserver is already restarted */
+					pg_usleep(RECONNECT_INTERVAL_USEC);
+				n_reconnect_attempts += 1;
+				continue;
+			}
+			else
+			{
+				pageserver_disconnect();
+				neon_log(ERROR, "failed to send page request: %s", msg);
+			}
+		}
+		break;
 	}

 	pfree(req_buff.data);

+	n_unflushed_requests++;
+
+	if (flush_every_n_requests > 0 && n_unflushed_requests >= flush_every_n_requests)
+		pageserver_flush();
+
 	if (message_level_is_interesting(PageStoreTrace))
 	{
 		char	   *msg = nm_to_string((NeonMessage *) request);
@@ -291,7 +306,6 @@ pageserver_send(NeonRequest * request)
 		neon_log(PageStoreTrace, "sent request: %s", msg);
 		pfree(msg);
 	}
-	return true;
 }

 static NeonResponse *
@@ -326,25 +340,16 @@ pageserver_receive(void)
 		}
 		else if (rc == -1)
 		{
-			neon_log(LOG, "pageserver_receive disconnect because call_PQgetCopyData returns -1: %s", pchomp(PQerrorMessage(pageserver_conn)));
 			pageserver_disconnect();
 			resp = NULL;
 		}
 		else if (rc == -2)
-		{
-			char* msg = pchomp(PQerrorMessage(pageserver_conn));
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because could not read COPY data: %s", msg);
-		}
+			neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		else
-		{
-			pageserver_disconnect();
-			neon_log(ERROR, "pageserver_receive disconnect because unexpected PQgetCopyData return value: %d", rc);
-		}
+			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 	}
 	PG_CATCH();
 	{
-		neon_log(LOG, "pageserver_receive disconnect due to caught exception");
 		pageserver_disconnect();
 		PG_RE_THROW();
 	}
@@ -354,25 +359,21 @@ pageserver_receive(void)
 }


-static bool
+static void
 pageserver_flush(void)
 {
 	if (!connected)
 	{
 		neon_log(WARNING, "Tried to flush while disconnected");
 	}
-	else
+	else if (PQflush(pageserver_conn))
 	{
-		if (PQflush(pageserver_conn))
-		{
-			char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
-			pageserver_disconnect();
-			neon_log(LOG, "pageserver_flush disconnect because failed to flush page requests: %s", msg);
-			pfree(msg);
-			return false;
-		}
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+
+		pageserver_disconnect();
+		neon_log(ERROR, "failed to flush page requests: %s", msg);
 	}
-	return true;
+	n_unflushed_requests = 0;
 }

 page_server_api api = {
@@ -438,14 +439,6 @@ pg_init_libpagestore(void)
 							PGC_USERSET,
 							0,	/* no flags required */
 							NULL, NULL, NULL);
-	DefineCustomIntVariable("neon.max_reconnect_attempts",
-							"Maximal attempts to reconnect to pages server (with 1 second timeout)",
-							NULL,
-							&max_reconnect_attempts,
-							10, 0, INT_MAX,
-							PGC_USERSET,
-							0,
-							NULL, NULL, NULL);
 	DefineCustomIntVariable("neon.readahead_buffer_size",
 							"number of prefetches to buffer",
 							"This buffer is used to hold and manage prefetched "
--- a/pgxn/neon/neon--1.0.sql
+++ b/pgxn/neon/neon--1.0.sql
@@ -32,3 +32,7 @@ CREATE VIEW local_cache AS
 	SELECT P.* FROM local_cache_pages() AS P
 	(pageoffs int8, relfilenode oid, reltablespace oid, reldatabase oid,
 	 relforknumber int2, relblocknumber int8, accesscount int4);
+
+CREATE FUNCTION copy_from(conninfo cstring) RETURNS BIGINT
+AS 'MODULE_PATHNAME', 'copy_from'
+LANGUAGE C;
--- a/pgxn/neon/neon.c
+++ b/pgxn/neon/neon.c
@@ -13,20 +13,32 @@

 #include "access/xact.h"
 #include "access/xlog.h"
+#include "access/relation.h"
+#include "access/xloginsert.h"
 #include "storage/buf_internals.h"
 #include "storage/bufmgr.h"
 #include "catalog/pg_type.h"
+#include "catalog/namespace.h"
 #include "replication/walsender.h"
 #include "funcapi.h"
+#include "miscadmin.h"
 #include "access/htup_details.h"
 #include "utils/pg_lsn.h"
 #include "utils/guc.h"
+#include "utils/wait_event.h"
+#include "utils/rel.h"
+#include "utils/varlena.h"
+#include "utils/builtins.h"

 #include "neon.h"
 #include "walproposer.h"
 #include "pagestore_client.h"
 #include "control_plane_connector.h"

+#include "libpq-fe.h"
+#include "libpq/pqformat.h"
+#include "libpq/libpq.h"
+
 PG_MODULE_MAGIC;
 void		_PG_init(void);

@@ -46,6 +58,7 @@ _PG_init(void)
 PG_FUNCTION_INFO_V1(pg_cluster_size);
 PG_FUNCTION_INFO_V1(backpressure_lsns);
 PG_FUNCTION_INFO_V1(backpressure_throttling_time);
+PG_FUNCTION_INFO_V1(copy_from);

 Datum
 pg_cluster_size(PG_FUNCTION_ARGS)
@@ -91,3 +104,281 @@ backpressure_throttling_time(PG_FUNCTION_ARGS)
 {
 	PG_RETURN_UINT64(BackpressureThrottlingTime());
 }
+
+
+/* Column count of a copydata() row: relation name, fork, block number, page content */
+#define N_RAW_PAGE_COLUMNS 4
+/* Number of rows requested by each "fetch forward" issued on the copy cursor */
+#define COPY_FETCH_COUNT   16
+
+
+/*
+ * Re-raise a failure reported by the remote server as a local ereport().
+ *
+ * elevel: severity to report at.
+ * res:    failed result, or NULL when libpq returned no result at all.
+ * conn:   connection the command ran on; supplies the message when 'res'
+ *         carries no primary message.
+ * clear:  if true, 'res' is PQclear'ed before this function is left, whether
+ *         it returns or throws.
+ * sql:    remote command text added as error context, or NULL.
+ */
+static void
+report_error(int elevel, PGresult *res, PGconn *conn,
+			 bool clear, const char *sql)
+{
+	PG_TRY();
+	{
+		/* Used unless the result carries its own SQLSTATE. */
+		int			code = ERRCODE_CONNECTION_FAILURE;
+		char	   *remote_state = PQresultErrorField(res, PG_DIAG_SQLSTATE);
+		char	   *remote_msg = PQresultErrorField(res, PG_DIAG_MESSAGE_PRIMARY);
+		char	   *remote_detail = PQresultErrorField(res, PG_DIAG_MESSAGE_DETAIL);
+		char	   *remote_hint = PQresultErrorField(res, PG_DIAG_MESSAGE_HINT);
+		char	   *remote_context = PQresultErrorField(res, PG_DIAG_CONTEXT);
+
+		if (remote_state != NULL)
+			code = MAKE_SQLSTATE(remote_state[0],
+								 remote_state[1],
+								 remote_state[2],
+								 remote_state[3],
+								 remote_state[4]);
+
+		/*
+		 * Connection-level failures may leave us without a PGresult (PQexec
+		 * can simply return NULL), so fall back to the connection's own
+		 * error text when the result has no primary message.
+		 */
+		if (remote_msg == NULL)
+			remote_msg = pchomp(PQerrorMessage(conn));
+
+		ereport(elevel,
+				(errcode(code),
+				 (remote_msg == NULL || remote_msg[0] == '\0') ?
+				 errmsg("could not obtain message string for remote error") :
+				 errmsg_internal("%s", remote_msg),
+				 remote_detail != NULL ? errdetail_internal("%s", remote_detail) : 0,
+				 remote_hint != NULL ? errhint("%s", remote_hint) : 0,
+				 remote_context != NULL ? errcontext("%s", remote_context) : 0,
+				 sql != NULL ? errcontext("remote SQL command: %s", sql) : 0));
+	}
+	PG_FINALLY();
+	{
+		/* Runs on both the return and the throw path. */
+		if (clear)
+			PQclear(res);
+	}
+	PG_END_TRY();
+}
+
+/*
+ * Collect the results of a query in progress on 'conn' and return the last
+ * one.
+ *
+ * Before each PQgetResult() call, and for as long as libpq reports the
+ * connection as busy, the backend sleeps on its latch and the connection
+ * socket, checks for interrupts after every wakeup, and hands readable socket
+ * data to libpq.  Results are fetched until PQgetResult() returns NULL; every
+ * result except the last one is cleared.
+ *
+ * 'query' is only forwarded to report_error() as error context.  Returns NULL
+ * if PQgetResult() produced no result at all.  The returned PGresult is not
+ * cleared here.
+ */
+static PGresult *
+get_result(PGconn *conn, const char *query)
+{
+	/* volatile: assigned inside PG_TRY and read in PG_CATCH */
+	PGresult   *volatile last_res = NULL;
+
+	/* In what follows, do not leak any PGresults on an error. */
+	PG_TRY();
+	{
+		for (;;)
+		{
+			PGresult   *res;
+
+			while (PQisBusy(conn))
+			{
+				int			wc;
+
+				/* Sleep until there's something to do */
+				wc = WaitLatchOrSocket(MyLatch,
+									   WL_LATCH_SET | WL_SOCKET_READABLE |
+									   WL_EXIT_ON_PM_DEATH,
+									   PQsocket(conn),
+									   -1L, PG_WAIT_EXTENSION);
+				ResetLatch(MyLatch);
+
+				CHECK_FOR_INTERRUPTS();
+
+				/* Data available in socket? */
+				if (wc & WL_SOCKET_READABLE)
+				{
+					/* No PGresult exists for this failure, so report from the connection */
+					if (!PQconsumeInput(conn))
+						report_error(ERROR, NULL, conn, false, query);
+				}
+			}
+
+			res = PQgetResult(conn);
+			if (res == NULL)
+				break;			/* query is complete */
+
+			/* Keep only the most recent result */
+			PQclear(last_res);
+			last_res = res;
+		}
+	}
+	PG_CATCH();
+	{
+		/* Release the result held so far, then propagate the error */
+		PQclear(last_res);
+		PG_RE_THROW();
+	}
+	PG_END_TRY();
+
+	return last_res;
+}
+
+/*
+ * SQL text that (re)creates the PL/pgSQL function copydata(), which returns
+ * one row (relation name, fork name, block number, page content) per page.
+ *
+ * It loops over pg_class entries whose namespace is not pg_catalog, pg_toast
+ * or information_schema and, for each of the forks 'main', 'vm' and 'fsm',
+ * returns every block obtained through get_raw_page().  When a relation has a
+ * non-zero total size and a TOAST relation, the same is done for
+ * pg_toast.pg_toast_<oid> and pg_toast.pg_toast_<oid>_index, where <oid> is
+ * the oid of the owning relation.
+ */
+#define CREATE_COPYDATA_FUNC "\
+create or replace function copydata() returns setof record as $$ \
+declare \
+    relsize integer; \
+    total_relsize integer; \
+	content bytea; \
+	r record; \
+	fork text; \
+	relname text; \
+	pagesize integer; \
+begin \
+	pagesize = current_setting('block_size'); \
+	for r in select oid,reltoastrelid from pg_class where relnamespace not in (select oid from pg_namespace where nspname in ('pg_catalog','pg_toast','information_schema')) \
+	loop \
+		relname = r.oid::regclass::text; \
+        total_relsize = 0; \
+	    foreach fork in array array['main','vm','fsm'] \
+		loop \
+		    relsize = pg_relation_size(r.oid, fork)/pagesize; \
+			total_relsize = total_relsize + relsize; \
+	        for p in 1..relsize \
+		    loop \
+			    content = get_raw_page(relname, fork, p-1); \
+				return next row(relname,fork,p-1,content); \
+			end loop; \
+		end loop; \
+        if total_relsize <> 0 and r.reltoastrelid <> 0 then \
+            foreach relname in array array ['pg_toast.pg_toast_'||r.oid, 'pg_toast.pg_toast_'||r.oid||'_index'] \
+			loop \
+		    	foreach fork in array array['main','vm','fsm'] \
+				loop \
+			    	relsize = pg_relation_size(relname, fork)/pagesize; \
+	        		for p in 1..relsize \
+		    		loop \
+			    		content = get_raw_page(relname, fork, p-1); \
+						return next row(relname,fork,p-1,content); \
+					end loop; \
+				end loop; \
+			end loop; \
+        end if; \
+	end loop; \
+end; \
+$$ language plpgsql"
+
+/*
+ * Execute one utility command on the source connection and discard its
+ * result.  Anything other than PGRES_COMMAND_OK is raised as an ERROR by
+ * report_error(), which also releases the PGresult.
+ */
+static void
+copy_from_exec_command(PGconn *conn, const char *sql)
+{
+	PGresult   *res = PQexec(conn, sql);
+
+	if (res == NULL || PQresultStatus(res) != PGRES_COMMAND_OK)
+		report_error(ERROR, res, conn, true, sql);
+	PQclear(res);
+}
+
+/*
+ * copy_from(conninfo cstring) returns bigint
+ *
+ * Connects to the source database described by 'conninfo', installs the
+ * copydata() helper there, runs "vacuum freeze", and then streams every page
+ * returned by copydata() through a cursor, COPY_FETCH_COUNT rows at a time.
+ * Each page is written over (or appended to) the same-named local relation
+ * and WAL-logged as a full page image.  Source TOAST relation names are
+ * mapped to the TOAST relation of the most recently opened local non-TOAST
+ * relation.
+ *
+ * Returns the number of pages copied.  Raises ERROR if the connection cannot
+ * be established, a remote command fails, or a row does not have the expected
+ * shape.  The source connection and any pending PGresult are released on
+ * every exit path.
+ */
+Datum
+copy_from(PG_FUNCTION_ARGS)
+{
+	char const* conninfo = PG_GETARG_CSTRING(0);
+	PGconn* conn;
+	char const* declare_cursor = "declare copy_data_cursor no scroll cursor for select * from copydata() as raw_page(relid text, fork text, blkno integer, content bytea)";
+	char* fetch_cursor = psprintf("fetch forward %d copy_data_cursor", COPY_FETCH_COUNT);
+	char const* close_cursor = "close copy_data_cursor";
+	char const* vacuum_freeze = "vacuum freeze";
+	/* volatile: assigned inside PG_TRY and read in PG_FINALLY */
+	PGresult   *volatile res = NULL;
+	int64_t total = 0;
+
+	/* The function may be declared without STRICT; never hand NULL to libpq. */
+	if (PG_ARGISNULL(0))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("conninfo must not be NULL")));
+
+	/* Connect to the source database */
+	conn = PQconnectdb(conninfo);
+	if (!conn || PQstatus(conn) != CONNECTION_OK)
+	{
+		/* Copy the message first: PQfinish() frees the connection's buffer. */
+		char	   *conn_error = pchomp(PQerrorMessage(conn));
+
+		PQfinish(conn);
+		/* NOTE(review): conninfo may embed a password and is echoed here. */
+		ereport(ERROR,
+				(errcode(ERRCODE_SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION),
+				 errmsg("could not connect to server \"%s\"",
+						conninfo),
+				 errdetail_internal("%s", conn_error)));
+	}
+
+	PG_TRY();
+	{
+		Relation	rel = NULL;
+		BlockNumber rel_size = 0;
+		BlockNumber prev_blkno = InvalidBlockNumber;
+		Oid			relid = InvalidOid;
+
+		/* First create stored procedure (assumes that the pageinspect extension is already installed) */
+		copy_from_exec_command(conn, CREATE_COPYDATA_FUNC);
+
+		/* Freeze all tables to prevent problems with XID mapping */
+		copy_from_exec_command(conn, vacuum_freeze);
+
+		/* Start transaction to use cursor */
+		copy_from_exec_command(conn, "BEGIN");
+
+		/* Declare cursor (we have to use cursor to avoid materializing all database in memory) */
+		copy_from_exec_command(conn, declare_cursor);
+
+		/* Get database data */
+		for (;;)
+		{
+			int			n_tuples;
+
+			/* Let a cancel request interrupt the copy between batches. */
+			CHECK_FOR_INTERRUPTS();
+
+			/* Binary result format: blkno arrives as 4 network-order bytes. */
+			res = PQexecParams(conn, fetch_cursor, 0, NULL, NULL, NULL, NULL, 1);
+
+			/*
+			 * A NULL result is a failure too, not end of data.  The result
+			 * is released by the PG_FINALLY block below, hence clear=false.
+			 */
+			if (res == NULL || PQresultStatus(res) != PGRES_TUPLES_OK)
+				report_error(ERROR, res, conn, false, fetch_cursor);
+
+			n_tuples = PQntuples(res);
+			if (PQnfields(res) != N_RAW_PAGE_COLUMNS)
+				elog(ERROR, "unexpected result from copydata()");
+
+			for (int i = 0; i < n_tuples; i++)
+			{
+				char const* relname;
+				char const* content;
+				ForkNumber	forknum;
+				BlockNumber blkno;
+				Buffer		buf;
+
+				/*
+				 * Both fields are copied with fixed sizes below; reject rows
+				 * (including NULLs, whose length is 0) that do not match,
+				 * e.g. when the source uses a different block size.
+				 */
+				if (PQgetlength(res, i, 2) != (int) sizeof(BlockNumber) ||
+					PQgetlength(res, i, 3) != BLCKSZ)
+					elog(ERROR, "unexpected result from copydata()");
+
+				relname = PQgetvalue(res, i, 0);
+				forknum = forkname_to_number(PQgetvalue(res, i, 1));
+				memcpy(&blkno, PQgetvalue(res, i, 2), sizeof(BlockNumber));
+				blkno = pg_ntoh32(blkno);
+				content = PQgetvalue(res, i, 3);
+
+				/*
+				 * Block numbers restart at the beginning of every fork, so a
+				 * non-increasing block number marks a new fork; a new main
+				 * fork additionally marks a new relation.
+				 */
+				if (blkno <= prev_blkno)
+				{
+					if (forknum == MAIN_FORKNUM)
+					{
+						char* dst_rel_name = strncmp(relname, "pg_toast.", 9) == 0
+							/* Construct correct TOAST table name */
+							? psprintf("pg_toast.pg_toast_%u%s",
+									   relid,
+									   strcmp(relname + strlen(relname) - 5, "index") == 0 ? "_index" : "")
+							: (char*)relname;
+						RangeVar   *relrv;
+
+						if (rel)
+							relation_close(rel, AccessExclusiveLock);
+						relrv = makeRangeVarFromNameList(textToQualifiedNameList(cstring_to_text(dst_rel_name)));
+						rel = relation_openrv(relrv, AccessExclusiveLock);
+						if (dst_rel_name != relname)
+							pfree(dst_rel_name);
+						else
+							relid = RelationGetRelid(rel);
+					}
+					/* A stream that does not start with a main fork has no relation yet. */
+					if (rel == NULL)
+						elog(ERROR, "copydata() returned a \"%s\" fork page before any main fork page",
+							 PQgetvalue(res, i, 1));
+					rel_size = RelationGetNumberOfBlocksInFork(rel, forknum);
+				}
+
+				/* Overwrite an existing block, or extend the fork by one page. */
+				buf = ReadBufferExtended(rel, forknum, blkno < rel_size ? blkno : P_NEW, RBM_ZERO_AND_LOCK, NULL);
+
+				/* Buffer modification and its WAL record must form one critical section. */
+				START_CRIT_SECTION();
+				MarkBufferDirty(buf);
+				memcpy(BufferGetPage(buf), content, BLCKSZ);
+				log_newpage_buffer(buf, forknum == MAIN_FORKNUM);
+				END_CRIT_SECTION();
+
+				UnlockReleaseBuffer(buf);
+
+				total += 1;
+				prev_blkno = blkno;
+			}
+			PQclear(res);
+			res = NULL;
+			/* A short batch means the cursor is exhausted. */
+			if (n_tuples < COPY_FETCH_COUNT)
+				break;
+		}
+
+		copy_from_exec_command(conn, close_cursor);
+
+		if (rel)
+			relation_close(rel, AccessExclusiveLock);
+
+		/* Complete transaction */
+		copy_from_exec_command(conn, "END");
+	}
+	PG_FINALLY();
+	{
+		/* Runs on success and on error: never leak the result or connection. */
+		PQclear(res);
+		PQfinish(conn);
+	}
+	PG_END_TRY();
+
+	PG_RETURN_INT64(total);
+}
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -145,9 +145,9 @@ extern char *nm_to_string(NeonMessage * msg);

 typedef struct
 {
-	bool		(*send) (NeonRequest * request);
+	void		(*send) (NeonRequest * request);
 	NeonResponse *(*receive) (void);
-	bool		(*flush) (void);
+	void		(*flush) (void);
 }			page_server_api;

 extern void prefetch_on_ps_disconnect(void);
--- a/pgxn/neon/pagestore_smgr.c
+++ b/pgxn/neon/pagestore_smgr.c
@@ -489,8 +489,7 @@ prefetch_wait_for(uint64 ring_index)
 	if (MyPState->ring_flush <= ring_index &&
 		MyPState->ring_unused > MyPState->ring_flush)
 	{
-		if (!page_server->flush())
-			return false;
+		page_server->flush();
 		MyPState->ring_flush = MyPState->ring_unused;
 	}

@@ -667,7 +666,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 		 * smaller than the current WAL insert/redo pointer, which is already
 		 * larger than this prefetch_lsn. So in any case, that would
 		 * invalidate this cache.
-		 *
+		 * 
 		 * The best LSN to use for effective_request_lsn would be
 		 * XLogCtl->Insert.RedoRecPtr, but that's expensive to access.
 		 */
@@ -678,8 +677,7 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force

 	Assert(slot->response == NULL);
 	Assert(slot->my_ring_index == MyPState->ring_unused);
-
-	while (!page_server->send((NeonRequest *) &request));
+	page_server->send((NeonRequest *) &request);

 	/* update prefetch state */
 	MyPState->n_requests_inflight += 1;
@@ -689,7 +687,6 @@ prefetch_do_request(PrefetchRequest *slot, bool *force_latest, XLogRecPtr *force
 	/* update slot state */
 	slot->status = PRFS_REQUESTED;

-
 	prfh_insert(MyPState->prf_hash, slot, &found);
 	Assert(!found);
 }
@@ -746,7 +743,6 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 					prefetch_set_unused(ring_index);
 					entry = NULL;
 				}
-
 			}
 			/* if we don't want the latest version, only accept requests with the exact same LSN */
 			else
@@ -760,23 +756,20 @@ prefetch_register_buffer(BufferTag tag, bool *force_latest, XLogRecPtr *force_ls
 			}
 		}

-		if (entry != NULL)
+		/*
+		 * We received a prefetch for a page that was recently read and
+		 * removed from the buffers. Remove that request from the buffers.
+		 */
+		else if (slot->status == PRFS_TAG_REMAINS)
 		{
-			/*
-			 * We received a prefetch for a page that was recently read and
-			 * removed from the buffers. Remove that request from the buffers.
-			 */
-			if (slot->status == PRFS_TAG_REMAINS)
-			{
-				prefetch_set_unused(ring_index);
-				entry = NULL;
-			}
-			else
-			{
-				/* The buffered request is good enough, return that index */
-				pgBufferUsage.prefetch.duplicates++;
-				return ring_index;
-			}
+			prefetch_set_unused(ring_index);
+			entry = NULL;
+		}
+		else
+		{
+			/* The buffered request is good enough, return that index */
+			pgBufferUsage.prefetch.duplicates++;
+			return ring_index;
 		}
 	}

@@ -866,7 +859,8 @@ page_server_request(void const *req)
 {
 	NeonResponse* resp;
 	do {
-		while (!page_server->send((NeonRequest *) req) || !page_server->flush());
+		page_server->send((NeonRequest *) req);
+		page_server->flush();
 		MyPState->ring_flush = MyPState->ring_unused;
 		consume_prefetch_responses();
 		resp = page_server->receive();
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -191,12 +191,6 @@ impl Storage for FileStorage {
                control_partial_path.display()
            )
        })?;
-        control_partial.flush().await.with_context(|| {
-            format!(
-                "failed to flush safekeeper state into control file at: {}",
-                control_partial_path.display()
-            )
-        })?;

        // fsync the file
        if !self.conf.no_sync {
--- a/safekeeper/src/pull_timeline.rs
+++ b/safekeeper/src/pull_timeline.rs
@@ -188,7 +188,6 @@ async fn pull_timeline(status: TimelineStatus, host: String) -> Result<Response>
        let mut response = client.get(&http_url).send().await?;
        while let Some(chunk) = response.chunk().await? {
            file.write_all(&chunk).await?;
-            file.flush().await?;
        }
    }

--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -403,18 +403,16 @@ impl SafekeeperPostgresHandler {
        };

        // take the latest commit_lsn if don't have stop_pos
-        let end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());
+        let mut end_pos = stop_pos.unwrap_or(*commit_lsn_watch_rx.borrow());

        if end_pos < start_pos {
-            warn!(
-                "requested start_pos {} is ahead of available WAL end_pos {}",
-                start_pos, end_pos
-            );
+            warn!("start_pos {} is ahead of end_pos {}", start_pos, end_pos);
+            end_pos = start_pos;
        }

        info!(
-            "starting streaming from {:?} till {:?}, available WAL ends at {}",
-            start_pos, stop_pos, end_pos
+            "starting streaming from {:?} till {:?}",
+            start_pos, stop_pos
        );

        // switch to copy
@@ -549,14 +547,12 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
            self.end_pos = *self.commit_lsn_watch_rx.borrow();
            if self.end_pos > self.start_pos {
                // We have something to send.
-                trace!("got end_pos {:?}, streaming", self.end_pos);
                return Ok(());
            }

            // Wait for WAL to appear, now self.end_pos == self.start_pos.
            if let Some(lsn) = wait_for_lsn(&mut self.commit_lsn_watch_rx, self.start_pos).await? {
                self.end_pos = lsn;
-                trace!("got end_pos {:?}, streaming", self.end_pos);
                return Ok(());
            }

--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -248,10 +248,6 @@ impl PhysicalStorage {
        };

        file.write_all(buf).await?;
-        // Note: flush just ensures write above reaches the OS (this is not
-        // needed in case of sync IO as Write::write there calls directly write
-        // syscall, but needed in case of async). It does *not* fsyncs the file.
-        file.flush().await?;

        if xlogoff + buf.len() == self.wal_seg_size {
            // If we reached the end of a WAL segment, flush and close it.
@@ -720,7 +716,6 @@ async fn write_zeroes(file: &mut File, mut count: usize) -> Result<()> {
        count -= XLOG_BLCKSZ;
    }
    file.write_all(&ZERO_BLOCK[0..count]).await?;
-    file.flush().await?;
    Ok(())
 }

--- a/scripts/combine_control_files.py
+++ b/scripts/combine_control_files.py
@@ -1,20 +0,0 @@
-import json
-import os
-
-with open("SOMETHING", "w") as f:
-    f.write("SOMETHING")
-
-# enable custom extensions for specific tenants
-enabled_extensions = {"123454321": ["anon"], "public": ["embedding"]}
-
-control_data = {}
-os.chdir("control_files")
-for control_file in os.listdir("."):
-    ext_name = control_file.replace(".control", "")
-    with open(control_file, "r") as f:
-        control_data[ext_name] = f.read()
-
-all_data = {"enabled_extensions": enabled_extensions, "control_data": control_data}
-
-with open("ext_index.json", "w") as f:
-    json.dump(all_data, f)
--- a/vendor/postgres-v14
+++ b/vendor/postgres-v14
--- a/vendor/postgres-v15
+++ b/vendor/postgres-v15
Author	SHA1	Message	Date
Konstantin Knizhnik	68488f5c74	Do vacuum freeze before copying data	2023-07-05 22:17:06 +03:00
Konstantin Knizhnik	1389927d36	Use cursor to copy data	2023-07-05 21:49:55 +03:00
Konstantin Knizhnik	06357afe6d	Fix mapping of TOAST tables	2023-07-05 19:26:29 +03:00
Konstantin Knizhnik	6943dac164	Support cloning database without index rebuild	2023-07-05 15:47:13 +03:00