test branch

2026-05-22 15:41:15 +00:00 · 2023-08-09 16:03:18 -04:00
43 changed files with 1428 additions and 1996 deletions
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -814,7 +814,7 @@ jobs:
      run:
        shell: sh -eu {0}
    env:
-      VM_BUILDER_VERSION: v0.15.4
+      VM_BUILDER_VERSION: v0.15.0-alpha1

    steps:
      - name: Checkout
--- a/compute_tools/src/compute.rs
+++ b/compute_tools/src/compute.rs
@@ -1111,14 +1111,14 @@ LIMIT 100",
            .as_millis() as u64;
        info!("Prepare extensions took {prep_ext_time_delta}ms");

-        // Don't try to download libraries that are not in the index.
-        // Assume that they are already present locally.
-        libs_vec.retain(|lib| {
-            self.library_index
-                .get()
-                .expect("error accessing ext_remote_paths")
-                .contains_key(lib)
-        });
+        // // Don't try to download libraries that are not in the index.
+        // // Assume that they are already present locally.
+        // libs_vec.retain(|lib| {
+        //     self.library_index
+        //         .get()
+        //         .expect("error accessing ext_remote_paths")
+        //         .contains_key(lib)
+        // });

        info!("Downloading to shared preload libraries: {:?}", &libs_vec);

--- a/compute_tools/src/extension_server.rs
+++ b/compute_tools/src/extension_server.rs
@@ -156,7 +156,7 @@ pub async fn get_available_extensions(
    let ext_index_full = serde_json::from_slice::<Index>(&ext_idx_buffer)?;
    let mut enabled_extensions = ext_index_full.public_extensions;
    enabled_extensions.extend_from_slice(custom_extensions);
-    let mut library_index = ext_index_full.library_index;
+    let library_index = ext_index_full.library_index;
    let all_extension_data = ext_index_full.extension_data;
    info!("library_index: {:?}", library_index);

@@ -179,8 +179,6 @@ pub async fn get_available_extensions(
                file_create_tasks.push(tokio::fs::write(control_path, control_contents));
            } else {
                warn!("control file {:?} exists both locally and remotely. ignoring the remote version.", control_file);
-                // also delete this from library index
-                library_index.retain(|_, value| value != extension_name);
            }
        }
    }
--- a/compute_tools/src/spec.rs
+++ b/compute_tools/src/spec.rs
@@ -270,7 +270,7 @@ pub fn handle_roles(spec: &ComputeSpec, client: &mut Client) -> Result<()> {
            }
            RoleAction::Create => {
                let mut query: String = format!(
-                    "CREATE ROLE {} CREATEROLE CREATEDB BYPASSRLS IN ROLE neon_superuser",
+                    "CREATE ROLE {} CREATEROLE CREATEDB IN ROLE neon_superuser",
                    name.pg_quote()
                );
                info!("role create query: '{}'", &query);
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -24,20 +24,6 @@ pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool>
    Ok(dir.next_entry().await?.is_none())
 }

-pub async fn list_dir(path: impl AsRef<Path>) -> anyhow::Result<Vec<String>> {
-    let mut dir = tokio::fs::read_dir(&path)
-        .await
-        .context(format!("read_dir({})", path.as_ref().display()))?;
-
-    let mut content = vec![];
-    while let Some(next) = dir.next_entry().await? {
-        let file_name = next.file_name();
-        content.push(file_name.to_string_lossy().to_string());
-    }
-
-    Ok(content)
-}
-
 pub fn ignore_not_found(e: io::Error) -> io::Result<()> {
    if e.kind() == io::ErrorKind::NotFound {
        Ok(())
@@ -57,7 +43,7 @@ where
 mod test {
    use std::path::PathBuf;

-    use crate::fs_ext::{is_directory_empty, list_dir};
+    use crate::fs_ext::is_directory_empty;

    use super::ignore_absent_files;

@@ -123,25 +109,4 @@ mod test {

        assert!(!file_path.exists());
    }
-
-    #[tokio::test]
-    async fn list_dir_works() {
-        let dir = tempfile::tempdir().unwrap();
-        let dir_path = dir.path();
-
-        assert!(list_dir(dir_path).await.unwrap().is_empty());
-
-        let file_path: PathBuf = dir_path.join("testfile");
-        let _ = std::fs::File::create(&file_path).unwrap();
-
-        assert_eq!(&list_dir(dir_path).await.unwrap(), &["testfile"]);
-
-        let another_dir_path: PathBuf = dir_path.join("testdir");
-        std::fs::create_dir(another_dir_path).unwrap();
-
-        let expected = &["testdir", "testfile"];
-        let mut actual = list_dir(dir_path).await.unwrap();
-        actual.sort();
-        assert_eq!(actual, expected);
-    }
 }
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -373,7 +373,7 @@ fn start_pageserver(
    let order = pageserver::InitializationOrder {
        initial_tenant_load: Some(init_done_tx),
        initial_logical_size_can_start: init_done_rx.clone(),
-        initial_logical_size_attempt: Some(init_logical_size_done_tx),
+        initial_logical_size_attempt: init_logical_size_done_tx,
        background_jobs_can_start: background_jobs_barrier.clone(),
    };

--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -31,9 +31,7 @@ use utils::{
 use crate::disk_usage_eviction_task::DiskUsageEvictionTaskConfig;
 use crate::tenant::config::TenantConf;
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::{
-    TENANT_ATTACHING_MARKER_FILENAME, TENANT_DELETED_MARKER_FILE_NAME, TIMELINES_SEGMENT_NAME,
-};
+use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME};
 use crate::{
    IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX,
    TIMELINE_UNINIT_MARK_SUFFIX,
@@ -615,11 +613,6 @@ impl PageServerConf {
        )
    }

-    pub fn tenant_deleted_mark_file_path(&self, tenant_id: &TenantId) -> PathBuf {
-        self.tenant_path(tenant_id)
-            .join(TENANT_DELETED_MARKER_FILE_NAME)
-    }
-
    pub fn traces_path(&self) -> PathBuf {
        self.workdir.join("traces")
    }
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -93,47 +93,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-    delete:
-      description: |
-        Attempts to delete specified tenant. 500 and 409 errors should be retried until 404 is retrieved.
-        404 means that deletion successfully finished"
-      responses:
-        "400":
-          description: Error when no tenant id found in path
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"
-        "401":
-          description: Unauthorized Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/UnauthorizedError"
-        "403":
-          description: Forbidden Error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ForbiddenError"
-        "404":
-          description: Tenant not found
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/NotFoundError"
-        "409":
-          description: Deletion is already in progress, continue polling
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/ConflictError"
-        "500":
-          description: Generic operation error
-          content:
-            application/json:
-              schema:
-                $ref: "#/components/schemas/Error"

  /v1/tenant/{tenant_id}/timeline:
    parameters:
@@ -861,7 +820,6 @@ paths:
            application/json:
              schema:
                $ref: "#/components/schemas/Error"
-
  /v1/tenant/config:
    put:
      description: |
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -187,7 +187,7 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
                format!("Cannot delete timeline which has child timelines: {children:?}")
                    .into_boxed_str(),
            ),
-            a @ AlreadyInProgress(_) => ApiError::Conflict(a.to_string()),
+            a @ AlreadyInProgress => ApiError::Conflict(a.to_string()),
            Other(e) => ApiError::InternalServerError(e),
        }
    }
@@ -208,19 +208,6 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
    }
 }

-impl From<crate::tenant::delete::DeleteTenantError> for ApiError {
-    fn from(value: crate::tenant::delete::DeleteTenantError) -> Self {
-        use crate::tenant::delete::DeleteTenantError::*;
-        match value {
-            Get(g) => ApiError::from(g),
-            e @ AlreadyInProgress => ApiError::Conflict(e.to_string()),
-            Timeline(t) => ApiError::from(t),
-            Other(o) => ApiError::InternalServerError(o),
-            e @ InvalidState(_) => ApiError::PreconditionFailed(e.to_string().into_boxed_str()),
-        }
-    }
-}
-
 // Helper function to construct a TimelineInfo struct for a timeline
 async fn build_timeline_info(
    timeline: &Arc<Timeline>,
@@ -630,23 +617,6 @@ async fn tenant_status(
    json_response(StatusCode::OK, tenant_info)
 }

-async fn tenant_delete_handler(
-    request: Request<Body>,
-    _cancel: CancellationToken,
-) -> Result<Response<Body>, ApiError> {
-    // TODO openapi spec
-    let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
-    check_permission(&request, Some(tenant_id))?;
-
-    let state = get_state(&request);
-
-    mgr::delete_tenant(state.conf, state.remote_storage.clone(), tenant_id)
-        .instrument(info_span!("tenant_delete_handler", %tenant_id))
-        .await?;
-
-    json_response(StatusCode::ACCEPTED, ())
-}
-
 /// HTTP endpoint to query the current tenant_size of a tenant.
 ///
 /// This is not used by consumption metrics under [`crate::consumption_metrics`], but can be used
@@ -1375,9 +1345,6 @@ pub fn make_router(
        .get("/v1/tenant", |r| api_handler(r, tenant_list_handler))
        .post("/v1/tenant", |r| api_handler(r, tenant_create_handler))
        .get("/v1/tenant/:tenant_id", |r| api_handler(r, tenant_status))
-        .delete("/v1/tenant/:tenant_id", |r| {
-            api_handler(r, tenant_delete_handler)
-        })
        .get("/v1/tenant/:tenant_id/synthetic_size", |r| {
            api_handler(r, tenant_size_handler)
        })
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -190,7 +190,7 @@ pub struct InitializationOrder {

    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
    /// attempt. It is important to drop this once the attempt has completed.
-    pub initial_logical_size_attempt: Option<utils::completion::Completion>,
+    pub initial_logical_size_attempt: utils::completion::Completion,

    /// Barrier for when we can start any background jobs.
    ///
@@ -226,7 +226,6 @@ async fn timed<Fut: std::future::Future>(

            let ret = fut.await;

-            // this has a global allowed_errors
            tracing::warn!(
                task = name,
                elapsed_ms = started.elapsed().as_millis(),
--- a/pageserver/src/page_cache.rs
+++ b/pageserver/src/page_cache.rs
@@ -47,11 +47,13 @@ use std::{

 use anyhow::Context;
 use once_cell::sync::OnceCell;
+use tracing::error;
 use utils::{
    id::{TenantId, TimelineId},
    lsn::Lsn,
 };

+use crate::tenant::writeback_ephemeral_file;
 use crate::{metrics::PageCacheSizeMetrics, repository::Key};

 static PAGE_CACHE: OnceCell<PageCache> = OnceCell::new();
@@ -95,6 +97,10 @@ enum CacheKey {
        hash_key: MaterializedPageHashKey,
        lsn: Lsn,
    },
+    EphemeralPage {
+        file_id: u64,
+        blkno: u32,
+    },
    ImmutableFilePage {
        file_id: u64,
        blkno: u32,
@@ -122,6 +128,7 @@ struct Slot {
 struct SlotInner {
    key: Option<CacheKey>,
    buf: &'static mut [u8; PAGE_SZ],
+    dirty: bool,
 }

 impl Slot {
@@ -170,6 +177,8 @@ pub struct PageCache {
    /// can have a separate mapping map, next to this field.
    materialized_page_map: RwLock<HashMap<MaterializedPageHashKey, Vec<Version>>>,

+    ephemeral_page_map: RwLock<HashMap<(u64, u32), usize>>,
+
    immutable_page_map: RwLock<HashMap<(u64, u32), usize>>,

    /// The actual buffers with their metadata.
@@ -249,6 +258,14 @@ impl PageWriteGuard<'_> {
        );
        self.valid = true;
    }
+    pub fn mark_dirty(&mut self) {
+        // Only ephemeral pages can be dirty at the moment; any other (or missing) key panics.
+        assert!(matches!(
+            self.inner.key,
+            Some(CacheKey::EphemeralPage { .. })
+        ));
+        self.inner.dirty = true; // eviction attempts a writeback for slots flagged dirty
+    }
 }

 impl Drop for PageWriteGuard<'_> {
@@ -263,6 +280,7 @@ impl Drop for PageWriteGuard<'_> {
            let self_key = self.inner.key.as_ref().unwrap();
            PAGE_CACHE.get().unwrap().remove_mapping(self_key);
            self.inner.key = None;
+            self.inner.dirty = false;
        }
    }
 }
@@ -370,7 +388,41 @@ impl PageCache {
        Ok(())
    }

-    // Section 1.2: Public interface functions for working with immutable file pages.
+    // Section 1.2: Public interface functions for working with Ephemeral pages.
+
+    pub fn read_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
+        let mut cache_key = CacheKey::EphemeralPage { file_id, blkno };
+        // Delegate to `lock_for_read`, which takes the key by `&mut` (hence the `mut` binding).
+        self.lock_for_read(&mut cache_key)
+    }
+
+    pub fn write_ephemeral_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<WriteBufResult> {
+        let cache_key = CacheKey::EphemeralPage { file_id, blkno };
+        // Delegate to `lock_for_write`; unlike the read path it only borrows the key immutably.
+        self.lock_for_write(&cache_key)
+    }
+
+    /// Immediately drops every cached buffer of the given ephemeral file, without writeback.
+    pub fn drop_buffers_for_ephemeral(&self, drop_file_id: u64) {
+        for slot_idx in 0..self.slots.len() {
+            let slot = &self.slots[slot_idx];
+            // Hold the slot's write lock while its key and dirty flag are inspected and reset.
+            let mut inner = slot.inner.write().unwrap();
+            if let Some(key) = &inner.key {
+                match key {
+                    CacheKey::EphemeralPage { file_id, blkno: _ } if *file_id == drop_file_id => {
+                        // Unmap the page and free the slot; dirty contents are discarded.
+                        self.remove_mapping(key);
+                        inner.key = None;
+                        inner.dirty = false;
+                    }
+                    _ => {} // pages of other files or other kinds are left untouched
+                }
+            }
+        }
+    }
+
+    // Section 1.3: Public interface functions for working with immutable file pages.

    pub fn read_immutable_buf(&self, file_id: u64, blkno: u32) -> anyhow::Result<ReadBufResult> {
        let mut cache_key = CacheKey::ImmutableFilePage { file_id, blkno };
@@ -392,6 +444,7 @@ impl PageCache {
                        // remove mapping for old buffer
                        self.remove_mapping(key);
                        inner.key = None;
+                        inner.dirty = false;
                    }
                    _ => {}
                }
@@ -469,6 +522,10 @@ impl PageCache {
            CacheKey::MaterializedPage { .. } => {
                unreachable!("Materialized pages use lookup_materialized_page")
            }
+            CacheKey::EphemeralPage { .. } => (
+                &crate::metrics::PAGE_CACHE.read_accesses_ephemeral,
+                &crate::metrics::PAGE_CACHE.read_hits_ephemeral,
+            ),
            CacheKey::ImmutableFilePage { .. } => (
                &crate::metrics::PAGE_CACHE.read_accesses_immutable,
                &crate::metrics::PAGE_CACHE.read_hits_immutable,
@@ -509,6 +566,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
+            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(ReadBufResult::NotFound(PageWriteGuard {
@@ -570,6 +628,7 @@ impl PageCache {
            // Make the slot ready
            let slot = &self.slots[slot_idx];
            inner.key = Some(cache_key.clone());
+            inner.dirty = false;
            slot.usage_count.store(1, Ordering::Relaxed);

            return Ok(WriteBufResult::NotFound(PageWriteGuard {
@@ -608,6 +667,10 @@ impl PageCache {
                *lsn = version.lsn;
                Some(version.slot_idx)
            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let map = self.ephemeral_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -631,6 +694,10 @@ impl PageCache {
                    None
                }
            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let map = self.ephemeral_page_map.read().unwrap();
+                Some(*map.get(&(*file_id, *blkno))?)
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let map = self.immutable_page_map.read().unwrap();
                Some(*map.get(&(*file_id, *blkno))?)
@@ -664,6 +731,12 @@ impl PageCache {
                    panic!("could not find old key in mapping")
                }
            }
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let mut map = self.ephemeral_page_map.write().unwrap();
+                map.remove(&(*file_id, *blkno))
+                    .expect("could not find old key in mapping");
+                self.size_metrics.current_bytes_ephemeral.sub_page_sz(1);
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                map.remove(&(*file_id, *blkno))
@@ -703,7 +776,17 @@ impl PageCache {
                    }
                }
            }
-
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                let mut map = self.ephemeral_page_map.write().unwrap();
+                match map.entry((*file_id, *blkno)) {
+                    Entry::Occupied(entry) => Some(*entry.get()),
+                    Entry::Vacant(entry) => {
+                        entry.insert(slot_idx);
+                        self.size_metrics.current_bytes_ephemeral.add_page_sz(1);
+                        None
+                    }
+                }
+            }
            CacheKey::ImmutableFilePage { file_id, blkno } => {
                let mut map = self.immutable_page_map.write().unwrap();
                match map.entry((*file_id, *blkno)) {
@@ -754,8 +837,25 @@ impl PageCache {
                    }
                };
                if let Some(old_key) = &inner.key {
+                    if inner.dirty {
+                        if let Err(err) = Self::writeback(old_key, inner.buf) {
+                            // Writing the page to disk failed.
+                            //
+                            // FIXME: What should we do here? We could propagate the error to
+                            // the caller, but the victim buffer is generally unrelated to the
+                            // original call. It can even belong to a different tenant. For now,
+                            // we log the error and continue the clock sweep to find a different
+                            // victim. But if the problem persists, the page cache could fill up
+                            // with dirty pages that we cannot evict, and we will loop retrying
+                            // the writebacks indefinitely.
+                            error!("writeback of buffer {:?} failed: {}", old_key, err);
+                            continue;
+                        }
+                    }
+
                    // remove mapping for old buffer
                    self.remove_mapping(old_key);
+                    inner.dirty = false;
                    inner.key = None;
                }
                return Ok((slot_idx, inner));
@@ -763,6 +863,28 @@ impl PageCache {
        }
    }

+    fn writeback(cache_key: &CacheKey, buf: &[u8]) -> Result<(), std::io::Error> {
+        match cache_key { // only ephemeral pages have a writeback path
+            CacheKey::MaterializedPage {
+                hash_key: _,
+                lsn: _,
+            } => Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "unexpected dirty materialized page",
+            )), // materialized pages should never be marked dirty, so report an error
+            CacheKey::EphemeralPage { file_id, blkno } => {
+                writeback_ephemeral_file(*file_id, *blkno, buf) // propagate its io::Result
+            }
+            CacheKey::ImmutableFilePage {
+                file_id: _,
+                blkno: _,
+            } => Err(std::io::Error::new(
+                std::io::ErrorKind::Other,
+                "unexpected dirty immutable page",
+            )), // same for immutable file pages
+        }
+    }
+
    /// Initialize a new page cache
    ///
    /// This should be called only once at page server startup.
@@ -773,6 +895,7 @@ impl PageCache {

        let size_metrics = &crate::metrics::PAGE_CACHE_SIZE;
        size_metrics.max_bytes.set_page_sz(num_pages);
+        size_metrics.current_bytes_ephemeral.set_page_sz(0);
        size_metrics.current_bytes_immutable.set_page_sz(0);
        size_metrics.current_bytes_materialized_page.set_page_sz(0);

@@ -782,7 +905,11 @@ impl PageCache {
                let buf: &mut [u8; PAGE_SZ] = chunk.try_into().unwrap();

                Slot {
-                    inner: RwLock::new(SlotInner { key: None, buf }),
+                    inner: RwLock::new(SlotInner {
+                        key: None,
+                        buf,
+                        dirty: false,
+                    }),
                    usage_count: AtomicU8::new(0),
                }
            })
@@ -790,6 +917,7 @@ impl PageCache {

        Self {
            materialized_page_map: Default::default(),
+            ephemeral_page_map: Default::default(),
            immutable_page_map: Default::default(),
            slots,
            next_evict_slot: AtomicUsize::new(0),
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -28,7 +28,6 @@ use std::cmp::min;
 use std::collections::hash_map::Entry;
 use std::collections::BTreeSet;
 use std::collections::HashMap;
-use std::fmt::Debug;
 use std::fs;
 use std::fs::File;
 use std::fs::OpenOptions;
@@ -47,10 +46,8 @@ use std::sync::{Mutex, RwLock};
 use std::time::{Duration, Instant};

 use self::config::TenantConf;
-use self::delete::DeleteTenantFlow;
 use self::metadata::LoadMetadataError;
 use self::metadata::TimelineMetadata;
-use self::mgr::TenantsMap;
 use self::remote_timeline_client::RemoteTimelineClient;
 use self::timeline::uninit::TimelineUninitMark;
 use self::timeline::uninit::UninitializedTimeline;
@@ -109,7 +106,6 @@ macro_rules! pausable_failpoint {

 pub mod blob_io;
 pub mod block_io;
-
 pub mod disk_btree;
 pub(crate) mod ephemeral_file;
 pub mod layer_map;
@@ -122,7 +118,6 @@ mod remote_timeline_client;
 pub mod storage_layer;

 pub mod config;
-pub mod delete;
 pub mod mgr;
 pub mod tasks;
 pub mod upload_queue;
@@ -136,6 +131,9 @@ pub use timeline::{
    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
 };

+// re-export this function so that page_cache.rs can use it.
+pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
+
 // re-export for use in remote_timeline_client.rs
 pub use crate::tenant::metadata::save_metadata;

@@ -147,8 +145,6 @@ pub const TIMELINES_SEGMENT_NAME: &str = "timelines";

 pub const TENANT_ATTACHING_MARKER_FILENAME: &str = "attaching";

-pub const TENANT_DELETED_MARKER_FILE_NAME: &str = "deleted";
-
 ///
 /// Tenant consists of multiple timelines. Keep them in a hash table.
 ///
@@ -187,8 +183,6 @@ pub struct Tenant {
    cached_synthetic_tenant_size: Arc<AtomicU64>,

    eviction_task_tenant_state: tokio::sync::Mutex<EvictionTaskTenantState>,
-
-    pub(crate) delete_progress: Arc<tokio::sync::Mutex<DeleteTenantFlow>>,
 }

 // We should not blindly overwrite local metadata with remote one.
@@ -280,7 +274,7 @@ pub enum LoadLocalTimelineError {
    ResumeDeletion(#[source] anyhow::Error),
 }

-#[derive(thiserror::Error)]
+#[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("NotFound")]
    NotFound,
@@ -289,37 +283,17 @@ pub enum DeleteTimelineError {
    HasChildren(Vec<TimelineId>),

    #[error("Timeline deletion is already in progress")]
-    AlreadyInProgress(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>),
+    AlreadyInProgress,

    #[error(transparent)]
    Other(#[from] anyhow::Error),
 }

-impl Debug for DeleteTimelineError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Self::NotFound => write!(f, "NotFound"),
-            Self::HasChildren(c) => f.debug_tuple("HasChildren").field(c).finish(),
-            Self::AlreadyInProgress(_) => f.debug_tuple("AlreadyInProgress").finish(),
-            Self::Other(e) => f.debug_tuple("Other").field(e).finish(),
-        }
-    }
-}
-
 pub enum SetStoppingError {
    AlreadyStopping(completion::Barrier),
    Broken,
 }

-impl Debug for SetStoppingError {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        match self {
-            Self::AlreadyStopping(_) => f.debug_tuple("AlreadyStopping").finish(),
-            Self::Broken => write!(f, "Broken"),
-        }
-    }
-}
-
 struct RemoteStartupData {
    index_part: IndexPart,
    remote_metadata: TimelineMetadata,
@@ -642,7 +616,7 @@ impl Tenant {
        // For every timeline, download the metadata file, scan the local directory,
        // and build a layer map that contains an entry for each remote and local
        // layer file.
-        let sorted_timelines = tree_sort_timelines(timeline_ancestors, |m| m.ancestor_timeline())?;
+        let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
        for (timeline_id, remote_metadata) in sorted_timelines {
            let (index_part, remote_client) = remote_index_and_client
                .remove(&timeline_id)
@@ -766,13 +740,12 @@ impl Tenant {
    /// If the loading fails for some reason, the Tenant will go into Broken
    /// state.
    #[instrument(skip_all, fields(tenant_id=%tenant_id))]
-    pub(crate) fn spawn_load(
+    pub fn spawn_load(
        conf: &'static PageServerConf,
        tenant_id: TenantId,
        broker_client: storage_broker::BrokerClientChannel,
        remote_storage: Option<GenericRemoteStorage>,
        init_order: Option<InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
        ctx: &RequestContext,
    ) -> Arc<Tenant> {
        span::debug_assert_current_span_has_tenant_id();
@@ -792,7 +765,7 @@ impl Tenant {
            tenant_conf,
            wal_redo_manager,
            tenant_id,
-            remote_storage.clone(),
+            remote_storage,
        );
        let tenant = Arc::new(tenant);

@@ -808,83 +781,27 @@ impl Tenant {
            "initial tenant load",
            false,
            async move {
-                let make_broken = |t: &Tenant, err: anyhow::Error| {
-                    error!("load failed, setting tenant state to Broken: {err:?}");
-                    t.state.send_modify(|state| {
-                        assert!(
-                            matches!(*state, TenantState::Loading | TenantState::Stopping { .. }),
-                            "the loading task owns the tenant state until activation is complete"
-                        );
-                        *state = TenantState::broken_from_reason(err.to_string());
-                    });
-                };
-
                let mut init_order = init_order;

                // take the completion because initial tenant loading will complete when all of
                // these tasks complete.
-                let _completion = init_order
-                    .as_mut()
-                    .and_then(|x| x.initial_tenant_load.take());
-
-                // Dont block pageserver startup on figuring out deletion status
-                let pending_deletion = {
-                    match DeleteTenantFlow::should_resume_deletion(
-                        conf,
-                        remote_storage.as_ref(),
-                        &tenant_clone,
-                    )
-                    .await
-                    {
-                        Ok(should_resume_deletion) => should_resume_deletion,
-                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
-                            return Ok(());
-                        }
-                    }
-                };
-
-                info!("pending deletion {}", pending_deletion.is_some());
-
-                if let Some(deletion) = pending_deletion {
-                    // as we are no longer loading, signal completion by dropping
-                    // the completion while we resume deletion
-                    drop(_completion);
-                    // do not hold to initial_logical_size_attempt as it will prevent loading from proceeding without timeout
-                    let _ = init_order
-                        .as_mut()
-                        .and_then(|x| x.initial_logical_size_attempt.take());
-
-                    match DeleteTenantFlow::resume(
-                        deletion,
-                        &tenant_clone,
-                        init_order.as_ref(),
-                        tenants,
-                        &ctx,
-                    )
-                    .await
-                    {
-                        Err(err) => {
-                            make_broken(&tenant_clone, anyhow::anyhow!(err));
-                            return Ok(());
-                        }
-                        Ok(()) => return Ok(()),
-                    }
-                }
-
-                let background_jobs_can_start =
-                    init_order.as_ref().map(|x| &x.background_jobs_can_start);
+                let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take());

                match tenant_clone.load(init_order.as_ref(), &ctx).await {
                    Ok(()) => {
-                        debug!("load finished",);
-
+                        debug!("load finished, activating");
+                        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
                        tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
                    }
-                    Err(err) => make_broken(&tenant_clone, err),
+                    Err(err) => {
+                        error!("load failed, setting tenant state to Broken: {err:?}");
+                        tenant_clone.state.send_modify(|state| {
+                            assert_eq!(*state, TenantState::Loading, "the loading task owns the tenant state until activation is complete");
+                            *state = TenantState::broken_from_reason(err.to_string());
+                        });
+                    }
                }
-
-                Ok(())
+               Ok(())
            }
            .instrument({
                let span = tracing::info_span!(parent: None, "load", tenant_id=%tenant_id);
@@ -960,8 +877,6 @@ impl Tenant {
                        )
                    })?;

-                info!("Found deletion mark for timeline {}", timeline_id);
-
                match load_metadata(self.conf, &self.tenant_id, &timeline_id) {
                    Ok(metadata) => {
                        timelines_to_resume_deletion.push((timeline_id, Some(metadata)))
@@ -1051,11 +966,9 @@ impl Tenant {

        // Sort the array of timeline IDs into tree-order, so that parent comes before
        // all its children.
-        tree_sort_timelines(timelines_to_load, |m| m.ancestor_timeline()).map(|sorted_timelines| {
-            TenantDirectoryScan {
-                sorted_timelines_to_load: sorted_timelines,
-                timelines_to_resume_deletion,
-            }
+        tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan {
+            sorted_timelines_to_load: sorted_timelines,
+            timelines_to_resume_deletion,
        })
    }

@@ -1769,7 +1682,7 @@ impl Tenant {
        // It's mesed up.
        // we just ignore the failure to stop

-        match self.set_stopping(shutdown_progress, false).await {
+        match self.set_stopping(shutdown_progress).await {
            Ok(()) => {}
            Err(SetStoppingError::Broken) => {
                // assume that this is acceptable
@@ -1809,25 +1722,18 @@ impl Tenant {
    /// This function waits for the tenant to become active if it isn't already, before transitioning it into Stopping state.
    ///
    /// This function is not cancel-safe!
-    ///
-    /// `allow_transition_from_loading` is needed for the special case of loading task deleting the tenant.
-    async fn set_stopping(
-        &self,
-        progress: completion::Barrier,
-        allow_transition_from_loading: bool,
-    ) -> Result<(), SetStoppingError> {
+    async fn set_stopping(&self, progress: completion::Barrier) -> Result<(), SetStoppingError> {
        let mut rx = self.state.subscribe();

        // cannot stop before we're done activating, so wait out until we're done activating
        rx.wait_for(|state| match state {
-            TenantState::Activating(_) | TenantState::Attaching => {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
                info!(
                    "waiting for {} to turn Active|Broken|Stopping",
                    <&'static str>::from(state)
                );
                false
            }
-            TenantState::Loading => allow_transition_from_loading,
            TenantState::Active | TenantState::Broken { .. } | TenantState::Stopping { .. } => true,
        })
        .await
@@ -1836,16 +1742,9 @@ impl Tenant {
        // we now know we're done activating, let's see whether this task is the winner to transition into Stopping
        let mut err = None;
        let stopping = self.state.send_if_modified(|current_state| match current_state {
-            TenantState::Activating(_) | TenantState::Attaching => {
+            TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
                unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
            }
-            TenantState::Loading => {
-                if !allow_transition_from_loading {
-                    unreachable!("we ensured above that we're done with activation, and, there is no re-activation")
-                };
-                *current_state = TenantState::Stopping { progress };
-                true
-            }
            TenantState::Active => {
                // FIXME: due to time-of-check vs time-of-use issues, it can happen that new timelines
                // are created after the transition to Stopping. That's harmless, as the Timelines
@@ -1914,10 +1813,6 @@ impl Tenant {
        .expect("cannot drop self.state while on a &self method");

        // we now know we're done activating, let's see whether this task is the winner to transition into Broken
-        self.set_broken_no_wait(reason)
-    }
-
-    pub(crate) fn set_broken_no_wait(&self, reason: String) {
        self.state.send_modify(|current_state| {
            match *current_state {
                TenantState::Activating(_) | TenantState::Loading | TenantState::Attaching => {
@@ -1983,28 +1878,22 @@ impl Tenant {
 /// Given a Vec of timelines and their ancestors (timeline_id, ancestor_id),
 /// perform a topological sort, so that the parent of each timeline comes
 /// before the children.
-/// E extracts the ancestor from T
-/// This allows for T to be different. It can be TimelineMetadata, can be Timeline itself, etc.
-fn tree_sort_timelines<T, E>(
-    timelines: HashMap<TimelineId, T>,
-    extractor: E,
-) -> anyhow::Result<Vec<(TimelineId, T)>>
-where
-    E: Fn(&T) -> Option<TimelineId>,
-{
+fn tree_sort_timelines(
+    timelines: HashMap<TimelineId, TimelineMetadata>,
+) -> anyhow::Result<Vec<(TimelineId, TimelineMetadata)>> {
    let mut result = Vec::with_capacity(timelines.len());

    let mut now = Vec::with_capacity(timelines.len());
    // (ancestor, children)
-    let mut later: HashMap<TimelineId, Vec<(TimelineId, T)>> =
+    let mut later: HashMap<TimelineId, Vec<(TimelineId, TimelineMetadata)>> =
        HashMap::with_capacity(timelines.len());

-    for (timeline_id, value) in timelines {
-        if let Some(ancestor_id) = extractor(&value) {
+    for (timeline_id, metadata) in timelines {
+        if let Some(ancestor_id) = metadata.ancestor_timeline() {
            let children = later.entry(ancestor_id).or_default();
-            children.push((timeline_id, value));
+            children.push((timeline_id, metadata));
        } else {
-            now.push((timeline_id, value));
+            now.push((timeline_id, metadata));
        }
    }

@@ -2173,7 +2062,7 @@ impl Tenant {
            remote_client,
            pg_version,
            initial_logical_size_can_start.cloned(),
-            initial_logical_size_attempt.cloned().flatten(),
+            initial_logical_size_attempt.cloned(),
            state,
        );

@@ -2257,7 +2146,6 @@ impl Tenant {
            cached_logical_sizes: tokio::sync::Mutex::new(HashMap::new()),
            cached_synthetic_tenant_size: Arc::new(AtomicU64::new(0)),
            eviction_task_tenant_state: tokio::sync::Mutex::new(EvictionTaskTenantState::default()),
-            delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTenantFlow::default())),
        }
    }

@@ -2274,7 +2162,6 @@ impl Tenant {
        // FIXME If the config file is not found, assume that we're attaching
        // a detached tenant and config is passed via attach command.
        // https://github.com/neondatabase/neon/issues/1555
-        // OR: we're loading after incomplete deletion that managed to remove config.
        if !target_config_path.exists() {
            info!("tenant config not found in {target_config_display}");
            return Ok(TenantConfOpt::default());
--- a/pageserver/src/tenant/delete.rs
+++ b/pageserver/src/tenant/delete.rs
@@ -1,546 +0,0 @@
-use std::{
-    path::{Path, PathBuf},
-    sync::Arc,
-};
-
-use anyhow::Context;
-use pageserver_api::models::TenantState;
-use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
-use tokio::sync::OwnedMutexGuard;
-use tracing::{error, info, instrument, warn, Instrument, Span};
-
-use utils::{
-    completion, crashsafe, fs_ext,
-    id::{TenantId, TimelineId},
-};
-
-use crate::{
-    config::PageServerConf,
-    context::RequestContext,
-    task_mgr::{self, TaskKind},
-    InitializationOrder,
-};
-
-use super::{
-    mgr::{GetTenantError, TenantsMap},
-    span,
-    timeline::delete::DeleteTimelineFlow,
-    tree_sort_timelines, DeleteTimelineError, Tenant,
-};
-
-const SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS: u8 = 3;
-
-#[derive(Debug, thiserror::Error)]
-pub enum DeleteTenantError {
-    #[error("GetTenant {0}")]
-    Get(#[from] GetTenantError),
-
-    #[error("Invalid state {0}. Expected Active or Broken")]
-    InvalidState(TenantState),
-
-    #[error("Tenant deletion is already in progress")]
-    AlreadyInProgress,
-
-    #[error("Timeline {0}")]
-    Timeline(#[from] DeleteTimelineError),
-
-    #[error(transparent)]
-    Other(#[from] anyhow::Error),
-}
-
-type DeletionGuard = tokio::sync::OwnedMutexGuard<DeleteTenantFlow>;
-
-fn remote_tenant_delete_mark_path(
-    conf: &PageServerConf,
-    tenant_id: &TenantId,
-) -> anyhow::Result<RemotePath> {
-    let tenant_remote_path = conf
-        .tenant_path(tenant_id)
-        .strip_prefix(&conf.workdir)
-        .context("Failed to strip workdir prefix")
-        .and_then(RemotePath::new)
-        .context("tenant path")?;
-    Ok(tenant_remote_path.join(Path::new("deleted")))
-}
-
-async fn create_remote_delete_mark(
-    conf: &PageServerConf,
-    remote_storage: &GenericRemoteStorage,
-    tenant_id: &TenantId,
-) -> Result<(), DeleteTenantError> {
-    let remote_mark_path = remote_tenant_delete_mark_path(conf, tenant_id)?;
-
-    let data: &[u8] = &[];
-    remote_storage
-        .upload(data, 0, &remote_mark_path, None)
-        .await
-        .context("mark upload")?;
-
-    Ok(())
-}
-
-async fn create_local_delete_mark(
-    conf: &PageServerConf,
-    tenant_id: &TenantId,
-) -> Result<(), DeleteTenantError> {
-    let marker_path = conf.tenant_deleted_mark_file_path(tenant_id);
-
-    // Note: we're ok to replace existing file.
-    let _ = std::fs::OpenOptions::new()
-        .write(true)
-        .create(true)
-        .open(&marker_path)
-        .with_context(|| format!("could not create delete marker file {marker_path:?}"))?;
-
-    crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?;
-
-    Ok(())
-}
-
-async fn schedule_ordered_timeline_deletions(
-    tenant: &Arc<Tenant>,
-) -> Result<Vec<(Arc<tokio::sync::Mutex<DeleteTimelineFlow>>, TimelineId)>, DeleteTenantError> {
-    // Tenant is stopping at this point. We know it will be deleted.
-    // No new timelines should be created.
-    // Tree sort timelines to delete from leafs to the root.
-    // NOTE: by calling clone we release the mutex which creates a possibility for a race: pending deletion
-    // can complete and remove timeline from the map in between our call to clone
-    // and `DeleteTimelineFlow::run`, so `run` wont find timeline in `timelines` map.
-    // timelines.lock is currently synchronous so we cant hold it across await point.
-    // So just ignore NotFound error if we get it from `run`.
-    // Beware: in case it becomes async and we try to hold it here, `run` also locks it, which can create a deadlock.
-    let timelines = tenant.timelines.lock().unwrap().clone();
-    let sorted =
-        tree_sort_timelines(timelines, |t| t.get_ancestor_timeline_id()).context("tree sort")?;
-
-    let mut already_running_deletions = vec![];
-
-    for (timeline_id, _) in sorted.into_iter().rev() {
-        if let Err(e) = DeleteTimelineFlow::run(tenant, timeline_id, true).await {
-            match e {
-                DeleteTimelineError::NotFound => {
-                    // Timeline deletion finished after call to clone above but before call
-                    // to `DeleteTimelineFlow::run` and removed timeline from the map.
-                    continue;
-                }
-                DeleteTimelineError::AlreadyInProgress(guard) => {
-                    already_running_deletions.push((guard, timeline_id));
-                    continue;
-                }
-                e => return Err(DeleteTenantError::Timeline(e)),
-            }
-        }
-    }
-
-    Ok(already_running_deletions)
-}
-
-async fn ensure_timelines_dir_empty(timelines_path: &Path) -> Result<(), DeleteTenantError> {
-    // Assert timelines dir is empty.
-    if !fs_ext::is_directory_empty(timelines_path).await? {
-        // Display first 10 items in directory
-        let list = &fs_ext::list_dir(timelines_path).await.context("list_dir")?[..10];
-        return Err(DeleteTenantError::Other(anyhow::anyhow!(
-            "Timelines directory is not empty after all timelines deletion: {list:?}"
-        )));
-    }
-
-    Ok(())
-}
-
-async fn remove_tenant_remote_delete_mark(
-    conf: &PageServerConf,
-    remote_storage: Option<&GenericRemoteStorage>,
-    tenant_id: &TenantId,
-) -> Result<(), DeleteTenantError> {
-    if let Some(remote_storage) = remote_storage {
-        remote_storage
-            .delete(&remote_tenant_delete_mark_path(conf, tenant_id)?)
-            .await?;
-    }
-    Ok(())
-}
-
-// Cleanup fs traces: tenant config, timelines dir local delete mark, tenant dir
-async fn cleanup_remaining_fs_traces(
-    conf: &PageServerConf,
-    tenant_id: &TenantId,
-) -> Result<(), DeleteTenantError> {
-    let rm = |p: PathBuf, is_dir: bool| async move {
-        if is_dir {
-            tokio::fs::remove_dir(&p).await
-        } else {
-            tokio::fs::remove_file(&p).await
-        }
-        .or_else(fs_ext::ignore_not_found)
-        .with_context(|| {
-            let to_display = p.display();
-            format!("failed to delete {to_display}")
-        })
-    };
-
-    rm(conf.tenant_config_path(tenant_id), false).await?;
-
-    fail::fail_point!("tenant-delete-before-remove-timelines-dir", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-timelines-dir"
-        ))?
-    });
-
-    rm(conf.timelines_path(tenant_id), true).await?;
-
-    fail::fail_point!("tenant-delete-before-remove-deleted-mark", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-deleted-mark"
-        ))?
-    });
-
-    rm(conf.tenant_deleted_mark_file_path(tenant_id), false).await?;
-
-    fail::fail_point!("tenant-delete-before-remove-tenant-dir", |_| {
-        Err(anyhow::anyhow!(
-            "failpoint: tenant-delete-before-remove-tenant-dir"
-        ))?
-    });
-
-    rm(conf.tenant_path(tenant_id), true).await?;
-
-    Ok(())
-}
-
-/// Orchestrates tenant shut down of all tasks, removes its in-memory structures,
-/// and deletes its data from both disk and s3.
-/// The sequence of steps:
-/// 1. Upload remote deletion mark.
-/// 2. Create local mark file.
-/// 3. Shutdown tasks
-/// 4. Run ordered timeline deletions
-/// 5. Wait for timeline deletion operations that were scheduled before tenant deletion was requested
-/// 6. Remove remote mark
-/// 7. Cleanup remaining fs traces, tenant dir, config, timelines dir, local delete mark
-/// It is resumable from any step in case a crash/restart occurs.
-/// There are three entrypoints to the process:
-/// 1. [`DeleteTenantFlow::run`] this is the main one called by a management api handler.
-/// 2. [`DeleteTenantFlow::resume`] is called during restarts when local or remote deletion marks are still there.
-/// Note the only other place that messes around timeline delete mark is the `Tenant::spawn_load` function.
-#[derive(Default)]
-pub enum DeleteTenantFlow {
-    #[default]
-    NotStarted,
-    InProgress,
-    Finished,
-}
-
-impl DeleteTenantFlow {
-    // These steps are run in the context of management api request handler.
-    // Long running steps are continued to run in the background.
-    // NB: If this fails half-way through, and is retried, the retry will go through
-    // all the same steps again. Make sure the code here is idempotent, and don't
-    // error out if some of the shutdown tasks have already been completed!
-    // NOTE: static needed for background part.
-    // We assume that calling code sets up the span with tenant_id.
-    #[instrument(skip_all)]
-    pub(crate) async fn run(
-        conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        tenant_id: TenantId,
-    ) -> Result<(), DeleteTenantError> {
-        span::debug_assert_current_span_has_tenant_id();
-
-        let (tenant, mut guard) = Self::prepare(tenants, tenant_id).await?;
-
-        if let Err(e) = Self::run_inner(&mut guard, conf, remote_storage.as_ref(), &tenant).await {
-            tenant.set_broken(format!("{e:#}")).await;
-            return Err(e);
-        }
-
-        Self::schedule_background(guard, conf, remote_storage, tenants, tenant);
-
-        Ok(())
-    }
-
-    // Helper function needed to be able to match once on returned error and transition tenant into broken state.
-    // This is needed because tenant.shutwodn is not idempotent. If tenant state is set to stopping another call to tenant.shutdown
-    // will result in an error, but here we need to be able to retry shutdown when tenant deletion is retried.
-    // So the solution is to set tenant state to broken.
-    async fn run_inner(
-        guard: &mut OwnedMutexGuard<Self>,
-        conf: &'static PageServerConf,
-        remote_storage: Option<&GenericRemoteStorage>,
-        tenant: &Tenant,
-    ) -> Result<(), DeleteTenantError> {
-        guard.mark_in_progress()?;
-
-        fail::fail_point!("tenant-delete-before-create-remote-mark", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-create-remote-mark"
-            ))?
-        });
-
-        // IDEA: implement detach as delete without remote storage. Then they would use the same lock (deletion_progress) so wont contend.
-        // Though sounds scary, different mark name?
-        // Detach currently uses remove_dir_all so in case of a crash we can end up in a weird state.
-        if let Some(remote_storage) = &remote_storage {
-            create_remote_delete_mark(conf, remote_storage, &tenant.tenant_id)
-                .await
-                .context("remote_mark")?
-        }
-
-        fail::fail_point!("tenant-delete-before-create-local-mark", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-create-local-mark"
-            ))?
-        });
-
-        create_local_delete_mark(conf, &tenant.tenant_id)
-            .await
-            .context("local delete mark")?;
-
-        fail::fail_point!("tenant-delete-before-background", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-background"
-            ))?
-        });
-
-        Ok(())
-    }
-
-    fn mark_in_progress(&mut self) -> anyhow::Result<()> {
-        match self {
-            Self::Finished => anyhow::bail!("Bug. Is in finished state"),
-            Self::InProgress { .. } => { /* We're in a retry */ }
-            Self::NotStarted => { /* Fresh start */ }
-        }
-
-        *self = Self::InProgress;
-
-        Ok(())
-    }
-
-    pub async fn should_resume_deletion(
-        conf: &'static PageServerConf,
-        remote_storage: Option<&GenericRemoteStorage>,
-        tenant: &Tenant,
-    ) -> Result<Option<DeletionGuard>, DeleteTenantError> {
-        let acquire = |t: &Tenant| {
-            Some(
-                Arc::clone(&t.delete_progress)
-                    .try_lock_owned()
-                    .expect("we're the only owner during init"),
-            )
-        };
-
-        let tenant_id = tenant.tenant_id;
-        // Check local mark first, if its there there is no need to go to s3 to check whether remote one exists.
-        if conf.tenant_deleted_mark_file_path(&tenant_id).exists() {
-            return Ok(acquire(tenant));
-        }
-
-        // If remote storage is there we rely on it
-        if let Some(remote_storage) = remote_storage {
-            let remote_mark_path = remote_tenant_delete_mark_path(conf, &tenant_id)?;
-
-            let attempt = 1;
-            loop {
-                match remote_storage.download(&remote_mark_path).await {
-                    Ok(_) => return Ok(acquire(tenant)),
-                    Err(e) => {
-                        if matches!(e, DownloadError::NotFound) {
-                            return Ok(None);
-                        }
-                        if attempt > SHOULD_RESUME_DELETION_FETCH_MARK_ATTEMPTS {
-                            return Err(anyhow::anyhow!(e))?;
-                        }
-
-                        warn!(
-                            "failed to fetch tenant deletion mark at {} attempt {}",
-                            &remote_mark_path, attempt
-                        )
-                    }
-                }
-            }
-        }
-
-        Ok(None)
-    }
-
-    pub(crate) async fn resume(
-        guard: DeletionGuard,
-        tenant: &Arc<Tenant>,
-        init_order: Option<&InitializationOrder>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        ctx: &RequestContext,
-    ) -> Result<(), DeleteTenantError> {
-        let (_, progress) = completion::channel();
-
-        tenant
-            .set_stopping(progress, true)
-            .await
-            .expect("cant be stopping or broken");
-
-        // Do not consume valuable resources during the load phase, continue deletion once init phase is complete.
-        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
-        if let Some(background) = background_jobs_can_start {
-            info!("waiting for backgound jobs barrier");
-            background.clone().wait().await;
-            info!("ready for backgound jobs barrier");
-        }
-
-        // Tenant may not be loadable if we fail late in cleanup_remaining_fs_traces (e g remove timelines dir)
-        let timelines_path = tenant.conf.timelines_path(&tenant.tenant_id);
-        if timelines_path.exists() {
-            tenant.load(init_order, ctx).await.context("load")?;
-        }
-
-        Self::background(
-            guard,
-            tenant.conf,
-            tenant.remote_storage.clone(),
-            tenants,
-            tenant,
-        )
-        .await
-    }
-
-    async fn prepare(
-        tenants: &tokio::sync::RwLock<TenantsMap>,
-        tenant_id: TenantId,
-    ) -> Result<(Arc<Tenant>, tokio::sync::OwnedMutexGuard<Self>), DeleteTenantError> {
-        let m = tenants.read().await;
-
-        let tenant = m
-            .get(&tenant_id)
-            .ok_or(GetTenantError::NotFound(tenant_id))?;
-
-        // FIXME: unsure about active only. Our init jobs may not be cancellable properly,
-        // so at least for now allow deletions only for active tenants. TODO recheck
-        // Broken and Stopping is needed for retries.
-        if !matches!(
-            tenant.current_state(),
-            TenantState::Active | TenantState::Broken { .. }
-        ) {
-            return Err(DeleteTenantError::InvalidState(tenant.current_state()));
-        }
-
-        let guard = Arc::clone(&tenant.delete_progress)
-            .try_lock_owned()
-            .map_err(|_| DeleteTenantError::AlreadyInProgress)?;
-
-        fail::fail_point!("tenant-delete-before-shutdown", |_| {
-            Err(anyhow::anyhow!("failpoint: tenant-delete-before-shutdown"))?
-        });
-
-        // make pageserver shutdown not to wait for our completion
-        let (_, progress) = completion::channel();
-
-        // It would be good to only set stopping here and continue shutdown in the background, but shutdown is not idempotent.
-        // i e it is an error to do:
-        // tenant.set_stopping
-        // tenant.shutdown
-        // Its also bad that we're holding tenants.read here.
-        // TODO relax set_stopping to be idempotent?
-        if tenant.shutdown(progress, false).await.is_err() {
-            return Err(DeleteTenantError::Other(anyhow::anyhow!(
-                "tenant shutdown is already in progress"
-            )));
-        }
-
-        Ok((Arc::clone(tenant), guard))
-    }
-
-    fn schedule_background(
-        guard: OwnedMutexGuard<Self>,
-        conf: &'static PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        tenant: Arc<Tenant>,
-    ) {
-        let tenant_id = tenant.tenant_id;
-
-        task_mgr::spawn(
-            task_mgr::BACKGROUND_RUNTIME.handle(),
-            TaskKind::TimelineDeletionWorker,
-            Some(tenant_id),
-            None,
-            "tenant_delete",
-            false,
-            async move {
-                if let Err(err) =
-                    Self::background(guard, conf, remote_storage, tenants, &tenant).await
-                {
-                    error!("Error: {err:#}");
-                    tenant.set_broken(format!("{err:#}")).await;
-                };
-                Ok(())
-            }
-            .instrument({
-                let span = tracing::info_span!(parent: None, "delete_tenant", tenant_id=%tenant_id);
-                span.follows_from(Span::current());
-                span
-            }),
-        );
-    }
-
-    async fn background(
-        mut guard: OwnedMutexGuard<Self>,
-        conf: &PageServerConf,
-        remote_storage: Option<GenericRemoteStorage>,
-        tenants: &'static tokio::sync::RwLock<TenantsMap>,
-        tenant: &Arc<Tenant>,
-    ) -> Result<(), DeleteTenantError> {
-        // Tree sort timelines, schedule delete for them. Mention retries from the console side.
-        // Note that if deletion fails we dont mark timelines as broken,
-        // the whole tenant will become broken as by `Self::schedule_background` logic
-        let already_running_timeline_deletions = schedule_ordered_timeline_deletions(tenant)
-            .await
-            .context("schedule_ordered_timeline_deletions")?;
-
-        fail::fail_point!("tenant-delete-before-polling-ongoing-deletions", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-polling-ongoing-deletions"
-            ))?
-        });
-
-        // Wait for deletions that were already running at the moment when tenant deletion was requested.
-        // When we can lock deletion guard it means that corresponding timeline deletion finished.
-        for (guard, timeline_id) in already_running_timeline_deletions {
-            let flow = guard.lock().await;
-            if !flow.is_finished() {
-                return Err(DeleteTenantError::Other(anyhow::anyhow!(
-                    "already running timeline deletion failed: {timeline_id}"
-                )));
-            }
-        }
-
-        let timelines_path = conf.timelines_path(&tenant.tenant_id);
-        // May not exist if we fail in cleanup_remaining_fs_traces after removing it
-        if timelines_path.exists() {
-            // sanity check to guard against layout changes
-            ensure_timelines_dir_empty(&timelines_path)
-                .await
-                .context("timelines dir not empty")?;
-        }
-
-        remove_tenant_remote_delete_mark(conf, remote_storage.as_ref(), &tenant.tenant_id).await?;
-
-        fail::fail_point!("tenant-delete-before-cleanup-remaining-fs-traces", |_| {
-            Err(anyhow::anyhow!(
-                "failpoint: tenant-delete-before-cleanup-remaining-fs-traces"
-            ))?
-        });
-
-        cleanup_remaining_fs_traces(conf, &tenant.tenant_id)
-            .await
-            .context("cleanup_remaining_fs_traces")?;
-
-        let mut locked = tenants.write().await;
-        if locked.remove(&tenant.tenant_id).is_none() {
-            warn!("Tenant got removed from tenants map during deletion");
-        };
-
-        *guard = Self::Finished;
-
-        Ok(())
-    }
-}
--- a/pageserver/src/tenant/disk_btree.rs
+++ b/pageserver/src/tenant/disk_btree.rs
@@ -20,7 +20,6 @@
 //!
 use byteorder::{ReadBytesExt, BE};
 use bytes::{BufMut, Bytes, BytesMut};
-use either::Either;
 use hex;
 use std::{cmp::Ordering, io, result};
 use thiserror::Error;
@@ -257,77 +256,63 @@ where
    where
        V: FnMut(&[u8], u64) -> bool,
    {
-        let mut stack = Vec::new();
-        stack.push((self.root_blk, None));
-        while let Some((node_blknum, opt_iter)) = stack.pop() {
-            // Locate the node.
-            let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;
+        self.search_recurse(self.root_blk, search_key, dir, &mut visitor)
+    }

-            let node = OnDiskNode::deparse(node_buf.as_ref())?;
-            let prefix_len = node.prefix_len as usize;
-            let suffix_len = node.suffix_len as usize;
+    fn search_recurse<V>(
+        &self,
+        node_blknum: u32,
+        search_key: &[u8; L],
+        dir: VisitDirection,
+        visitor: &mut V,
+    ) -> Result<bool>
+    where
+        V: FnMut(&[u8], u64) -> bool,
+    {
+        // Locate the node.
+        let node_buf = self.reader.read_blk(self.start_blk + node_blknum)?;

-            assert!(node.num_children > 0);
+        let node = OnDiskNode::deparse(node_buf.as_ref())?;
+        let prefix_len = node.prefix_len as usize;
+        let suffix_len = node.suffix_len as usize;

-            let mut keybuf = Vec::new();
-            keybuf.extend(node.prefix);
-            keybuf.resize(prefix_len + suffix_len, 0);
+        assert!(node.num_children > 0);

-            let mut iter = if let Some(iter) = opt_iter {
-                iter
-            } else if dir == VisitDirection::Forwards {
-                // Locate the first match
-                let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
-                    Ok(idx) => idx,
-                    Err(idx) => {
-                        if node.level == 0 {
-                            // Imagine that the node contains the following keys:
-                            //
-                            // 1
-                            // 3  <-- idx
-                            // 5
-                            //
-                            // If the search key is '2' and there is exact match,
-                            // the binary search would return the index of key
-                            // '3'. That's cool, '3' is the first key to return.
-                            idx
-                        } else {
-                            // This is an internal page, so each key represents a lower
-                            // bound for what's in the child page. If there is no exact
-                            // match, we have to return the *previous* entry.
-                            //
-                            // 1  <-- return this
-                            // 3  <-- idx
-                            // 5
-                            idx.saturating_sub(1)
-                        }
-                    }
-                };
-                Either::Left(idx..node.num_children.into())
-            } else {
-                let idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
-                    Ok(idx) => {
-                        // Exact match. That's the first entry to return, and walk
-                        // backwards from there.
+        let mut keybuf = Vec::new();
+        keybuf.extend(node.prefix);
+        keybuf.resize(prefix_len + suffix_len, 0);
+
+        if dir == VisitDirection::Forwards {
+            // Locate the first match
+            let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
+                Ok(idx) => idx,
+                Err(idx) => {
+                    if node.level == 0 {
+                        // Imagine that the node contains the following keys:
+                        //
+                        // 1
+                        // 3  <-- idx
+                        // 5
+                        //
+                        // If the search key is '2' and there is no exact match,
+                        // the binary search would return the index of key
+                        // '3'. That's cool, '3' is the first key to return.
                        idx
+                    } else {
+                        // This is an internal page, so each key represents a lower
+                        // bound for what's in the child page. If there is no exact
+                        // match, we have to return the *previous* entry.
+                        //
+                        // 1  <-- return this
+                        // 3  <-- idx
+                        // 5
+                        idx.saturating_sub(1)
                    }
-                    Err(idx) => {
-                        // No exact match. The binary search returned the index of the
-                        // first key that's > search_key. Back off by one, and walk
-                        // backwards from there.
-                        if let Some(idx) = idx.checked_sub(1) {
-                            idx
-                        } else {
-                            return Ok(false);
-                        }
-                    }
-                };
-                Either::Right((0..=idx).rev())
+                }
            };
-
            // idx points to the first match now. Keep going from there
-            while let Some(idx) = iter.next() {
-                let key_off = idx * suffix_len;
+            let mut key_off = idx * suffix_len;
+            while idx < node.num_children as usize {
                let suffix = &node.keys[key_off..key_off + suffix_len];
                keybuf[prefix_len..].copy_from_slice(suffix);
                let value = node.value(idx);
@@ -338,8 +323,52 @@ where
                        return Ok(false);
                    }
                } else {
-                    stack.push((node_blknum, Some(iter)));
-                    stack.push((value.to_blknum(), None));
+                    #[allow(clippy::collapsible_if)]
+                    if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
+                        return Ok(false);
+                    }
+                }
+                idx += 1;
+                key_off += suffix_len;
+            }
+        } else {
+            let mut idx = match node.binary_search(search_key, keybuf.as_mut_slice()) {
+                Ok(idx) => {
+                    // Exact match. That's the first entry to return, and walk
+                    // backwards from there. (The loop below starts from 'idx -
+                    // 1', so add one here to compensate.)
+                    idx + 1
+                }
+                Err(idx) => {
+                    // No exact match. The binary search returned the index of the
+                    // first key that's > search_key. Back off by one, and walk
+                    // backwards from there. (The loop below starts from idx - 1,
+                    // so we don't need to subtract one here)
+                    idx
+                }
+            };
+
+            // idx points to the first match + 1 now. Keep going from there.
+            let mut key_off = idx * suffix_len;
+            while idx > 0 {
+                idx -= 1;
+                key_off -= suffix_len;
+                let suffix = &node.keys[key_off..key_off + suffix_len];
+                keybuf[prefix_len..].copy_from_slice(suffix);
+                let value = node.value(idx);
+                #[allow(clippy::collapsible_if)]
+                if node.level == 0 {
+                    // leaf
+                    if !visitor(&keybuf, value.to_u64()) {
+                        return Ok(false);
+                    }
+                } else {
+                    #[allow(clippy::collapsible_if)]
+                    if !self.search_recurse(value.to_blknum(), search_key, dir, visitor)? {
+                        return Ok(false);
+                    }
+                }
+                if idx == 0 {
                    break;
                }
            }
--- a/pageserver/src/tenant/ephemeral_file.rs
+++ b/pageserver/src/tenant/ephemeral_file.rs
@@ -2,7 +2,7 @@
 //! used to keep in-memory layers spilled on disk.

 use crate::config::PageServerConf;
-use crate::page_cache::PAGE_SZ;
+use crate::page_cache::{self, ReadBufResult, WriteBufResult, PAGE_SZ};
 use crate::tenant::blob_io::BlobWriter;
 use crate::tenant::block_io::BlockReader;
 use crate::virtual_file::VirtualFile;
@@ -10,7 +10,7 @@ use once_cell::sync::Lazy;
 use std::cmp::min;
 use std::collections::HashMap;
 use std::fs::OpenOptions;
-use std::io::{self};
+use std::io::{self, ErrorKind};
 use std::ops::DerefMut;
 use std::path::PathBuf;
 use std::sync::{Arc, RwLock};
@@ -19,9 +19,6 @@ use utils::id::{TenantId, TimelineId};

 use std::os::unix::fs::FileExt;

-mod buffer_pool;
-mod dirty_buffer;
-
 ///
 /// This is the global cache of file descriptors (File objects).
 ///
@@ -97,13 +94,27 @@ impl EphemeralFile {
        Ok(())
    }

-    fn get_buf_for_write(&self, blkno: u32) -> Result<dirty_buffer::Buffer, io::Error> {
-        let pool = buffer_pool::get();
-        let mut buf = pool.get_buffer();
-        // Read the page from disk into the buffer
-        // TODO: if we're overwriting the whole page, no need to read it in first
-        self.fill_buffer(buf.deref_mut(), blkno)?;
-        Ok(dirty_buffer::Buffer::new(self, buf, blkno))
+    fn get_buf_for_write(&self, blkno: u32) -> Result<page_cache::PageWriteGuard, io::Error> {
+        // Look up the right page
+        let cache = page_cache::get();
+        let mut write_guard = match cache
+            .write_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
+        {
+            WriteBufResult::Found(guard) => guard,
+            WriteBufResult::NotFound(mut guard) => {
+                // Read the page from disk into the buffer
+                // TODO: if we're overwriting the whole page, no need to read it in first
+                self.fill_buffer(guard.deref_mut(), blkno)?;
+                guard.mark_valid();
+
+                // And then fall through to modify it.
+                guard
+            }
+        };
+        write_guard.mark_dirty();
+
+        Ok(write_guard)
    }
 }

@@ -116,6 +127,75 @@ pub fn is_ephemeral_file(filename: &str) -> bool {
    }
 }

+impl FileExt for EphemeralFile {
+    fn read_at(&self, dstbuf: &mut [u8], offset: u64) -> Result<usize, io::Error> {
+        // Look up the right page
+        let blkno = (offset / PAGE_SZ as u64) as u32;
+        let off = offset as usize % PAGE_SZ;
+        let len = min(PAGE_SZ - off, dstbuf.len());
+
+        let read_guard;
+        let mut write_guard;
+
+        let cache = page_cache::get();
+        let buf = match cache
+            .read_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
+        {
+            ReadBufResult::Found(guard) => {
+                read_guard = guard;
+                read_guard.as_ref()
+            }
+            ReadBufResult::NotFound(guard) => {
+                // Read the page from disk into the buffer
+                write_guard = guard;
+                self.fill_buffer(write_guard.deref_mut(), blkno)?;
+                write_guard.mark_valid();
+
+                // And then fall through to read the requested slice from the
+                // buffer.
+                write_guard.as_ref()
+            }
+        };
+
+        dstbuf[0..len].copy_from_slice(&buf[off..(off + len)]);
+        Ok(len)
+    }
+
+    fn write_at(&self, srcbuf: &[u8], offset: u64) -> Result<usize, io::Error> {
+        // Look up the right page
+        let blkno = (offset / PAGE_SZ as u64) as u32;
+        let off = offset as usize % PAGE_SZ;
+        let len = min(PAGE_SZ - off, srcbuf.len());
+
+        let mut write_guard;
+        let cache = page_cache::get();
+        let buf = match cache
+            .write_ephemeral_buf(self.file_id, blkno)
+            .map_err(|e| to_io_error(e, "Failed to write ephemeral buf"))?
+        {
+            WriteBufResult::Found(guard) => {
+                write_guard = guard;
+                write_guard.deref_mut()
+            }
+            WriteBufResult::NotFound(guard) => {
+                // Read the page from disk into the buffer
+                // TODO: if we're overwriting the whole page, no need to read it in first
+                write_guard = guard;
+                self.fill_buffer(write_guard.deref_mut(), blkno)?;
+                write_guard.mark_valid();
+
+                // And then fall through to modify it.
+                write_guard.deref_mut()
+            }
+        };
+
+        buf[off..(off + len)].copy_from_slice(&srcbuf[0..len]);
+        write_guard.mark_dirty();
+        Ok(len)
+    }
+}
+
 impl BlobWriter for EphemeralFile {
    fn write_blob(&mut self, srcbuf: &[u8]) -> Result<u64, io::Error> {
        let pos = self.size;
@@ -137,7 +217,6 @@ impl BlobWriter for EphemeralFile {
                // it needs to be split across pages
                buf[off..(off + thislen)].copy_from_slice(&len_buf[..thislen]);
                blknum += 1;
-                buf.writeback()?;
                buf = self.get_buf_for_write(blknum)?;
                buf[0..4 - thislen].copy_from_slice(&len_buf[thislen..]);
                off = 4 - thislen;
@@ -153,7 +232,6 @@ impl BlobWriter for EphemeralFile {
            let mut page_remain = PAGE_SZ - off;
            if page_remain == 0 {
                blknum += 1;
-                buf.writeback()?;
                buf = self.get_buf_for_write(blknum)?;
                off = 0;
                page_remain = PAGE_SZ;
@@ -163,8 +241,7 @@ impl BlobWriter for EphemeralFile {
            off += this_blk_len;
            buf_remain = &buf_remain[this_blk_len..];
        }
-
-        buf.writeback()?;
+        drop(buf);

        if srcbuf.len() < 0x80 {
            self.size += 1;
@@ -179,6 +256,10 @@ impl BlobWriter for EphemeralFile {

 impl Drop for EphemeralFile {
    fn drop(&mut self) {
+        // drop all pages from page cache
+        let cache = page_cache::get();
+        cache.drop_buffers_for_ephemeral(self.file_id);
+
        // remove entry from the hash map
        EPHEMERAL_FILES.write().unwrap().files.remove(&self.file_id);

@@ -200,24 +281,62 @@ impl Drop for EphemeralFile {
    }
 }

+pub fn writeback(file_id: u64, blkno: u32, buf: &[u8]) -> Result<(), io::Error> {
+    if let Some(file) = EPHEMERAL_FILES.read().unwrap().files.get(&file_id) {
+        match file.write_all_at(buf, blkno as u64 * PAGE_SZ as u64) {
+            Ok(_) => Ok(()),
+            Err(e) => Err(io::Error::new(
+                ErrorKind::Other,
+                format!(
+                    "failed to write back to ephemeral file at {} error: {}",
+                    file.path.display(),
+                    e
+                ),
+            )),
+        }
+    } else {
+        Err(io::Error::new(
+            ErrorKind::Other,
+            "could not write back page, not found in ephemeral files hash",
+        ))
+    }
+}
+
 impl BlockReader for EphemeralFile {
-    type BlockLease = buffer_pool::Handle;
+    type BlockLease = page_cache::PageReadGuard<'static>;

    fn read_blk(&self, blknum: u32) -> Result<Self::BlockLease, io::Error> {
-        // Read the page from disk into the buffer
-        let pool = buffer_pool::get();
-        let mut buf = pool.get_buffer();
-        self.fill_buffer(buf.deref_mut(), blknum)?;
-        Ok(buf)
+        // Look up the right page
+        let cache = page_cache::get();
+        loop {
+            match cache
+                .read_ephemeral_buf(self.file_id, blknum)
+                .map_err(|e| to_io_error(e, "Failed to read ephemeral buf"))?
+            {
+                ReadBufResult::Found(guard) => return Ok(guard),
+                ReadBufResult::NotFound(mut write_guard) => {
+                    // Read the page from disk into the buffer
+                    self.fill_buffer(write_guard.deref_mut(), blknum)?;
+                    write_guard.mark_valid();
+
+                    // Swap for read lock
+                    continue;
+                }
+            };
+        }
    }
 }

+fn to_io_error(e: anyhow::Error, context: &str) -> io::Error {
+    io::Error::new(ErrorKind::Other, format!("{context}: {e:#}"))
+}
+
 #[cfg(test)]
 mod tests {
    use super::*;
    use crate::tenant::blob_io::BlobWriter;
    use crate::tenant::block_io::BlockCursor;
-    use rand::{thread_rng, RngCore};
+    use rand::{seq::SliceRandom, thread_rng, RngCore};
    use std::fs;
    use std::str::FromStr;

@@ -238,6 +357,50 @@ mod tests {
        Ok((conf, tenant_id, timeline_id))
    }

+    // Helper function to slurp contents of a file, starting at the given offset,
+    // into a string
+    fn read_string(efile: &EphemeralFile, offset: u64, len: usize) -> Result<String, io::Error> {
+        let mut buf = Vec::new();
+        buf.resize(len, 0u8);
+
+        efile.read_exact_at(&mut buf, offset)?;
+
+        Ok(String::from_utf8_lossy(&buf)
+            .trim_end_matches('\0')
+            .to_string())
+    }
+
+    #[test]
+    fn test_ephemeral_files() -> Result<(), io::Error> {
+        let (conf, tenant_id, timeline_id) = harness("ephemeral_files")?;
+
+        let file_a = EphemeralFile::create(conf, tenant_id, timeline_id)?;
+
+        file_a.write_all_at(b"foo", 0)?;
+        assert_eq!("foo", read_string(&file_a, 0, 20)?);
+
+        file_a.write_all_at(b"bar", 3)?;
+        assert_eq!("foobar", read_string(&file_a, 0, 20)?);
+
+        // Open a lot of files, enough to cause some page evictions.
+        let mut efiles = Vec::new();
+        for fileno in 0..100 {
+            let efile = EphemeralFile::create(conf, tenant_id, timeline_id)?;
+            efile.write_all_at(format!("file {}", fileno).as_bytes(), 0)?;
+            assert_eq!(format!("file {}", fileno), read_string(&efile, 0, 10)?);
+            efiles.push((fileno, efile));
+        }
+
+        // Check that all the files can still be read from. Use them in random order for
+        // good measure.
+        efiles.as_mut_slice().shuffle(&mut thread_rng());
+        for (fileno, efile) in efiles.iter_mut() {
+            assert_eq!(format!("file {}", fileno), read_string(efile, 0, 10)?);
+        }
+
+        Ok(())
+    }
+
    #[test]
    fn test_ephemeral_blobs() -> Result<(), io::Error> {
        let (conf, tenant_id, timeline_id) = harness("ephemeral_blobs")?;
--- a/pageserver/src/tenant/ephemeral_file/buffer_pool.rs
+++ b/pageserver/src/tenant/ephemeral_file/buffer_pool.rs
@@ -1,66 +0,0 @@
-//! Buffer pool for ephemeral file buffers.
-//!
-//! Currently this is a very simple implementation that just uses `malloc`.
-//! But the interface is such that we can switch to a more sophisticated
-//! implementation later, e.g., one that caps that amount of memory used.
-
-use std::ops::{Deref, DerefMut};
-
-use crate::page_cache::PAGE_SZ;
-
-pub struct BufferPool;
-
-const POOL: BufferPool = BufferPool;
-
-pub(super) fn get() -> &'static BufferPool {
-    &POOL
-}
-
-impl BufferPool {
-    /// Get a [`Handle`] to a buffer in the pool.
-    ///
-    /// The buffer is guaranteed to be zeroed out.
-    ///
-    /// The implementation may block to wait for buffers to become available,
-    /// and a future async version of this method may `.await` internally to
-    /// wait for buffers to become available.
-    ///
-    /// To avoid deadlocks, a thread/task must get all the buffers it needs
-    /// with a single call to `get_buffer`. Without this rule, a deadlock
-    /// can happen. Take for example a buffer pool with 2 buffers X, Y
-    /// and a program with two threads A and B, each requiring 2 buffers.
-    /// If A gets X and B gets Y, then both threads will block forever trying
-    /// to get their second buffer.
-    pub fn get_buffer(&self) -> Handle {
-        Handle {
-            data: vec![0; PAGE_SZ],
-        }
-    }
-}
-
-pub struct Handle {
-    data: Vec<u8>,
-}
-
-impl std::fmt::Debug for Handle {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("Handle")
-            .field("data", &self.data.as_ptr())
-            .finish()
-    }
-}
-
-impl Deref for Handle {
-    type Target = [u8; PAGE_SZ];
-    fn deref(&self) -> &Self::Target {
-        let slice: &[u8] = &self.data[..];
-        slice.try_into().unwrap()
-    }
-}
-
-impl DerefMut for Handle {
-    fn deref_mut(&mut self) -> &mut Self::Target {
-        let slice: &mut [u8] = &mut self.data[..];
-        slice.try_into().unwrap()
-    }
-}
--- a/pageserver/src/tenant/ephemeral_file/dirty_buffer.rs
+++ b/pageserver/src/tenant/ephemeral_file/dirty_buffer.rs
@@ -1,111 +0,0 @@
-//! Newtypes to ensure that dirty buffers are written back to the filesystem before they are dropped.
-
-use std::io::ErrorKind;
-use std::ops::Deref;
-use std::ops::DerefMut;
-use std::os::unix::prelude::FileExt;
-
-use crate::page_cache::PAGE_SZ;
-
-use super::buffer_pool;
-use super::EphemeralFile;
-
-pub(super) struct Buffer<'f> {
-    inner: Inner<'f>,
-}
-
-enum Inner<'f> {
-    Dirty {
-        ephemeral_file: &'f EphemeralFile,
-        buf: buffer_pool::Handle,
-        blkno: u32,
-    },
-    WritebackOngoing,
-    WrittenBack,
-    WritebackError,
-    Dropped,
-}
-
-impl<'f> Buffer<'f> {
-    pub(super) fn new(
-        ephemeral_file: &'f EphemeralFile,
-        buf: buffer_pool::Handle,
-        blkno: u32,
-    ) -> Self {
-        Self {
-            inner: Inner::Dirty {
-                ephemeral_file,
-                buf,
-                blkno,
-            },
-        }
-    }
-    pub(super) fn writeback(mut self) -> Result<(), std::io::Error> {
-        let Inner::Dirty {
-        ephemeral_file,
-        buf,
-        blkno,
-    } = std::mem::replace(&mut self.inner, Inner::WritebackOngoing) else {
-        unreachable!("writeback consumes");
-    };
-        match ephemeral_file
-            .file
-            .write_all_at(buf.deref(), blkno as u64 * PAGE_SZ as u64)
-        {
-            Ok(_) => {
-                self.inner = Inner::WrittenBack;
-                Ok(())
-            }
-            Err(e) => {
-                self.inner = Inner::WritebackError;
-                Err(std::io::Error::new(
-                    ErrorKind::Other,
-                    format!(
-                        "failed to write back to ephemeral file at {} error: {}",
-                        ephemeral_file.file.path.display(),
-                        e
-                    ),
-                ))
-            }
-        }
-    }
-}
-
-impl<'f> Deref for Buffer<'f> {
-    type Target = [u8];
-
-    fn deref(&self) -> &[u8] {
-        match &self.inner {
-            Inner::Dirty { buf, .. } => &**buf,
-            Inner::WritebackOngoing => unreachable!("writeback consumes"),
-            Inner::WrittenBack => unreachable!("writeback consumes"),
-            Inner::WritebackError => unreachable!("writeback consumes"),
-            Inner::Dropped => unreachable!(),
-        }
-    }
-}
-
-impl<'f> DerefMut for Buffer<'f> {
-    fn deref_mut(&mut self) -> &mut [u8] {
-        match &mut self.inner {
-            Inner::Dirty { buf, .. } => &mut **buf,
-            Inner::WritebackOngoing => unreachable!("writeback consumes"),
-            Inner::WrittenBack => unreachable!("writeback consumes"),
-            Inner::WritebackError => unreachable!("writeback consumes"),
-            Inner::Dropped => unreachable!(),
-        }
-    }
-}
-
-impl Drop for Buffer<'_> {
-    fn drop(&mut self) {
-        let prev = std::mem::replace(&mut self.inner, Inner::Dropped);
-        match prev {
-            // TODO: check this at compile time
-            Inner::Dirty { .. } => panic!("dropped dirty buffer, need to writeback() first"),
-            Inner::WritebackOngoing => unreachable!("transitory state"),
-            Inner::WrittenBack | Inner::WritebackError => {}
-            Inner::Dropped => unreachable!("drop only happens once"),
-        }
-    }
-}
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -20,19 +20,17 @@ use crate::config::PageServerConf;
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
-use crate::tenant::delete::DeleteTenantFlow;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
 use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};

 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};

-use super::delete::DeleteTenantError;
 use super::timeline::delete::DeleteTimelineFlow;

 /// The tenants known to the pageserver.
 /// The enum variants are used to distinguish the different states that the pageserver can be in.
-pub(crate) enum TenantsMap {
+enum TenantsMap {
    /// [`init_tenant_mgr`] is not done yet.
    Initializing,
    /// [`init_tenant_mgr`] is done, all on-disk tenants have been loaded.
@@ -44,13 +42,13 @@ pub(crate) enum TenantsMap {
 }

 impl TenantsMap {
-    pub(crate) fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
+    fn get(&self, tenant_id: &TenantId) -> Option<&Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.get(tenant_id),
        }
    }
-    pub(crate) fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
+    fn remove(&mut self, tenant_id: &TenantId) -> Option<Arc<Tenant>> {
        match self {
            TenantsMap::Initializing => None,
            TenantsMap::Open(m) | TenantsMap::ShuttingDown(m) => m.remove(tenant_id),
@@ -99,9 +97,7 @@ pub async fn init_tenant_mgr(
                        );
                    }
                } else {
-                    // This case happens if we:
-                    // * crash during attach before creating the attach marker file
-                    // * crash during tenant delete before removing tenant directory
+                    // This case happens if we crash during attach before creating the attach marker file
                    let is_empty = tenant_dir_path.is_empty_dir().with_context(|| {
                        format!("Failed to check whether {tenant_dir_path:?} is an empty dir")
                    })?;
@@ -128,7 +124,6 @@ pub async fn init_tenant_mgr(
                        broker_client.clone(),
                        remote_storage.clone(),
                        Some(init_order.clone()),
-                        &TENANTS,
                        &ctx,
                    ) {
                        Ok(tenant) => {
@@ -159,13 +154,12 @@ pub async fn init_tenant_mgr(
    Ok(())
 }

-pub(crate) fn schedule_local_tenant_processing(
+pub fn schedule_local_tenant_processing(
    conf: &'static PageServerConf,
    tenant_path: &Path,
    broker_client: storage_broker::BrokerClientChannel,
    remote_storage: Option<GenericRemoteStorage>,
    init_order: Option<InitializationOrder>,
-    tenants: &'static tokio::sync::RwLock<TenantsMap>,
    ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
    anyhow::ensure!(
@@ -225,7 +219,6 @@ pub(crate) fn schedule_local_tenant_processing(
            broker_client,
            remote_storage,
            init_order,
-            tenants,
            ctx,
        )
    };
@@ -363,7 +356,7 @@ pub async fn create_tenant(
        //       See https://github.com/neondatabase/neon/issues/4233

        let created_tenant =
-            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, &TENANTS, ctx)?;
+            schedule_local_tenant_processing(conf, &tenant_directory, broker_client, remote_storage, None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

@@ -424,14 +417,6 @@ pub async fn get_tenant(
    }
 }

-pub async fn delete_tenant(
-    conf: &'static PageServerConf,
-    remote_storage: Option<GenericRemoteStorage>,
-    tenant_id: TenantId,
-) -> Result<(), DeleteTenantError> {
-    DeleteTenantFlow::run(conf, remote_storage, &TENANTS, tenant_id).await
-}
-
 #[derive(Debug, thiserror::Error)]
 pub enum DeleteTimelineError {
    #[error("Tenant {0}")]
@@ -447,7 +432,7 @@ pub async fn delete_timeline(
    _ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
    let tenant = get_tenant(tenant_id, true).await?;
-    DeleteTimelineFlow::run(&tenant, timeline_id, false).await?;
+    DeleteTimelineFlow::run(&tenant, timeline_id).await?;
    Ok(())
 }

@@ -522,7 +507,7 @@ pub async fn load_tenant(
                .with_context(|| format!("Failed to remove tenant ignore mark {tenant_ignore_mark:?} during tenant loading"))?;
        }

-        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, &TENANTS, ctx)
+        let new_tenant = schedule_local_tenant_processing(conf, &tenant_path, broker_client, remote_storage, None, ctx)
            .with_context(|| {
                format!("Failed to schedule tenant processing in path {tenant_path:?}")
            })?;
@@ -603,7 +588,7 @@ pub async fn attach_tenant(
            .context("check for attach marker file existence")?;
        anyhow::ensure!(marker_file_exists, "create_tenant_files should have created the attach marker file");

-        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, &TENANTS, ctx)?;
+        let attached_tenant = schedule_local_tenant_processing(conf, &tenant_dir, broker_client, Some(remote_storage), None, ctx)?;
        // TODO: tenant object & its background loops remain, untracked in tenant map, if we fail here.
        //      See https://github.com/neondatabase/neon/issues/4233

--- a/pageserver/src/tenant/storage_layer/inmemory_layer.rs
+++ b/pageserver/src/tenant/storage_layer/inmemory_layer.rs
@@ -16,7 +16,6 @@ use anyhow::{ensure, Result};
 use pageserver_api::models::InMemoryLayerInfo;
 use std::cell::RefCell;
 use std::collections::HashMap;
-use std::sync::OnceLock;
 use tracing::*;
 use utils::{
    bin_ser::BeSer,
@@ -43,16 +42,14 @@ pub struct InMemoryLayer {
    tenant_id: TenantId,
    timeline_id: TimelineId,

+    ///
    /// This layer contains all the changes from 'start_lsn'. The
    /// start is inclusive.
+    ///
    start_lsn: Lsn,

-    /// Frozen layers have an exclusive end LSN.
-    /// Writes are only allowed when this is `None`.
-    end_lsn: OnceLock<Lsn>,
-
-    /// The above fields never change, except for `end_lsn`, which is only set once.
-    /// All other changing parts are in `inner`, and protected by a mutex.
+    /// The above fields never change. The parts that do change are in 'inner',
+    /// and protected by an RwLock.
    inner: RwLock<InMemoryLayerInner>,
 }

@@ -60,16 +57,21 @@ impl std::fmt::Debug for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        f.debug_struct("InMemoryLayer")
            .field("start_lsn", &self.start_lsn)
-            .field("end_lsn", &self.end_lsn)
            .field("inner", &self.inner)
            .finish()
    }
 }

 pub struct InMemoryLayerInner {
+    /// Frozen layers have an exclusive end LSN.
+    /// Writes are only allowed when this is None
+    end_lsn: Option<Lsn>,
+
+    ///
    /// All versions of all pages in the layer are kept here.  Indexed
    /// by block number and LSN. The value is an offset into the
    /// ephemeral file where the page version is stored.
+    ///
    index: HashMap<Key, VecMap<Lsn, u64>>,

    /// The values are stored in a serialized format in this file.
@@ -80,7 +82,15 @@ pub struct InMemoryLayerInner {

 impl std::fmt::Debug for InMemoryLayerInner {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        f.debug_struct("InMemoryLayerInner").finish()
+        f.debug_struct("InMemoryLayerInner")
+            .field("end_lsn", &self.end_lsn)
+            .finish()
+    }
+}
+
+impl InMemoryLayerInner {
+    fn assert_writeable(&self) {
+        assert!(self.end_lsn.is_none());
    }
 }

@@ -91,21 +101,13 @@ impl InMemoryLayer {

    pub fn info(&self) -> InMemoryLayerInfo {
        let lsn_start = self.start_lsn;
+        let lsn_end = self.inner.read().unwrap().end_lsn;

-        if let Some(&lsn_end) = self.end_lsn.get() {
-            InMemoryLayerInfo::Frozen { lsn_start, lsn_end }
-        } else {
-            InMemoryLayerInfo::Open { lsn_start }
+        match lsn_end {
+            Some(lsn_end) => InMemoryLayerInfo::Frozen { lsn_start, lsn_end },
+            None => InMemoryLayerInfo::Open { lsn_start },
        }
    }
-
-    fn assert_writable(&self) {
-        assert!(self.end_lsn.get().is_none());
-    }
-
-    fn end_lsn_or_max(&self) -> Lsn {
-        self.end_lsn.get().copied().unwrap_or(Lsn::MAX)
-    }
 }

 #[async_trait::async_trait]
@@ -115,7 +117,14 @@ impl Layer for InMemoryLayer {
    }

    fn get_lsn_range(&self) -> Range<Lsn> {
-        self.start_lsn..self.end_lsn_or_max()
+        let inner = self.inner.read().unwrap();
+
+        let end_lsn = if let Some(end_lsn) = inner.end_lsn {
+            end_lsn
+        } else {
+            Lsn(u64::MAX)
+        };
+        self.start_lsn..end_lsn
    }

    fn is_incremental(&self) -> bool {
@@ -127,7 +136,11 @@ impl Layer for InMemoryLayer {
    async fn dump(&self, verbose: bool, _ctx: &RequestContext) -> Result<()> {
        let inner = self.inner.read().unwrap();

-        let end_str = self.end_lsn_or_max();
+        let end_str = inner
+            .end_lsn
+            .as_ref()
+            .map(Lsn::to_string)
+            .unwrap_or_default();

        println!(
            "----- in-memory layer for tli {} LSNs {}-{} ----",
@@ -223,7 +236,9 @@ impl Layer for InMemoryLayer {

 impl std::fmt::Display for InMemoryLayer {
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        let end_lsn = self.end_lsn_or_max();
+        let inner = self.inner.read().unwrap();
+
+        let end_lsn = inner.end_lsn.unwrap_or(Lsn(u64::MAX));
        write!(f, "inmem-{:016X}-{:016X}", self.start_lsn.0, end_lsn.0)
    }
 }
@@ -255,8 +270,8 @@ impl InMemoryLayer {
            timeline_id,
            tenant_id,
            start_lsn,
-            end_lsn: OnceLock::new(),
            inner: RwLock::new(InMemoryLayerInner {
+                end_lsn: None,
                index: HashMap::new(),
                file,
            }),
@@ -270,7 +285,7 @@ impl InMemoryLayer {
    pub fn put_value(&self, key: Key, lsn: Lsn, val: &Value) -> Result<()> {
        trace!("put_value key {} at {}/{}", key, self.timeline_id, lsn);
        let mut inner = self.inner.write().unwrap();
-        self.assert_writable();
+        inner.assert_writeable();

        let off = {
            SER_BUFFER.with(|x| -> Result<_> {
@@ -302,10 +317,10 @@ impl InMemoryLayer {
    /// Records the end_lsn for non-dropped layers.
    /// `end_lsn` is exclusive
    pub fn freeze(&self, end_lsn: Lsn) {
-        let inner = self.inner.write().unwrap();
+        let mut inner = self.inner.write().unwrap();

        assert!(self.start_lsn < end_lsn);
-        self.end_lsn.set(end_lsn).expect("end_lsn set only once");
+        inner.end_lsn = Some(end_lsn);

        for vec_map in inner.index.values() {
            for (lsn, _pos) in vec_map.as_slice() {
@@ -329,14 +344,12 @@ impl InMemoryLayer {
        // rare though, so we just accept the potential latency hit for now.
        let inner = self.inner.read().unwrap();

-        let end_lsn = *self.end_lsn.get().unwrap();
-
        let mut delta_layer_writer = DeltaLayerWriter::new(
            self.conf,
            self.timeline_id,
            self.tenant_id,
            Key::MIN,
-            self.start_lsn..end_lsn,
+            self.start_lsn..inner.end_lsn.unwrap(),
        )?;

        let mut buf = Vec::new();
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -919,7 +919,7 @@ impl Timeline {
    pub fn set_state(&self, new_state: TimelineState) {
        match (self.current_state(), new_state) {
            (equal_state_1, equal_state_2) if equal_state_1 == equal_state_2 => {
-                info!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
+                warn!("Ignoring new state, equal to the existing one: {equal_state_2:?}");
            }
            (st, TimelineState::Loading) => {
                error!("ignoring transition from {st:?} into Loading state");
--- a/pageserver/src/tenant/timeline/delete.rs
+++ b/pageserver/src/tenant/timeline/delete.rs
@@ -219,13 +219,27 @@ async fn delete_local_layer_files(
            }
        };

-        if metadata.is_dir() {
-            warn!(path=%entry.path().display(), "unexpected directory under timeline dir");
+        let r = if metadata.is_dir() {
+            // There shouldn't be any directories inside the timeline dir as of the current layout.
            tokio::fs::remove_dir(entry.path()).await
        } else {
            tokio::fs::remove_file(entry.path()).await
+        };
+
+        if let Err(e) = r {
+            if e.kind() == std::io::ErrorKind::NotFound {
+                warn!(
+                    timeline_dir=?local_timeline_directory,
+                    path=?entry.path().display(),
+                    "got not found err while removing timeline dir, proceeding anyway"
+                );
+                continue;
+            }
+            anyhow::bail!(anyhow::anyhow!(
+                "Failed to remove: {}. Error: {e}",
+                entry.path().display()
+            ));
        }
-        .with_context(|| format!("Failed to remove: {}", entry.path().display()))?;
    }

    info!("finished deleting layer files, releasing layer_removal_cs.lock()");
@@ -345,11 +359,10 @@ impl DeleteTimelineFlow {
    // NB: If this fails half-way through, and is retried, the retry will go through
    // all the same steps again. Make sure the code here is idempotent, and don't
    // error out if some of the shutdown tasks have already been completed!
-    #[instrument(skip(tenant), fields(tenant_id=%tenant.tenant_id))]
+    #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))]
    pub async fn run(
        tenant: &Arc<Tenant>,
        timeline_id: TimelineId,
-        inplace: bool,
    ) -> Result<(), DeleteTimelineError> {
        let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?;

@@ -367,11 +380,7 @@ impl DeleteTimelineFlow {
            ))?
        });

-        if inplace {
-            Self::background(guard, tenant.conf, tenant, &timeline).await?
-        } else {
-            Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);
-        }
+        Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline);

        Ok(())
    }
@@ -389,8 +398,6 @@ impl DeleteTimelineFlow {
    }

    /// Shortcut to create Timeline in stopping state and spawn deletion task.
-    /// See corresponding parts of [`crate::tenant::delete::DeleteTenantFlow`]
-    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn resume_deletion(
        tenant: Arc<Tenant>,
        timeline_id: TimelineId,
@@ -437,15 +444,11 @@ impl DeleteTimelineFlow {
        Ok(())
    }

-    #[instrument(skip_all, fields(%timeline_id))]
    pub async fn cleanup_remaining_timeline_fs_traces(
        tenant: &Tenant,
        timeline_id: TimelineId,
    ) -> anyhow::Result<()> {
-        let r =
-            cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await;
-        info!("Done");
-        r
+        cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await
    }

    fn prepare(
@@ -491,17 +494,11 @@ impl DeleteTimelineFlow {
        // At the end of the operation we're holding the guard and need to lock timelines map
        // to remove the timeline from it.
        // Always if you have two locks that are taken in different order this can result in a deadlock.
-
-        let delete_progress = Arc::clone(&timeline.delete_progress);
-        let delete_lock_guard = match delete_progress.try_lock_owned() {
-            Ok(guard) => DeletionGuard(guard),
-            Err(_) => {
-                // Unfortunately if lock fails arc is consumed.
-                return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone(
-                    &timeline.delete_progress,
-                )));
-            }
-        };
+        let delete_lock_guard = DeletionGuard(
+            Arc::clone(&timeline.delete_progress)
+                .try_lock_owned()
+                .map_err(|_| DeleteTimelineError::AlreadyInProgress)?,
+        );

        timeline.set_state(TimelineState::Stopping);

@@ -556,14 +553,10 @@ impl DeleteTimelineFlow {

        remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?;

-        *guard = Self::Finished;
+        *guard.0 = Self::Finished;

        Ok(())
    }
-
-    pub(crate) fn is_finished(&self) -> bool {
-        matches!(self, Self::Finished)
-    }
 }

 struct DeletionGuard(OwnedMutexGuard<DeleteTimelineFlow>);
--- a/pgxn/neon/walproposer_utils.c
+++ b/pgxn/neon/walproposer_utils.c
@@ -37,14 +37,68 @@ static XLogSegNo walpropSegNo = 0;

 /* START cloned file-local variables and functions from walsender.c */

+/*
+ * xlogreader used for replication.  Note that a WAL sender doing physical
+ * replication does not need xlogreader to read WAL, but it needs one to
+ * keep a state of its work.
+ */
+static XLogReaderState *xlogreader = NULL;
+
+/*
+ * These variables keep track of the state of the timeline we're currently
+ * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
+ * the timeline is not the latest timeline on this server, and the server's
+ * history forked off from that timeline at sendTimeLineValidUpto.
+ */
+static TimeLineID sendTimeLine = 0;
+static TimeLineID sendTimeLineNextTLI = 0;
+static bool sendTimeLineIsHistoric = false;
+static XLogRecPtr sendTimeLineValidUpto = InvalidXLogRecPtr;
+
+/*
+ * Timestamp of last ProcessRepliesIfAny() that saw a reply from the
+ * standby. Set to 0 if wal_sender_timeout doesn't need to be active.
+ */
+static TimestampTz last_reply_timestamp = 0;
+
+/* Have we sent a heartbeat message asking for reply, since last reply? */
+static bool waiting_for_ping_response = false;
+
+static bool streamingDoneSending;
+static bool streamingDoneReceiving;
+
+/* Are we there yet? */
+static bool WalSndCaughtUp = false;
+
+/* Flags set by signal handlers for later service in main loop */
+static volatile sig_atomic_t got_STOPPING = false;
+
 /*
 * How far have we sent WAL already? This is also advertised in
 * MyWalSnd->sentPtr.  (Actually, this is the next WAL location to send.)
 */
 static XLogRecPtr sentPtr = InvalidXLogRecPtr;

-static void WalSndLoop(void);
-static void XLogBroadcastWalProposer(void);
+/*
+ * This is set while we are streaming. When not set
+ * PROCSIG_WALSND_INIT_STOPPING signal will be handled like SIGTERM. When set,
+ * the main loop is responsible for checking got_STOPPING and terminating when
+ * it's set (after streaming any remaining WAL).
+ */
+static volatile sig_atomic_t replication_active = false;
+
+typedef void (*WalSndSendDataCallback) (void);
+static void WalSndLoop(WalSndSendDataCallback send_data);
+static void XLogSendPhysical(void);
+#if PG_VERSION_NUM >= 150000
+static XLogRecPtr GetStandbyFlushRecPtr(TimeLineID *tli);
+#else
+static XLogRecPtr GetStandbyFlushRecPtr(void);
+#endif
+
+static void WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+							  TimeLineID *tli_p);
+
 /* END cloned file-level variables and functions from walsender.c */

 int
@@ -452,7 +506,7 @@ XLogWalPropClose(XLogRecPtr recptr)
 /* START of cloned functions from walsender.c */

 /*
- * Subscribe for new WAL and stream it in the loop to safekeepers.
+ * Handle START_REPLICATION command.
 *
 * At the moment, this never returns, but an ereport(ERROR) will take us back
 * to the main loop.
@@ -470,6 +524,18 @@ StartProposerReplication(StartReplicationCmd *cmd)
 				 errmsg("IDENTIFY_SYSTEM has not been run before START_REPLICATION")));
 #endif

+	/* create xlogreader for physical replication */
+	xlogreader =
+		XLogReaderAllocate(wal_segment_size, NULL,
+						   XL_ROUTINE(.segment_open = WalSndSegmentOpen,
+									  .segment_close = wal_segment_close),
+						   NULL);
+
+	if (!xlogreader)
+		ereport(ERROR,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
+
 	/*
 	 * We assume here that we're logging enough information in the WAL for
 	 * log-shipping, since this is checked in PostmasterMain().
@@ -503,61 +569,341 @@ StartProposerReplication(StartReplicationCmd *cmd)
 	 * we keep this code around to lighten the load for when we need it.
 	 */
 #if PG_VERSION_NUM >= 150000
-	FlushPtr = GetFlushRecPtr(&currTLI);
+	if (am_cascading_walsender)
+	{
+		/* this also updates ThisTimeLineID */
+		FlushPtr = GetStandbyFlushRecPtr(&currTLI);
+	}
+	else
+		FlushPtr = GetFlushRecPtr(&currTLI);
 #else
-	FlushPtr = GetFlushRecPtr();
+	if (am_cascading_walsender)
+	{
+		/* this also updates ThisTimeLineID */
+		FlushPtr = GetStandbyFlushRecPtr();
+	}
+	else
+		FlushPtr = GetFlushRecPtr();
+
 	currTLI = ThisTimeLineID;
 #endif

-	/*
-	 * When we first start replication the standby will be behind the
-	 * primary. For some applications, for example synchronous
-	 * replication, it is important to have a clear state for this initial
-	 * catchup mode, so we can trigger actions when we change streaming
-	 * state later. We may stay in this state for a long time, which is
-	 * exactly why we want to be able to monitor whether or not we are
-	 * still here.
-	 */
-	WalSndSetState(WALSNDSTATE_CATCHUP);

-	/*
-	 * Don't allow a request to stream from a future point in WAL that
-	 * hasn't been flushed to disk in this server yet.
-	 */
-	if (FlushPtr < cmd->startpoint)
+	if (cmd->timeline != 0)
 	{
-		ereport(ERROR,
-				(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
-						LSN_FORMAT_ARGS(cmd->startpoint),
-						LSN_FORMAT_ARGS(FlushPtr))));
+		XLogRecPtr	switchpoint;
+
+		sendTimeLine = cmd->timeline;
+		if (sendTimeLine == currTLI)
+		{
+			sendTimeLineIsHistoric = false;
+			sendTimeLineValidUpto = InvalidXLogRecPtr;
+		}
+		else
+		{
+			List	   *timeLineHistory;
+
+			sendTimeLineIsHistoric = true;
+
+			/*
+			 * Check that the timeline the client requested exists, and the
+			 * requested start location is on that timeline.
+			 */
+			timeLineHistory = readTimeLineHistory(currTLI);
+			switchpoint = tliSwitchPoint(cmd->timeline, timeLineHistory,
+										 &sendTimeLineNextTLI);
+			list_free_deep(timeLineHistory);
+
+			/*
+			 * Found the requested timeline in the history. Check that
+			 * requested startpoint is on that timeline in our history.
+			 *
+			 * This is quite loose on purpose. We only check that we didn't
+			 * fork off the requested timeline before the switchpoint. We
+			 * don't check that we switched *to* it before the requested
+			 * starting point. This is because the client can legitimately
+			 * request to start replication from the beginning of the WAL
+			 * segment that contains switchpoint, but on the new timeline, so
+			 * that it doesn't end up with a partial segment. If you ask for
+			 * too old a starting point, you'll get an error later when we
+			 * fail to find the requested WAL segment in pg_wal.
+			 *
+			 * XXX: we could be more strict here and only allow a startpoint
+			 * that's older than the switchpoint, if it's still in the same
+			 * WAL segment.
+			 */
+			if (!XLogRecPtrIsInvalid(switchpoint) &&
+				switchpoint < cmd->startpoint)
+			{
+				ereport(ERROR,
+						(errmsg("requested starting point %X/%X on timeline %u is not in this server's history",
+								LSN_FORMAT_ARGS(cmd->startpoint),
+								cmd->timeline),
+						 errdetail("This server's history forked from timeline %u at %X/%X.",
+								   cmd->timeline,
+								   LSN_FORMAT_ARGS(switchpoint))));
+			}
+			sendTimeLineValidUpto = switchpoint;
+		}
+	}
+	else
+	{
+		sendTimeLine = currTLI;
+		sendTimeLineValidUpto = InvalidXLogRecPtr;
+		sendTimeLineIsHistoric = false;
 	}

-	/* Start streaming from the requested point */
-	sentPtr = cmd->startpoint;
+	streamingDoneSending = streamingDoneReceiving = false;

-	/* Initialize shared memory status, too */
-	SpinLockAcquire(&MyWalSnd->mutex);
-	MyWalSnd->sentPtr = sentPtr;
-	SpinLockRelease(&MyWalSnd->mutex);
+	/* If there is nothing to stream, don't even enter COPY mode */
+	if (!sendTimeLineIsHistoric || cmd->startpoint < sendTimeLineValidUpto)
+	{
+		/*
+		 * When we first start replication the standby will be behind the
+		 * primary. For some applications, for example synchronous
+		 * replication, it is important to have a clear state for this initial
+		 * catchup mode, so we can trigger actions when we change streaming
+		 * state later. We may stay in this state for a long time, which is
+		 * exactly why we want to be able to monitor whether or not we are
+		 * still here.
+		 */
+		WalSndSetState(WALSNDSTATE_CATCHUP);

-	SyncRepInitConfig();
+		/*
+		 * Don't allow a request to stream from a future point in WAL that
+		 * hasn't been flushed to disk in this server yet.
+		 */
+		if (FlushPtr < cmd->startpoint)
+		{
+			ereport(ERROR,
+					(errmsg("requested starting point %X/%X is ahead of the WAL flush position of this server %X/%X",
+							LSN_FORMAT_ARGS(cmd->startpoint),
+							LSN_FORMAT_ARGS(FlushPtr))));
+		}

-	/* Infinite send loop, never returns */
-	WalSndLoop();
+		/* Start streaming from the requested point */
+		sentPtr = cmd->startpoint;

-	WalSndSetState(WALSNDSTATE_STARTUP);
+		/* Initialize shared memory status, too */
+		SpinLockAcquire(&MyWalSnd->mutex);
+		MyWalSnd->sentPtr = sentPtr;
+		SpinLockRelease(&MyWalSnd->mutex);
+
+		SyncRepInitConfig();
+
+		/* Main loop of walsender */
+		replication_active = true;
+
+		WalSndLoop(XLogSendPhysical);
+
+		replication_active = false;
+		if (got_STOPPING)
+			proc_exit(0);
+		WalSndSetState(WALSNDSTATE_STARTUP);
+
+		Assert(streamingDoneSending && streamingDoneReceiving);
+	}

 	if (cmd->slotname)
 		ReplicationSlotRelease();
+
+	/*
+	 * Copy is finished now. Send a single-row result set indicating the next
+	 * timeline.
+	 */
+	if (sendTimeLineIsHistoric)
+	{
+		char		startpos_str[8 + 1 + 8 + 1];
+		DestReceiver *dest;
+		TupOutputState *tstate;
+		TupleDesc	tupdesc;
+		Datum		values[2];
+		bool		nulls[2];
+
+		snprintf(startpos_str, sizeof(startpos_str), "%X/%X",
+				 LSN_FORMAT_ARGS(sendTimeLineValidUpto));
+
+		dest = CreateDestReceiver(DestRemoteSimple);
+		MemSet(nulls, false, sizeof(nulls));
+
+		/*
+		 * Need a tuple descriptor representing two columns. int8 may seem
+		 * like a surprising data type for this, but in theory int4 would not
+		 * be wide enough for this, as TimeLineID is unsigned.
+		 */
+		tupdesc = CreateTemplateTupleDesc(2);
+		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 1, "next_tli",
+								  INT8OID, -1, 0);
+		TupleDescInitBuiltinEntry(tupdesc, (AttrNumber) 2, "next_tli_startpos",
+								  TEXTOID, -1, 0);
+
+		/* prepare for projection of tuple */
+		tstate = begin_tup_output_tupdesc(dest, tupdesc, &TTSOpsVirtual);
+
+		values[0] = Int64GetDatum((int64) sendTimeLineNextTLI);
+		values[1] = CStringGetTextDatum(startpos_str);
+
+		/* send it to dest */
+		do_tup_output(tstate, values, nulls);
+
+		end_tup_output(tstate);
+	}
+
+	/* Send CommandComplete message */
+	EndReplicationCommand("START_STREAMING");
 }

-/*
- * Main loop that waits for LSN updates and calls the walproposer.
- * Synchronous replication sets latch in WalSndWakeup at walsender.c
- */
-static void
-WalSndLoop(void)
+#if PG_VERSION_NUM >= 150000
+static XLogRecPtr
+GetStandbyFlushRecPtr(TimeLineID *tli)
 {
+	XLogRecPtr	replayPtr;
+	TimeLineID	replayTLI;
+	XLogRecPtr	receivePtr;
+	TimeLineID	receiveTLI;
+	XLogRecPtr	result;
+
+	/*
+	 * We can safely send what's already been replayed. Also, if walreceiver
+	 * is streaming WAL from the same timeline, we can send anything that it
+	 * has streamed, but hasn't been replayed yet.
+	 */
+
+	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
+	replayPtr = GetXLogReplayRecPtr(&replayTLI);
+
+	*tli = replayTLI;
+
+	result = replayPtr;
+	if (receiveTLI == replayTLI && receivePtr > replayPtr)
+		result = receivePtr;
+
+	return result;
+}
+#else
+/*
+ * Returns the latest point in WAL that has been safely flushed to disk, and
+ * can be sent to the standby. This should only be called when in recovery,
+ * ie. we're streaming to a cascaded standby.
+ *
+ * As a side-effect, ThisTimeLineID is updated to the TLI of the last
+ * replayed WAL record.
+ */
+static XLogRecPtr
+GetStandbyFlushRecPtr(void)
+{
+	XLogRecPtr	replayPtr;
+	TimeLineID	replayTLI;
+	XLogRecPtr	receivePtr;
+	TimeLineID	receiveTLI;
+	XLogRecPtr	result;
+
+	/*
+	 * We can safely send what's already been replayed. Also, if walreceiver
+	 * is streaming WAL from the same timeline, we can send anything that it
+	 * has streamed, but hasn't been replayed yet.
+	 */
+
+	receivePtr = GetWalRcvFlushRecPtr(NULL, &receiveTLI);
+	replayPtr = GetXLogReplayRecPtr(&replayTLI);
+
+	ThisTimeLineID = replayTLI;
+
+	result = replayPtr;
+	if (receiveTLI == ThisTimeLineID && receivePtr > replayPtr)
+		result = receivePtr;
+
+	return result;
+}
+#endif
+
+
+
+/* XLogReaderRoutine->segment_open callback */
+static void
+WalSndSegmentOpen(XLogReaderState *state, XLogSegNo nextSegNo,
+				  TimeLineID *tli_p)
+{
+	char		path[MAXPGPATH];
+
+	/*-------
+	 * When reading from a historic timeline, and there is a timeline switch
+	 * within this segment, read from the WAL segment belonging to the new
+	 * timeline.
+	 *
+	 * For example, imagine that this server is currently on timeline 5, and
+	 * we're streaming timeline 4. The switch from timeline 4 to 5 happened at
+	 * 0/13002088. In pg_wal, we have these files:
+	 *
+	 * ...
+	 * 000000040000000000000012
+	 * 000000040000000000000013
+	 * 000000050000000000000013
+	 * 000000050000000000000014
+	 * ...
+	 *
+	 * In this situation, when requested to send the WAL from segment 0x13, on
+	 * timeline 4, we read the WAL from file 000000050000000000000013. Archive
+	 * recovery prefers files from newer timelines, so if the segment was
+	 * restored from the archive on this server, the file belonging to the old
+	 * timeline, 000000040000000000000013, might not exist. Their contents are
+	 * equal up to the switchpoint, because at a timeline switch, the used
+	 * portion of the old segment is copied to the new file.  -------
+	 */
+	*tli_p = sendTimeLine;
+	if (sendTimeLineIsHistoric)
+	{
+		XLogSegNo	endSegNo;
+
+		XLByteToSeg(sendTimeLineValidUpto, endSegNo, state->segcxt.ws_segsize);
+		if (nextSegNo == endSegNo)
+			*tli_p = sendTimeLineNextTLI;
+	}
+
+	XLogFilePath(path, *tli_p, nextSegNo, state->segcxt.ws_segsize);
+	state->seg.ws_file = BasicOpenFile(path, O_RDONLY | PG_BINARY);
+	if (state->seg.ws_file >= 0)
+		return;
+
+	/*
+	 * If the file is not found, assume it's because the standby asked for a
+	 * too old WAL segment that has already been removed or recycled.
+	 */
+	if (errno == ENOENT)
+	{
+		char		xlogfname[MAXFNAMELEN];
+		int			save_errno = errno;
+
+		XLogFileName(xlogfname, *tli_p, nextSegNo, wal_segment_size);
+		errno = save_errno;
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("requested WAL segment %s has already been removed",
+						xlogfname)));
+	}
+	else
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m",
+						path)));
+}
+
+
+/* Main loop of walsender process that streams the WAL over Copy messages. */
+static void
+WalSndLoop(WalSndSendDataCallback send_data)
+{
+	/*
+	 * Initialize the last reply timestamp. That enables timeout processing
+	 * from hereon.
+	 */
+	last_reply_timestamp = GetCurrentTimestamp();
+	waiting_for_ping_response = false;
+
+	/*
+	 * Loop until we reach the end of this timeline or the client requests to
+	 * stop streaming.
+	 */
 	for (;;)
 	{
 		/* Clear any already-pending wakeups */
@@ -565,41 +911,153 @@ WalSndLoop(void)

 		CHECK_FOR_INTERRUPTS();

-		XLogBroadcastWalProposer();
+		/* Process any requests or signals received recently */
+		if (ConfigReloadPending)
+		{
+			ConfigReloadPending = false;
+			ProcessConfigFile(PGC_SIGHUP);
+			SyncRepInitConfig();
+		}

-		if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
-			WalSndSetState(WALSNDSTATE_STREAMING);
-		WalProposerPoll();
+		/* always true */
+		if (am_wal_proposer)
+		{
+			send_data();
+			if (WalSndCaughtUp)
+			{
+				if (MyWalSnd->state == WALSNDSTATE_CATCHUP)
+					WalSndSetState(WALSNDSTATE_STREAMING);
+				WalProposerPoll();
+				WalSndCaughtUp = false;
+			}
+			continue;
+		}
 	}
 }

 /*
- * Notify walproposer about the new WAL position.
+ * Send out the WAL in its normal physical/stored form.
+ *
+ * Read up to MAX_SEND_SIZE bytes of WAL that's been flushed to disk,
+ * but not yet sent to the client, and buffer it in the libpq output
+ * buffer.
+ *
+ * If there is no unsent WAL remaining, WalSndCaughtUp is set to true,
+ * otherwise WalSndCaughtUp is set to false.
 */
 static void
-XLogBroadcastWalProposer(void)
+XLogSendPhysical(void)
 {
+	XLogRecPtr	SendRqstPtr;
 	XLogRecPtr	startptr;
 	XLogRecPtr	endptr;
+	Size		nbytes PG_USED_FOR_ASSERTS_ONLY;
+	TimeLineID	currTLI;

-	/* Start from the last sent position */
-	startptr = sentPtr;
+	/* If requested switch the WAL sender to the stopping state. */
+	if (got_STOPPING)
+		WalSndSetState(WALSNDSTATE_STOPPING);

-	/*
-	 * Streaming the current timeline on a primary.
-	 *
-	 * Attempt to send all data that's already been written out and
-	 * fsync'd to disk.  We cannot go further than what's been written out
-	 * given the current implementation of WALRead().  And in any case
-	 * it's unsafe to send WAL that is not securely down to disk on the
-	 * primary: if the primary subsequently crashes and restarts, standbys
-	 * must not have applied any WAL that got lost on the primary.
-	 */
+	if (streamingDoneSending)
+	{
+		WalSndCaughtUp = true;
+		return;
+	}
+
+	/* Figure out how far we can safely send the WAL. */
+	if (sendTimeLineIsHistoric)
+	{
+		/*
+		 * Streaming an old timeline that's in this server's history, but is
+		 * not the one we're currently inserting or replaying. It can be
+		 * streamed up to the point where we switched off that timeline.
+		 */
+		SendRqstPtr = sendTimeLineValidUpto;
+	}
+	else if (am_cascading_walsender)
+	{
+		/*
+		 * Streaming the latest timeline on a standby.
+		 *
+		 * Attempt to send all WAL that has already been replayed, so that we
+		 * know it's valid. If we're receiving WAL through streaming
+		 * replication, it's also OK to send any WAL that has been received
+		 * but not replayed.
+		 *
+		 * The timeline we're recovering from can change, or we can be
+		 * promoted. In either case, the current timeline becomes historic. We
+		 * need to detect that so that we don't try to stream past the point
+		 * where we switched to another timeline. We check for promotion or
+		 * timeline switch after calculating FlushPtr, to avoid a race
+		 * condition: if the timeline becomes historic just after we checked
+		 * that it was still current, it's still OK to stream it up to the
+		 * FlushPtr that was calculated before it became historic.
+		 */
+		bool		becameHistoric = false;
 #if PG_VERSION_NUM >= 150000
-	endptr = GetFlushRecPtr(NULL);
+		SendRqstPtr = GetStandbyFlushRecPtr(&currTLI);
 #else
-	endptr = GetFlushRecPtr();
+		SendRqstPtr = GetStandbyFlushRecPtr();
+		currTLI = ThisTimeLineID;
 #endif
+		if (!RecoveryInProgress())
+		{
+			/*
+			 * We have been promoted. RecoveryInProgress() updated
+			 * ThisTimeLineID to the new current timeline.
+			 */
+			am_cascading_walsender = false;
+			becameHistoric = true;
+		}
+		else
+		{
+			/*
+			 * Still a cascading standby. But is the timeline we're sending
+			 * still the one recovery is recovering from? currTLI was updated
+			 * by the GetStandbyFlushRecPtr() call above.
+			 */
+			if (sendTimeLine != currTLI)
+				becameHistoric = true;
+		}
+
+		if (becameHistoric)
+		{
+			/*
+			 * The timeline we were sending has become historic. Read the
+			 * timeline history file of the new timeline to see where exactly
+			 * we forked off from the timeline we were sending.
+			 */
+			List	   *history;
+
+			history = readTimeLineHistory(currTLI);
+			sendTimeLineValidUpto = tliSwitchPoint(sendTimeLine, history, &sendTimeLineNextTLI);
+
+			Assert(sendTimeLine < sendTimeLineNextTLI);
+			list_free_deep(history);
+
+			sendTimeLineIsHistoric = true;
+
+			SendRqstPtr = sendTimeLineValidUpto;
+		}
+	}
+	else
+	{
+		/*
+		 * Streaming the current timeline on a primary.
+		 *
+		 * Attempt to send all data that's already been written out and
+		 * fsync'd to disk.  We cannot go further than what's been written out
+		 * given the current implementation of WALRead().  And in any case
+		 * it's unsafe to send WAL that is not securely down to disk on the
+		 * primary: if the primary subsequently crashes and restarts, standbys
+		 * must not have applied any WAL that got lost on the primary.
+		 */
+#if PG_VERSION_NUM >= 150000
+		SendRqstPtr = GetFlushRecPtr(NULL);
+#else
+		SendRqstPtr = GetFlushRecPtr();
+#endif
+	}

 	/*
 	 * Record the current system time as an approximation of the time at which
@@ -625,14 +1083,91 @@ XLogBroadcastWalProposer(void)
 	 * that arbitrary LSN is eventually reported as written, flushed and
 	 * applied, so that it can measure the elapsed time.
 	 */
-	LagTrackerWrite(endptr, GetCurrentTimestamp());
+	LagTrackerWrite(SendRqstPtr, GetCurrentTimestamp());
+
+	/*
+	 * If this is a historic timeline and we've reached the point where we
+	 * forked to the next timeline, stop streaming.
+	 *
+	 * Note: We might already have sent WAL > sendTimeLineValidUpto. The
+	 * startup process will normally replay all WAL that has been received
+	 * from the primary, before promoting, but if the WAL streaming is
+	 * terminated at a WAL page boundary, the valid portion of the timeline
+	 * might end in the middle of a WAL record. We might've already sent the
+	 * first half of that partial WAL record to the cascading standby, so that
+	 * sentPtr > sendTimeLineValidUpto. That's OK; the cascading standby can't
+	 * replay the partial WAL record either, so it can still follow our
+	 * timeline switch.
+	 */
+	if (sendTimeLineIsHistoric && sendTimeLineValidUpto <= sentPtr)
+	{
+		/* close the current file. */
+		if (xlogreader->seg.ws_file >= 0)
+			wal_segment_close(xlogreader);
+
+		/* Send CopyDone */
+		pq_putmessage_noblock('c', NULL, 0);
+		streamingDoneSending = true;
+
+		WalSndCaughtUp = true;
+
+		elog(DEBUG1, "walsender reached end of timeline at %X/%X (sent up to %X/%X)",
+			 LSN_FORMAT_ARGS(sendTimeLineValidUpto),
+			 LSN_FORMAT_ARGS(sentPtr));
+		return;
+	}

 	/* Do we have any work to do? */
-	Assert(startptr <= endptr);
-	if (endptr <= startptr)
+	Assert(sentPtr <= SendRqstPtr);
+	if (SendRqstPtr <= sentPtr)
+	{
+		WalSndCaughtUp = true;
 		return;
+	}

-	WalProposerBroadcast(startptr, endptr);
+	/*
+	 * Figure out how much to send in one message. If there's no more than
+	 * MAX_SEND_SIZE bytes to send, send everything. Otherwise send
+	 * MAX_SEND_SIZE bytes, but round back to logfile or page boundary.
+	 *
+	 * The rounding is not only for performance reasons. Walreceiver relies on
+	 * the fact that we never split a WAL record across two messages. Since a
+	 * long WAL record is split at page boundary into continuation records,
+	 * page boundary is always a safe cut-off point. We also assume that
+	 * SendRqstPtr never points to the middle of a WAL record.
+	 */
+	startptr = sentPtr;
+	endptr = startptr;
+	endptr += MAX_SEND_SIZE;
+
+	/* if we went beyond SendRqstPtr, back off */
+	if (SendRqstPtr <= endptr)
+	{
+		endptr = SendRqstPtr;
+		if (sendTimeLineIsHistoric)
+			WalSndCaughtUp = false;
+		else
+			WalSndCaughtUp = true;
+	}
+	else
+	{
+		/* round down to page boundary. */
+		endptr -= (endptr % XLOG_BLCKSZ);
+		WalSndCaughtUp = false;
+	}
+
+	nbytes = endptr - startptr;
+	Assert(nbytes <= MAX_SEND_SIZE);
+
+	/* always true */
+	if (am_wal_proposer)
+	{
+		WalProposerBroadcast(startptr, endptr);
+	}
+	else
+	{
+		/* code removed for brevity */
+	}
 	sentPtr = endptr;

 	/* Update shared memory status */
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -8,7 +8,6 @@ use super::{
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use tokio::time::Instant;
 use tokio_postgres::config::SslMode;
 use tracing::{error, info, info_span, warn, Instrument};

@@ -48,9 +47,7 @@ impl Api {
                .build()?;

            info!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
            let response = self.endpoint.execute(request).await?;
-            info!(duration = ?start.elapsed(), "received http response");
            let body = match parse_body::<GetRoleSecret>(response).await {
                Ok(body) => body,
                // Error 404 is special: it's ok not to have a secret.
@@ -91,9 +88,7 @@ impl Api {
                .build()?;

            info!(url = request.url().as_str(), "sending http request");
-            let start = Instant::now();
            let response = self.endpoint.execute(request).await?;
-            info!(duration = ?start.elapsed(), "received http response");
            let body = parse_body::<WakeCompute>(response).await?;

            // Unfortunately, ownership won't let us use `Option::ok_or` here.
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -7,14 +7,11 @@ pub mod server;
 pub mod sql_over_http;
 pub mod websocket;

-use std::{sync::Arc, time::Duration};
+use std::time::Duration;

-use futures::FutureExt;
 pub use reqwest::{Request, Response, StatusCode};
 pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
-use tokio::time::Instant;
-use tracing::trace;

 use crate::url::ApiUrl;
 use reqwest_middleware::RequestBuilder;
@@ -23,21 +20,13 @@ use reqwest_middleware::RequestBuilder;
 /// because it takes care of observability (OpenTelemetry).
 /// We deliberately don't want to replace this with a public static.
 pub fn new_client() -> ClientWithMiddleware {
-    let client = reqwest::ClientBuilder::new()
-        .dns_resolver(Arc::new(GaiResolver::default()))
-        .connection_verbose(true)
-        .build()
-        .expect("Failed to create http client");
-
-    reqwest_middleware::ClientBuilder::new(client)
+    reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
        .with(reqwest_tracing::TracingMiddleware::default())
        .build()
 }

 pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware {
    let timeout_client = reqwest::ClientBuilder::new()
-        .dns_resolver(Arc::new(GaiResolver::default()))
-        .connection_verbose(true)
        .timeout(default_timout)
        .build()
        .expect("Failed to create http client with timeout");
@@ -50,10 +39,6 @@ pub fn new_client_with_timeout(default_timout: Duration) -> ClientWithMiddleware
        // As per docs, "This middleware always errors when given requests with streaming bodies".
        // That's all right because we only use this client to send `serde_json::RawValue`, which
        // is not a stream.
-        //
-        // ex-maintainer note:
-        // this limitation can be fixed if streaming is necessary.
-        // retries will still not be performed, but it wont error immediately
        .with(RetryTransientMiddleware::new_with_policy(retry_policy))
        .build()
 }
@@ -96,37 +81,6 @@ impl Endpoint {
    }
 }

-/// https://docs.rs/reqwest/0.11.18/src/reqwest/dns/gai.rs.html
-use hyper::{
-    client::connect::dns::{GaiResolver as HyperGaiResolver, Name},
-    service::Service,
-};
-use reqwest::dns::{Addrs, Resolve, Resolving};
-#[derive(Debug)]
-pub struct GaiResolver(HyperGaiResolver);
-
-impl Default for GaiResolver {
-    fn default() -> Self {
-        Self(HyperGaiResolver::new())
-    }
-}
-
-impl Resolve for GaiResolver {
-    fn resolve(&self, name: Name) -> Resolving {
-        let this = &mut self.0.clone();
-        let start = Instant::now();
-        Box::pin(
-            Service::<Name>::call(this, name.clone()).map(move |result| {
-                let resolve_duration = start.elapsed();
-                trace!(duration = ?resolve_duration, addr = %name, "resolve host complete");
-                result
-                    .map(|addrs| -> Addrs { Box::new(addrs) })
-                    .map_err(|err| -> Box<dyn std::error::Error + Send + Sync> { Box::new(err) })
-            }),
-        )
-    }
-}
-
 #[cfg(test)]
 mod tests {
    use super::*;
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -34,7 +34,7 @@ enum Payload {
    Batch(Vec<QueryData>),
 }

-pub const MAX_RESPONSE_SIZE: usize = 10 * 1024 * 1024; // 10 MB
+pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB

 static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
@@ -214,7 +214,7 @@ pub async fn handle(

    if request_content_length > MAX_REQUEST_SIZE {
        return Err(anyhow::anyhow!(
-            "request is too large (max is {MAX_REQUEST_SIZE} bytes)"
+            "request is too large (max {MAX_REQUEST_SIZE} bytes)"
        ));
    }

@@ -292,15 +292,13 @@ async fn query_to_json<T: GenericClient>(
    // big.
    pin_mut!(row_stream);
    let mut rows: Vec<tokio_postgres::Row> = Vec::new();
-    let mut current_size = 0;
+    let mut curret_size = 0;
    while let Some(row) = row_stream.next().await {
        let row = row?;
-        current_size += row.body_len();
+        curret_size += row.body_len();
        rows.push(row);
-        if current_size > MAX_RESPONSE_SIZE {
-            return Err(anyhow::anyhow!(
-                "response is too large (max is {MAX_RESPONSE_SIZE} bytes)"
-            ));
+        if curret_size > MAX_RESPONSE_SIZE {
+            return Err(anyhow::anyhow!("response too large"));
        }
    }

--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -187,16 +187,12 @@ async fn ws_handler(
        let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None)
            .map_err(|e| ApiError::BadRequest(e.into()))?;

-        tokio::spawn(
-            async move {
-                if let Err(e) =
-                    serve_websocket(websocket, config, &cancel_map, session_id, host).await
-                {
-                    error!(session_id = ?session_id, "error in websocket connection: {e:#}");
-                }
+        tokio::spawn(async move {
+            if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await
+            {
+                error!(session_id = ?session_id, "error in websocket connection: {e:?}");
            }
-            .in_current_span(),
-        );
+        });

        // Return the response so the spawned future can continue.
        Ok(response)
@@ -221,10 +217,6 @@ async fn ws_handler(
                    },
                    None => Value::Null,
                };
-                error!(
-                    ?code,
-                    "sql-over-http per-client task finished with an error: {e:#}"
-                );
                (
                    json!({ "message": message, "code": code }),
                    HashMap::default(),
--- a/safekeeper/src/handler.rs
+++ b/safekeeper/src/handler.rs
@@ -2,8 +2,8 @@
 //! protocol commands.

 use anyhow::Context;
+use std::str;
 use std::str::FromStr;
-use std::str::{self};
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, info_span, Instrument};

@@ -11,7 +11,6 @@ use crate::auth::check_permission;
 use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage};

 use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED};
-use crate::safekeeper::Term;
 use crate::timeline::TimelineError;
 use crate::wal_service::ConnectionId;
 use crate::{GlobalTimelines, SafeKeeperConf};
@@ -45,7 +44,7 @@ pub struct SafekeeperPostgresHandler {
 /// Parsed Postgres command.
 enum SafekeeperPostgresCommand {
    StartWalPush,
-    StartReplication { start_lsn: Lsn, term: Option<Term> },
+    StartReplication { start_lsn: Lsn },
    IdentifySystem,
    TimelineStatus,
    JSONCtrl { cmd: AppendLogicalMessage },
@@ -56,21 +55,15 @@ fn parse_cmd(cmd: &str) -> anyhow::Result<SafekeeperPostgresCommand> {
        Ok(SafekeeperPostgresCommand::StartWalPush)
    } else if cmd.starts_with("START_REPLICATION") {
        let re = Regex::new(
-            // We follow postgres START_REPLICATION LOGICAL options to pass term.
-            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)(?: \(term='(\d+)'\))?",
+            r"START_REPLICATION(?: SLOT [^ ]+)?(?: PHYSICAL)? ([[:xdigit:]]+/[[:xdigit:]]+)",
        )
        .unwrap();
-        let caps = re
-            .captures(cmd)
-            .context(format!("failed to parse START_REPLICATION command {}", cmd))?;
-        let start_lsn =
-            Lsn::from_str(&caps[1]).context("parse start LSN from START_REPLICATION command")?;
-        let term = if let Some(m) = caps.get(2) {
-            Some(m.as_str().parse::<u64>().context("invalid term")?)
-        } else {
-            None
-        };
-        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn, term })
+        let mut caps = re.captures_iter(cmd);
+        let start_lsn = caps
+            .next()
+            .map(|cap| Lsn::from_str(&cap[1]))
+            .context("parse start LSN from START_REPLICATION command")??;
+        Ok(SafekeeperPostgresCommand::StartReplication { start_lsn })
    } else if cmd.starts_with("IDENTIFY_SYSTEM") {
        Ok(SafekeeperPostgresCommand::IdentifySystem)
    } else if cmd.starts_with("TIMELINE_STATUS") {
@@ -225,8 +218,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin + Send> postgres_backend::Handler<IO>
                    .instrument(info_span!("WAL receiver", ttid = %span_ttid))
                    .await
            }
-            SafekeeperPostgresCommand::StartReplication { start_lsn, term } => {
-                self.handle_start_replication(pgb, start_lsn, term)
+            SafekeeperPostgresCommand::StartReplication { start_lsn } => {
+                self.handle_start_replication(pgb, start_lsn)
                    .instrument(info_span!("WAL sender", ttid = %span_ttid))
                    .await
            }
--- a/safekeeper/src/send_wal.rs
+++ b/safekeeper/src/send_wal.rs
@@ -2,7 +2,6 @@
 //! with the "START_REPLICATION" message, and registry of walsenders.

 use crate::handler::SafekeeperPostgresHandler;
-use crate::safekeeper::Term;
 use crate::timeline::Timeline;
 use crate::wal_service::ConnectionId;
 use crate::wal_storage::WalReader;
@@ -360,12 +359,8 @@ impl SafekeeperPostgresHandler {
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        start_pos: Lsn,
-        term: Option<Term>,
    ) -> Result<(), QueryError> {
-        if let Err(end) = self
-            .handle_start_replication_guts(pgb, start_pos, term)
-            .await
-        {
+        if let Err(end) = self.handle_start_replication_guts(pgb, start_pos).await {
            // Log the result and probably send it to the client, closing the stream.
            pgb.handle_copy_stream_end(end).await;
        }
@@ -376,7 +371,6 @@ impl SafekeeperPostgresHandler {
        &mut self,
        pgb: &mut PostgresBackend<IO>,
        start_pos: Lsn,
-        term: Option<Term>,
    ) -> Result<(), CopyStreamHandlerEnd> {
        let appname = self.appname.clone();
        let tli =
@@ -446,7 +440,6 @@ impl SafekeeperPostgresHandler {
            start_pos,
            end_pos,
            stop_pos,
-            term,
            commit_lsn_watch_rx,
            ws_guard: ws_guard.clone(),
            wal_reader,
@@ -483,10 +476,6 @@ struct WalSender<'a, IO> {
    // If present, terminate after reaching this position; used by walproposer
    // in recovery.
    stop_pos: Option<Lsn>,
-    /// When streaming uncommitted part, the term the client acts as the leader
-    /// in. Streaming is stopped if local term changes to a different (higher)
-    /// value.
-    term: Option<Term>,
    commit_lsn_watch_rx: Receiver<Lsn>,
    ws_guard: Arc<WalSenderGuard>,
    wal_reader: WalReader,
@@ -529,18 +518,8 @@ impl<IO: AsyncRead + AsyncWrite + Unpin> WalSender<'_, IO> {
                .0 as usize;
            send_size = min(send_size, self.send_buf.len());
            let send_buf = &mut self.send_buf[..send_size];
-            let send_size: usize;
-            {
-                // If uncommitted part is being pulled, check that the term is
-                // still the expected one.
-                let _term_guard = if let Some(t) = self.term {
-                    Some(self.tli.acquire_term(t).await?)
-                } else {
-                    None
-                };
-                // read wal into buffer
-                send_size = self.wal_reader.read(send_buf).await?
-            };
+            // read wal into buffer
+            send_size = self.wal_reader.read(send_buf).await?;
            let send_buf = &send_buf[..send_size];

            // and send it
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -499,19 +499,6 @@ impl Timeline {
        false
    }

-    /// Ensure taht current term is t, erroring otherwise, and lock the state.
-    pub async fn acquire_term(&self, t: Term) -> Result<MutexGuard<SharedState>> {
-        let ss = self.write_shared_state().await;
-        if ss.sk.state.acceptor_state.term != t {
-            bail!(
-                "failed to acquire term {}, current term {}",
-                t,
-                ss.sk.state.acceptor_state.term
-            );
-        }
-        Ok(ss)
-    }
-
    /// Returns whether s3 offloading is required and sets current status as
    /// matching it.
    pub async fn wal_backup_attend(&self) -> bool {
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -32,7 +32,6 @@ import requests
 from _pytest.config import Config
 from _pytest.config.argparsing import Parser
 from _pytest.fixtures import FixtureRequest
-from mypy_boto3_s3 import S3Client

 # Type-related stuff
 from psycopg2.extensions import connection as PgConnection
@@ -433,7 +432,7 @@ class NeonEnvBuilder:
        self.port_distributor = port_distributor
        self.remote_storage = remote_storage
        self.ext_remote_storage: Optional[S3Storage] = None
-        self.remote_storage_client: Optional[S3Client] = None
+        self.remote_storage_client: Optional[Any] = None
        self.remote_storage_users = remote_storage_users
        self.broker = broker
        self.run_id = run_id
@@ -876,14 +875,7 @@ class NeonEnv:

    def timeline_dir(self, tenant_id: TenantId, timeline_id: TimelineId) -> Path:
        """Get a timeline directory's path based on the repo directory of the test environment"""
-        return self.tenant_dir(tenant_id) / "timelines" / str(timeline_id)
-
-    def tenant_dir(
-        self,
-        tenant_id: TenantId,
-    ) -> Path:
-        """Get a tenant directory's path based on the repo directory of the test environment"""
-        return self.repo_dir / "tenants" / str(tenant_id)
+        return self.repo_dir / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)

    def get_pageserver_version(self) -> str:
        bin_pageserver = str(self.neon_binpath / "pageserver")
@@ -1526,8 +1518,6 @@ class NeonPageserver(PgProtocol):
            # Pageserver timeline deletion should be polled until it gets 404, so ignore it globally
            ".*Error processing HTTP request: NotFound: Timeline .* was not found",
            ".*took more than expected to complete.*",
-            # these can happen during shutdown, but it should not be a reason to fail a test
-            ".*completed, took longer than expected.*",
        ]

    def start(
@@ -2827,15 +2817,8 @@ def check_restored_datadir_content(
    endpoint: Endpoint,
 ):
    # Get the timeline ID. We need it for the 'basebackup' command
-    timeline_id = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])
+    timeline = TimelineId(endpoint.safe_psql("SHOW neon.timeline_id")[0][0])

-    # many tests already checkpoint, but do it just in case
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("CHECKPOINT")
-
-    # wait for pageserver to catch up
-    wait_for_last_flush_lsn(env, endpoint, endpoint.tenant_id, timeline_id)
    # stop postgres to ensure that files won't change
    endpoint.stop()

@@ -2850,7 +2833,7 @@ def check_restored_datadir_content(
        {psql_path}                                    \
            --no-psqlrc                                \
            postgres://localhost:{env.pageserver.service_port.pg}  \
-            -c 'basebackup {endpoint.tenant_id} {timeline_id}'  \
+            -c 'basebackup {endpoint.tenant_id} {timeline}'  \
         | tar -x -C {restored_dir_path}
    """

--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -210,10 +210,6 @@ class PageserverHttpClient(requests.Session):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/detach", params=params)
        self.verbose_error(res)

-    def tenant_delete(self, tenant_id: TenantId):
-        res = self.delete(f"http://localhost:{self.port}/v1/tenant/{tenant_id}")
-        self.verbose_error(res)
-
    def tenant_load(self, tenant_id: TenantId):
        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/load")
        self.verbose_error(res)
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,11 +1,9 @@
 import time
-from typing import TYPE_CHECKING, Any, Dict, Optional
+from typing import Any, Dict, Optional

 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
-from fixtures.remote_storage import RemoteStorageKind, S3Storage
 from fixtures.types import Lsn, TenantId, TimelineId
-from fixtures.utils import wait_until


 def assert_tenant_state(
@@ -19,6 +17,15 @@ def assert_tenant_state(
    assert tenant_status["state"]["slug"] == expected_state, message or tenant_status


+def tenant_exists(pageserver_http: PageserverHttpClient, tenant_id: TenantId):
+    """Return the `tenant_list()` entry whose id equals `tenant_id`, or None if absent."""
+    found = [
+        entry for entry in pageserver_http.tenant_list() if TenantId(entry["id"]) == tenant_id
+    ]
+    assert len(found) < 2
+    return found[0] if found else None
+
+
 def remote_consistent_lsn(
    pageserver_http: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
 ) -> Lsn:
@@ -192,19 +199,21 @@ def wait_timeline_detail_404(
    timeline_id: TimelineId,
    iterations: int,
 ):
-    def timeline_is_missing():
-        data = {}
+    last_exc = None
+    data: Dict[str, Any] = {}  # pre-bound: the final raise reads it even if no poll ran
+    for _ in range(iterations):
+        time.sleep(0.250)
        try:
            data = pageserver_http.timeline_detail(tenant_id, timeline_id)
-            log.info(f"timeline detail {data}")
+            log.info(f"detail {data}")
        except PageserverApiException as e:
            log.debug(e)
            if e.status_code == 404:
                return

-        raise RuntimeError(f"Timeline exists state {data.get('state')}")
+            last_exc = e

-    wait_until(iterations, interval=0.250, func=timeline_is_missing)
+    raise last_exc or RuntimeError(f"Timeline wasnt deleted in time, state: {data.get('state')}")


 def timeline_delete_wait_completed(
@@ -216,72 +224,3 @@ def timeline_delete_wait_completed(
 ):
    pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args)
    wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, iterations)
-
-
-if TYPE_CHECKING:
-    # TODO avoid by combining remote storage related stuff in single type
-    # and just passing in this type instead of whole builder
-    from fixtures.neon_fixtures import NeonEnvBuilder
-
-
-def assert_prefix_empty(neon_env_builder: "NeonEnvBuilder", prefix: Optional[str] = None):
-    # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
-    assert neon_env_builder.remote_storage_kind in (
-        RemoteStorageKind.MOCK_S3,
-        RemoteStorageKind.REAL_S3,
-    )
-    # For mypy
-    assert isinstance(neon_env_builder.remote_storage, S3Storage)
-    assert neon_env_builder.remote_storage_client is not None
-
-    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
-    response = neon_env_builder.remote_storage_client.list_objects_v2(
-        Bucket=neon_env_builder.remote_storage.bucket_name,
-        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
-    )
-    objects = response.get("Contents")
-    assert (
-        response["KeyCount"] == 0
-    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
-
-
-def wait_tenant_status_404(
-    pageserver_http: PageserverHttpClient,
-    tenant_id: TenantId,
-    iterations: int,
-    interval: float = 0.250,
-):
-    def tenant_is_missing():
-        data = {}
-        try:
-            data = pageserver_http.tenant_status(tenant_id)
-            log.info(f"tenant status {data}")
-        except PageserverApiException as e:
-            log.debug(e)
-            if e.status_code == 404:
-                return
-
-        raise RuntimeError(f"Timeline exists state {data.get('state')}")
-
-    wait_until(iterations, interval=interval, func=tenant_is_missing)
-
-
-def tenant_delete_wait_completed(
-    pageserver_http: PageserverHttpClient,
-    tenant_id: TenantId,
-    iterations: int,
-):
-    pageserver_http.tenant_delete(tenant_id=tenant_id)
-    wait_tenant_status_404(pageserver_http, tenant_id=tenant_id, iterations=iterations)
-
-
-MANY_SMALL_LAYERS_TENANT_CONFIG = {
-    "gc_period": "0s",
-    "compaction_period": "0s",
-    "checkpoint_distance": f"{1024**2}",
-    "image_creation_threshold": "100",
-}
-
-
-def poll_for_remote_storage_iterations(remote_storage_kind: RemoteStorageKind) -> int:
-    return 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 6
--- a/test_runner/fixtures/utils.py
+++ b/test_runner/fixtures/utils.py
@@ -6,16 +6,13 @@ import subprocess
 import tarfile
 import time
 from pathlib import Path
-from typing import TYPE_CHECKING, Any, Callable, Dict, List, Tuple, TypeVar
+from typing import Any, Callable, Dict, List, Tuple, TypeVar
 from urllib.parse import urlencode

 import allure
 from psycopg2.extensions import cursor

 from fixtures.log_helper import log
-
-if TYPE_CHECKING:
-    from fixtures.neon_fixtures import PgBin
 from fixtures.types import TimelineId

 Fn = TypeVar("Fn", bound=Callable[..., Any])
@@ -303,13 +300,17 @@ def wait_until(number_of_iterations: int, interval: float, func: Fn):
    raise Exception("timed out while waiting for %s" % func) from last_exception


-def run_pg_bench_small(pg_bin: "PgBin", connstr: str):
+def wait_while(number_of_iterations: int, interval: float, func):
    """
-    Fast way to populate data.
-    For more layers consider combining with these tenant settings:
-    {
-        "checkpoint_distance": 1024 ** 2,
-        "image_creation_threshold": 100,
-    }
+    Poll 'func' up to number_of_iterations times until it returns falsy or raises.
    """
-    pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", connstr])
+    for attempt in range(1, number_of_iterations + 1):
+        try:
+            if func():
+                log.info("waiting for %s iteration %s failed", func, attempt)
+                time.sleep(interval)
+            else:
+                return
+        except Exception:
+            return
+    raise Exception("timed out while waiting for %s" % func)
--- a/test_runner/regress/test_download_extensions.py
+++ b/test_runner/regress/test_download_extensions.py
@@ -82,7 +82,6 @@ def upload_files(env):

 # Test downloading remote extension.
@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
-@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
 def test_remote_extensions(
    neon_env_builder: NeonEnvBuilder,
    remote_storage_kind: RemoteStorageKind,
@@ -111,7 +110,7 @@ def test_remote_extensions(
        "test_remote_extensions",
        tenant_id=tenant_id,
        remote_ext_config=env.ext_remote_storage.to_string(),
-        # config_lines=["log_min_messages=debug3"],
+        config_lines=["shared_preload_libraries='neon,ololo'"],
    )
    try:
        with closing(endpoint.connect()) as conn:
@@ -122,206 +121,203 @@ def test_remote_extensions(
                log.info(all_extensions)
                assert "anon" in all_extensions

-                # postgis is on real s3 but not mock s3.
-                # it's kind of a big file, would rather not upload to github
-                if remote_storage_kind == RemoteStorageKind.REAL_S3:
-                    assert "postgis" in all_extensions
-                    # this may fail locally if dependency is missing
-                    # we don't really care about the error,
-                    # we just want to make sure it downloaded
-                    try:
-                        cur.execute("CREATE EXTENSION postgis")
-                    except Exception as err:
-                        log.info(f"(expected) error creating postgis extension: {err}")
-                        # we do not check the error, so this is basically a NO-OP
-                        # however checking the log you can make sure that it worked
-                        # and also get valuable information about how long loading the extension took
+                # # postgis is on real s3 but not mock s3.
+                # # it's kind of a big file, would rather not upload to github
+                # if remote_storage_kind == RemoteStorageKind.REAL_S3:
+                #     assert "postgis" in all_extensions
+                #     # this may fail locally if dependency is missing
+                #     # we don't really care about the error,
+                #     # we just want to make sure it downloaded
+                #     try:
+                #         cur.execute("CREATE EXTENSION postgis")
+                #     except Exception as err:
+                #         log.info(f"(expected) error creating postgis extension: {err}")
+                #         # we do not check the error, so this is basically a NO-OP
+                #         # however checking the log you can make sure that it worked
+                #         # and also get valuable information about how long loading the extension took

-                # this is expected to fail on my computer because I don't have the pgcrypto extension
-                try:
-                    cur.execute("CREATE EXTENSION anon")
-                except Exception as err:
-                    log.info("error creating anon extension")
-                    assert "pgcrypto" in str(err), "unexpected error creating anon extension"
+                # # this is expected to fail on my computer because I don't have the pgcrypto extension
+                # try:
+                #     cur.execute("CREATE EXTENSION anon")
+                # except Exception as err:
+                #     log.info("error creating anon extension")
+                #     assert "pgcrypto" in str(err), "unexpected error creating anon extension"
    finally:
        cleanup(pg_version)


-# Test downloading remote library.
-@pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
-@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
-def test_remote_library(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-    pg_version: PgVersion,
-):
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=remote_storage_kind,
-        test_name="test_remote_library",
-        enable_remote_extensions=True,
-    )
-    env = neon_env_builder.init_start()
-    tenant_id, _ = env.neon_cli.create_tenant()
-    env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id)
+# # Test downloading remote library.
+# @pytest.mark.parametrize("remote_storage_kind", available_s3_storages())
+# def test_remote_library(
+#     neon_env_builder: NeonEnvBuilder,
+#     remote_storage_kind: RemoteStorageKind,
+#     pg_version: PgVersion,
+# ):
+#     neon_env_builder.enable_remote_storage(
+#         remote_storage_kind=remote_storage_kind,
+#         test_name="test_remote_library",
+#         enable_remote_extensions=True,
+#     )
+#     env = neon_env_builder.init_start()
+#     tenant_id, _ = env.neon_cli.create_tenant()
+#     env.neon_cli.create_timeline("test_remote_library", tenant_id=tenant_id)

-    assert env.ext_remote_storage is not None  # satisfy mypy
-    assert env.remote_storage_client is not None  # satisfy mypy
+#     assert env.ext_remote_storage is not None  # satisfy mypy
+#     assert env.remote_storage_client is not None  # satisfy mypy

-    # For MOCK_S3 we upload test files.
-    # For REAL_S3 we use the files already in the bucket
-    if remote_storage_kind == RemoteStorageKind.MOCK_S3:
-        upload_files(env)
+#     # For MOCK_S3 we upload test files.
+#     # For REAL_S3 we use the files already in the bucket
+#     if remote_storage_kind == RemoteStorageKind.MOCK_S3:
+#         upload_files(env)

-    # and use them to run LOAD library
-    endpoint = env.endpoints.create_start(
-        "test_remote_library",
-        tenant_id=tenant_id,
-        remote_ext_config=env.ext_remote_storage.to_string(),
-        # config_lines=["log_min_messages=debug3"],
-    )
-    try:
-        with closing(endpoint.connect()) as conn:
-            with conn.cursor() as cur:
-                # try to load library
-                try:
-                    cur.execute("LOAD 'anon'")
-                except Exception as err:
-                    log.info(f"error loading anon library: {err}")
-                    raise AssertionError("unexpected error loading anon library") from err
+#     # and use them to run LOAD library
+#     endpoint = env.endpoints.create_start(
+#         "test_remote_library",
+#         tenant_id=tenant_id,
+#         remote_ext_config=env.ext_remote_storage.to_string(),
+#         # config_lines=["log_min_messages=debug3"],
+#     )
+#     try:
+#         with closing(endpoint.connect()) as conn:
+#             with conn.cursor() as cur:
+#                 # try to load library
+#                 try:
+#                     cur.execute("LOAD 'anon'")
+#                 except Exception as err:
+#                     log.info(f"error loading anon library: {err}")
+#                     raise AssertionError("unexpected error loading anon library") from err

-                # test library which name is different from extension name
-                # this may fail locally if dependency is missing
-                # however, it does successfully download the postgis archive
-                if remote_storage_kind == RemoteStorageKind.REAL_S3:
-                    try:
-                        cur.execute("LOAD 'postgis_topology-3'")
-                    except Exception as err:
-                        log.info("error loading postgis_topology-3")
-                        assert "No such file or directory" in str(
-                            err
-                        ), "unexpected error loading postgis_topology-3"
-    finally:
-        cleanup(pg_version)
+#                 # test library which name is different from extension name
+#                 # this may fail locally if dependency is missing
+#                 # however, it does successfully download the postgis archive
+#                 if remote_storage_kind == RemoteStorageKind.REAL_S3:
+#                     try:
+#                         cur.execute("LOAD 'postgis_topology-3'")
+#                     except Exception as err:
+#                         log.info("error loading postgis_topology-3")
+#                         assert "No such file or directory" in str(
+#                             err
+#                         ), "unexpected error loading postgis_topology-3"
+#     finally:
+#         cleanup(pg_version)


-# Here we test a complex extension
-# which has multiple extensions in one archive
-# using postgis as an example
+# # Here we test a complex extension
+# # which has multiple extensions in one archive
+# # using postgis as an example
 # @pytest.mark.skipif(
-#    RemoteStorageKind.REAL_S3 not in available_s3_storages(),
-#    reason="skipping test because real s3 not enabled",
+#     RemoteStorageKind.REAL_S3 not in available_s3_storages(),
+#     reason="skipping test because real s3 not enabled",
 # )
-@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
-def test_multiple_extensions_one_archive(
-    neon_env_builder: NeonEnvBuilder,
-    pg_version: PgVersion,
-):
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=RemoteStorageKind.REAL_S3,
-        test_name="test_multiple_extensions_one_archive",
-        enable_remote_extensions=True,
-    )
-    env = neon_env_builder.init_start()
-    tenant_id, _ = env.neon_cli.create_tenant()
-    env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id)
+# def test_multiple_extensions_one_archive(
+#     neon_env_builder: NeonEnvBuilder,
+#     pg_version: PgVersion,
+# ):
+#     neon_env_builder.enable_remote_storage(
+#         remote_storage_kind=RemoteStorageKind.REAL_S3,
+#         test_name="test_multiple_extensions_one_archive",
+#         enable_remote_extensions=True,
+#     )
+#     env = neon_env_builder.init_start()
+#     tenant_id, _ = env.neon_cli.create_tenant()
+#     env.neon_cli.create_timeline("test_multiple_extensions_one_archive", tenant_id=tenant_id)

-    assert env.ext_remote_storage is not None  # satisfy mypy
-    assert env.remote_storage_client is not None  # satisfy mypy
+#     assert env.ext_remote_storage is not None  # satisfy mypy
+#     assert env.remote_storage_client is not None  # satisfy mypy

-    endpoint = env.endpoints.create_start(
-        "test_multiple_extensions_one_archive",
-        tenant_id=tenant_id,
-        remote_ext_config=env.ext_remote_storage.to_string(),
-    )
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("CREATE EXTENSION address_standardizer;")
-            cur.execute("CREATE EXTENSION address_standardizer_data_us;")
-            # execute query to ensure that it works
-            cur.execute(
-                "SELECT house_num, name, suftype, city, country, state, unit \
-                        FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \
-                        'One Rust Place, Boston, MA 02109');"
-            )
-            res = cur.fetchall()
-            log.info(res)
-            assert len(res) > 0
+#     endpoint = env.endpoints.create_start(
+#         "test_multiple_extensions_one_archive",
+#         tenant_id=tenant_id,
+#         remote_ext_config=env.ext_remote_storage.to_string(),
+#     )
+#     with closing(endpoint.connect()) as conn:
+#         with conn.cursor() as cur:
+#             cur.execute("CREATE EXTENSION address_standardizer;")
+#             cur.execute("CREATE EXTENSION address_standardizer_data_us;")
+#             # execute query to ensure that it works
+#             cur.execute(
+#                 "SELECT house_num, name, suftype, city, country, state, unit \
+#                         FROM standardize_address('us_lex', 'us_gaz', 'us_rules', \
+#                         'One Rust Place, Boston, MA 02109');"
+#             )
+#             res = cur.fetchall()
+#             log.info(res)
+#             assert len(res) > 0

-    cleanup(pg_version)
+#     cleanup(pg_version)


-# Test that extension is downloaded after endpoint restart,
-# when the library is used in the query.
-#
-# Run the test with mutliple simultaneous connections to an endpoint.
-# to ensure that the extension is downloaded only once.
-#
-@pytest.mark.skip(reason="https://github.com/neondatabase/neon/issues/4949")
-def test_extension_download_after_restart(
-    neon_env_builder: NeonEnvBuilder,
-    pg_version: PgVersion,
-):
-    if "15" in pg_version:  # SKIP v15 for now because test set only has extension built for v14
-        return None
+# # Test that extension is downloaded after endpoint restart,
+# # when the library is used in the query.
+# #
+# # Run the test with multiple simultaneous connections to an endpoint.
+# # to ensure that the extension is downloaded only once.
+# #
+# def test_extension_download_after_restart(
+#     neon_env_builder: NeonEnvBuilder,
+#     pg_version: PgVersion,
+# ):
+#     if "15" in pg_version:  # SKIP v15 for now because test set only has extension built for v14
+#         return None

-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=RemoteStorageKind.MOCK_S3,
-        test_name="test_extension_download_after_restart",
-        enable_remote_extensions=True,
-    )
-    env = neon_env_builder.init_start()
-    tenant_id, _ = env.neon_cli.create_tenant()
-    env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id)
+#     neon_env_builder.enable_remote_storage(
+#         remote_storage_kind=RemoteStorageKind.MOCK_S3,
+#         test_name="test_extension_download_after_restart",
+#         enable_remote_extensions=True,
+#     )
+#     env = neon_env_builder.init_start()
+#     tenant_id, _ = env.neon_cli.create_tenant()
+#     env.neon_cli.create_timeline("test_extension_download_after_restart", tenant_id=tenant_id)

-    assert env.ext_remote_storage is not None  # satisfy mypy
-    assert env.remote_storage_client is not None  # satisfy mypy
+#     assert env.ext_remote_storage is not None  # satisfy mypy
+#     assert env.remote_storage_client is not None  # satisfy mypy

-    # For MOCK_S3 we upload test files.
-    upload_files(env)
+#     # For MOCK_S3 we upload test files.
+#     upload_files(env)

-    endpoint = env.endpoints.create_start(
-        "test_extension_download_after_restart",
-        tenant_id=tenant_id,
-        remote_ext_config=env.ext_remote_storage.to_string(),
-        config_lines=["log_min_messages=debug3"],
-    )
-    with closing(endpoint.connect()) as conn:
-        with conn.cursor() as cur:
-            cur.execute("CREATE extension pg_buffercache;")
-            cur.execute("SELECT * from pg_buffercache;")
-            res = cur.fetchall()
-            assert len(res) > 0
-            log.info(res)
+#     endpoint = env.endpoints.create_start(
+#         "test_extension_download_after_restart",
+#         tenant_id=tenant_id,
+#         remote_ext_config=env.ext_remote_storage.to_string(),
+#         config_lines=["log_min_messages=debug3"],
+#     )
+#     with closing(endpoint.connect()) as conn:
+#         with conn.cursor() as cur:
+#             cur.execute("CREATE extension pg_buffercache;")
+#             cur.execute("SELECT * from pg_buffercache;")
+#             res = cur.fetchall()
+#             assert len(res) > 0
+#             log.info(res)

-    # shutdown compute node
-    endpoint.stop()
-    # remove extension files locally
-    cleanup(pg_version)
+#     # shutdown compute node
+#     endpoint.stop()
+#     # remove extension files locally
+#     cleanup(pg_version)

-    # spin up compute node again (there are no extension files available, because compute is stateless)
-    endpoint = env.endpoints.create_start(
-        "test_extension_download_after_restart",
-        tenant_id=tenant_id,
-        remote_ext_config=env.ext_remote_storage.to_string(),
-        config_lines=["log_min_messages=debug3"],
-    )
+#     # spin up compute node again (there are no extension files available, because compute is stateless)
+#     endpoint = env.endpoints.create_start(
+#         "test_extension_download_after_restart",
+#         tenant_id=tenant_id,
+#         remote_ext_config=env.ext_remote_storage.to_string(),
+#         config_lines=["log_min_messages=debug3"],
+#     )

-    # connect to compute node and run the query
-    # that will trigger the download of the extension
-    def run_query(endpoint, thread_id: int):
-        log.info("thread_id {%d} starting", thread_id)
-        with closing(endpoint.connect()) as conn:
-            with conn.cursor() as cur:
-                cur.execute("SELECT * from pg_buffercache;")
-                res = cur.fetchall()
-                assert len(res) > 0
-                log.info("thread_id {%d}, res = %s", thread_id, res)
+#     # connect to compute node and run the query
+#     # that will trigger the download of the extension
+#     def run_query(endpoint, thread_id: int):
+#         log.info("thread_id {%d} starting", thread_id)
+#         with closing(endpoint.connect()) as conn:
+#             with conn.cursor() as cur:
+#                 cur.execute("SELECT * from pg_buffercache;")
+#                 res = cur.fetchall()
+#                 assert len(res) > 0
+#                 log.info("thread_id {%d}, res = %s", thread_id, res)

-    threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)]
+#     threads = [threading.Thread(target=run_query, args=(endpoint, i)) for i in range(2)]

-    for thread in threads:
-        thread.start()
-    for thread in threads:
-        thread.join()
+#     for thread in threads:
+#         thread.start()
+#     for thread in threads:
+#         thread.join()

-    cleanup(pg_version)
+#     cleanup(pg_version)
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -56,6 +56,10 @@ def test_pg_regress(
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

+        # checkpoint one more time to ensure that the lsn we get is the latest one
+        endpoint.safe_psql("CHECKPOINT")
+
+        # Check that we restore the content of the datadir correctly
        check_restored_datadir_content(test_output_dir, env, endpoint)


@@ -162,4 +166,9 @@ def test_sql_regress(
    with capsys.disabled():
        pg_bin.run(pg_regress_command, env=env_vars, cwd=runpath)

+        # checkpoint one more time to ensure that the lsn we get is the latest one
+        endpoint.safe_psql("CHECKPOINT")
+        endpoint.safe_psql("select pg_current_wal_insert_lsn()")[0][0]
+
+        # Check that we restore the content of the datadir correctly
        check_restored_datadir_content(test_output_dir, env, endpoint)
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -97,11 +97,6 @@ def test_remote_storage_backup_and_restore(
    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])

-    # Thats because of UnreliableWrapper's injected failures
-    env.pageserver.allowed_errors.append(
-        f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
-    )
-
    checkpoint_numbers = range(1, 3)

    for checkpoint_number in checkpoint_numbers:
--- a/test_runner/regress/test_subxacts.py
+++ b/test_runner/regress/test_subxacts.py
@@ -33,4 +33,8 @@ def test_subxacts(neon_simple_env: NeonEnv, test_output_dir):
            cur.execute(f"insert into t1 values ({i}, {j})")
        cur.execute("commit")

+    # force wal flush
+    cur.execute("checkpoint")
+
+    # Check that we can restore the content of the datadir correctly
    check_restored_datadir_content(test_output_dir, env, endpoint)
--- a/test_runner/regress/test_tenant_delete.py
+++ b/test_runner/regress/test_tenant_delete.py
@@ -1,250 +0,0 @@
-import enum
-import os
-
-import pytest
-from fixtures.log_helper import log
-from fixtures.neon_fixtures import (
-    NeonEnvBuilder,
-    PgBin,
-    last_flush_lsn_upload,
-    wait_for_last_flush_lsn,
-)
-from fixtures.pageserver.http import PageserverApiException
-from fixtures.pageserver.utils import (
-    MANY_SMALL_LAYERS_TENANT_CONFIG,
-    assert_prefix_empty,
-    poll_for_remote_storage_iterations,
-    tenant_delete_wait_completed,
-    wait_tenant_status_404,
-    wait_until_tenant_active,
-    wait_until_tenant_state,
-)
-from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
-from fixtures.types import TenantId
-from fixtures.utils import run_pg_bench_small
-
-
-@pytest.mark.parametrize(
-    "remote_storage_kind", [RemoteStorageKind.NOOP, *available_remote_storages()]
-)
-def test_tenant_delete_smoke(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-    pg_bin: PgBin,
-):
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind=remote_storage_kind,
-        test_name="test_tenant_delete_smoke",
-    )
-
-    env = neon_env_builder.init_start()
-
-    ps_http = env.pageserver.http_client()
-
-    # first try to delete non existing tenant
-    tenant_id = TenantId.generate()
-    env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
-    with pytest.raises(PageserverApiException, match=f"NotFound: tenant {tenant_id}"):
-        ps_http.tenant_delete(tenant_id=tenant_id)
-
-    env.neon_cli.create_tenant(
-        tenant_id=tenant_id,
-        conf=MANY_SMALL_LAYERS_TENANT_CONFIG,
-    )
-
-    # create two timelines one being the parent of another
-    parent = None
-    for timeline in ["first", "second"]:
-        timeline_id = env.neon_cli.create_branch(
-            timeline, tenant_id=tenant_id, ancestor_branch_name=parent
-        )
-        with env.endpoints.create_start(timeline, tenant_id=tenant_id) as endpoint:
-            run_pg_bench_small(pg_bin, endpoint.connstr())
-            wait_for_last_flush_lsn(env, endpoint, tenant=tenant_id, timeline=timeline_id)
-
-        parent = timeline
-
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
-    tenant_delete_wait_completed(ps_http, tenant_id, iterations)
-
-    tenant_path = env.tenant_dir(tenant_id=tenant_id)
-    assert not tenant_path.exists()
-
-    if remote_storage_kind in [RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3]:
-        assert_prefix_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(tenant_id),
-                )
-            ),
-        )
-
-
-class Check(enum.Enum):
-    RETRY_WITHOUT_RESTART = enum.auto()
-    RETRY_WITH_RESTART = enum.auto()
-
-
-FAILPOINTS = [
-    "tenant-delete-before-shutdown",
-    "tenant-delete-before-create-remote-mark",
-    "tenant-delete-before-create-local-mark",
-    "tenant-delete-before-background",
-    "tenant-delete-before-polling-ongoing-deletions",
-    "tenant-delete-before-cleanup-remaining-fs-traces",
-    "tenant-delete-before-remove-timelines-dir",
-    "tenant-delete-before-remove-deleted-mark",
-    "tenant-delete-before-remove-tenant-dir",
-    # Some failpoints from timeline deletion
-    "timeline-delete-before-index-deleted-at",
-    "timeline-delete-before-rm",
-    "timeline-delete-before-index-delete",
-    "timeline-delete-after-rm-dir",
-]
-
-FAILPOINTS_BEFORE_BACKGROUND = [
-    "timeline-delete-before-schedule",
-    "tenant-delete-before-shutdown",
-    "tenant-delete-before-create-remote-mark",
-    "tenant-delete-before-create-local-mark",
-    "tenant-delete-before-background",
-]
-
-
-def combinations():
-    result = []
-
-    remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3]
-    if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"):
-        remotes.append(RemoteStorageKind.REAL_S3)
-
-    for remote_storage_kind in remotes:
-        for delete_failpoint in FAILPOINTS:
-            if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in (
-                "timeline-delete-before-index-delete",
-            ):
-                # the above failpoint are not relevant for config without remote storage
-                continue
-
-            result.append((remote_storage_kind, delete_failpoint))
-    return result
-
-
-@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations())
-@pytest.mark.parametrize("check", list(Check))
-def test_delete_tenant_exercise_crash_safety_failpoints(
-    neon_env_builder: NeonEnvBuilder,
-    remote_storage_kind: RemoteStorageKind,
-    failpoint: str,
-    check: Check,
-    pg_bin: PgBin,
-):
-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind, "test_delete_tenant_exercise_crash_safety_failpoints"
-    )
-
-    env = neon_env_builder.init_start(initial_tenant_conf=MANY_SMALL_LAYERS_TENANT_CONFIG)
-
-    tenant_id = env.initial_tenant
-
-    env.pageserver.allowed_errors.extend(
-        [
-            # From deletion polling
-            f".*NotFound: tenant {env.initial_tenant}.*",
-            # allow errors caused by failpoints
-            f".*failpoint: {failpoint}",
-            # It appears when we stopped flush loop during deletion (attempt) and then pageserver is stopped
-            ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited",
-            # We may leave some upload tasks in the queue. They're likely deletes.
-            # For uploads we explicitly wait with `last_flush_lsn_upload` below.
-            # So by ignoring these instead of waiting for empty upload queue
-            # we execute more distinct code paths.
-            '.*stopping left-over name="remote upload".*',
-        ]
-    )
-
-    ps_http = env.pageserver.http_client()
-
-    timeline_id = env.neon_cli.create_timeline("delete", tenant_id=tenant_id)
-    with env.endpoints.create_start("delete", tenant_id=tenant_id) as endpoint:
-        # generate enough layers
-        run_pg_bench_small(pg_bin, endpoint.connstr())
-        if remote_storage_kind is RemoteStorageKind.NOOP:
-            wait_for_last_flush_lsn(env, endpoint, tenant_id, timeline_id)
-        else:
-            last_flush_lsn_upload(env, endpoint, tenant_id, timeline_id)
-
-    ps_http.configure_failpoints((failpoint, "return"))
-
-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
-
-    # These failpoints are earlier than background task is spawned.
-    # so they result in api request failure.
-    if failpoint in FAILPOINTS_BEFORE_BACKGROUND:
-        with pytest.raises(PageserverApiException, match=failpoint):
-            ps_http.tenant_delete(tenant_id)
-
-    else:
-        ps_http.tenant_delete(tenant_id)
-        tenant_info = wait_until_tenant_state(
-            pageserver_http=ps_http,
-            tenant_id=tenant_id,
-            expected_state="Broken",
-            iterations=iterations,
-        )
-
-        reason = tenant_info["state"]["data"]["reason"]
-        log.info(f"tenant broken: {reason}")
-
-        # failpoint may not be the only error in the stack
-        assert reason.endswith(f"failpoint: {failpoint}"), reason
-
-    if check is Check.RETRY_WITH_RESTART:
-        env.pageserver.stop()
-        env.pageserver.start()
-
-        if (
-            remote_storage_kind is RemoteStorageKind.NOOP
-            and failpoint == "tenant-delete-before-create-local-mark"
-        ):
-            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
-        elif failpoint in (
-            "tenant-delete-before-shutdown",
-            "tenant-delete-before-create-remote-mark",
-        ):
-            wait_until_tenant_active(
-                ps_http, tenant_id=tenant_id, iterations=iterations, period=0.25
-            )
-            tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
-        else:
-            # Pageserver should've resumed deletion after restart.
-            wait_tenant_status_404(ps_http, tenant_id, iterations=iterations + 10)
-    elif check is Check.RETRY_WITHOUT_RESTART:
-        # this should succeed
-        # this also checks that delete can be retried even when tenant is in Broken state
-        ps_http.configure_failpoints((failpoint, "off"))
-
-        tenant_delete_wait_completed(ps_http, tenant_id, iterations=iterations)
-
-    # Check remote is impty
-    if remote_storage_kind is RemoteStorageKind.MOCK_S3:
-        assert_prefix_empty(
-            neon_env_builder,
-            prefix="/".join(
-                (
-                    "tenants",
-                    str(tenant_id),
-                )
-            ),
-        )
-
-    tenant_dir = env.tenant_dir(tenant_id)
-    # Check local is empty
-    assert not tenant_dir.exists()
-
-
-# TODO test concurrent deletions with "hang" failpoint
-# TODO test tenant delete continues after attach
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -66,10 +66,6 @@ def test_tenant_reattach(
    env.pageserver.allowed_errors.append(
        f".*Tenant {tenant_id} will not become active\\. Current state: Stopping.*"
    )
-    # Thats because of UnreliableWrapper's injected failures
-    env.pageserver.allowed_errors.append(
-        f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
-    )

    with env.endpoints.create_start("main", tenant_id=tenant_id) as endpoint:
        with endpoint.cursor() as cur:
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -17,9 +17,9 @@ from fixtures.neon_fixtures import (
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
    assert_tenant_state,
+    tenant_exists,
    wait_for_last_record_lsn,
    wait_for_upload,
-    wait_tenant_status_404,
 )
 from fixtures.port_distributor import PortDistributor
 from fixtures.remote_storage import RemoteStorageKind, available_remote_storages
@@ -29,6 +29,7 @@ from fixtures.utils import (
    start_in_background,
    subprocess_capture,
    wait_until,
+    wait_while,
 )


@@ -268,16 +269,11 @@ def test_tenant_relocation(

    env = neon_env_builder.init_start()

-    tenant_id = TenantId("74ee8b079a0e437eb0afea7d26a07209")
-
    # FIXME: Is this expected?
    env.pageserver.allowed_errors.append(
        ".*init_tenant_mgr: marking .* as locally complete, while it doesnt exist in remote index.*"
    )

-    # Needed for detach polling.
-    env.pageserver.allowed_errors.append(f".*NotFound: tenant {tenant_id}.*")
-
    # create folder for remote storage mock
    remote_storage_mock_path = env.repo_dir / "local_fs_remote_storage"

@@ -287,7 +283,9 @@ def test_tenant_relocation(

    pageserver_http = env.pageserver.http_client()

-    _, initial_timeline_id = env.neon_cli.create_tenant(tenant_id)
+    tenant_id, initial_timeline_id = env.neon_cli.create_tenant(
+        TenantId("74ee8b079a0e437eb0afea7d26a07209")
+    )
    log.info("tenant to relocate %s initial_timeline_id %s", tenant_id, initial_timeline_id)

    env.neon_cli.create_branch("test_tenant_relocation_main", tenant_id=tenant_id)
@@ -471,8 +469,11 @@ def test_tenant_relocation(
        pageserver_http.tenant_detach(tenant_id)

        # Wait a little, so that the detach operation has time to finish.
-        wait_tenant_status_404(pageserver_http, tenant_id, iterations=100, interval=1)
-
+        wait_while(
+            number_of_iterations=100,
+            interval=1,
+            func=lambda: tenant_exists(pageserver_http, tenant_id),
+        )
        post_migration_check(ep_main, 500500, old_local_path_main)
        post_migration_check(ep_second, 1001000, old_local_path_second)

--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -146,11 +146,6 @@ def test_tenants_attached_after_download(
    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])

-    # Thats because of UnreliableWrapper's injected failures
-    env.pageserver.allowed_errors.append(
-        f".*failed to fetch tenant deletion mark at tenants/({tenant_id}|{env.initial_tenant})/deleted attempt 1.*"
-    )
-
    for checkpoint_number in range(1, 3):
        with endpoint.cursor() as cur:
            cur.execute(
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -4,6 +4,7 @@ import queue
 import shutil
 import threading
 from pathlib import Path
+from typing import Optional

 import pytest
 import requests
@@ -17,8 +18,6 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
-    assert_prefix_empty,
-    poll_for_remote_storage_iterations,
    timeline_delete_wait_completed,
    wait_for_last_record_lsn,
    wait_for_upload,
@@ -28,6 +27,7 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.remote_storage import (
    RemoteStorageKind,
+    S3Storage,
    available_remote_storages,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
@@ -187,9 +187,10 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
    8. Retry or restart without the failpoint and check the result.
    """

-    neon_env_builder.enable_remote_storage(
-        remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
-    )
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints"
+        )

    env = neon_env_builder.init_start(
        initial_tenant_conf={
@@ -230,7 +231,7 @@ def test_delete_timeline_exercise_crash_safety_failpoints(

    ps_http.configure_failpoints((failpoint, "return"))

-    iterations = poll_for_remote_storage_iterations(remote_storage_kind)
+    iterations = 20 if remote_storage_kind is RemoteStorageKind.REAL_S3 else 4

    # These failpoints are earlier than background task is spawned.
    # so they result in api request failure.
@@ -279,14 +280,14 @@ def test_delete_timeline_exercise_crash_safety_failpoints(
                        "remote_storage_s3_request_seconds_count",
                        filter={"request_type": "get_object", "result": "err"},
                    ).value
-                    == 2  # One is missing tenant deletion mark, second is missing index part
+                    == 1
                )
                assert (
                    m.query_one(
                        "remote_storage_s3_request_seconds_count",
                        filter={"request_type": "get_object", "result": "ok"},
                    ).value
-                    == 1  # index part for initial timeline
+                    == 1
                )
    elif check is Check.RETRY_WITHOUT_RESTART:
        # this should succeed
@@ -412,6 +413,27 @@ def test_timeline_resurrection_on_attach(
    assert all([tl["state"] == "Active" for tl in timelines])


+def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None):
+    # For local_fs we need to properly handle empty directories, which we currently don't, so for simplicity stick to the S3 API.
+    assert neon_env_builder.remote_storage_kind in (
+        RemoteStorageKind.MOCK_S3,
+        RemoteStorageKind.REAL_S3,
+    )
+    # For mypy
+    assert isinstance(neon_env_builder.remote_storage, S3Storage)
+
+    # Note that this doesn't use pagination, so the list is not guaranteed to be exhaustive.
+    assert neon_env_builder.remote_storage_client is not None
+    response = neon_env_builder.remote_storage_client.list_objects_v2(
+        Bucket=neon_env_builder.remote_storage.bucket_name,
+        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
+    )
+    objects = response.get("Contents")
+    assert (
+        response["KeyCount"] == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+
+
 def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder):
    """
    When deleting a timeline, if we succeed in setting the deleted flag remotely
--- a/test_runner/regress/test_wal_acceptor.py
+++ b/test_runner/regress/test_wal_acceptor.py
@@ -869,49 +869,6 @@ def test_timeline_status(neon_env_builder: NeonEnvBuilder, auth_enabled: bool):
    assert debug_dump_1["config"]["id"] == env.safekeepers[0].id


-class DummyConsumer(object):
-    def __call__(self, msg):
-        pass
-
-
-def test_start_replication_term(neon_env_builder: NeonEnvBuilder):
-    """
-    Test START_REPLICATION of uncommitted part specifying leader term. It must
-    error if safekeeper switched to different term.
-    """
-
-    env = neon_env_builder.init_start()
-
-    env.neon_cli.create_branch("test_start_replication_term")
-    endpoint = env.endpoints.create_start("test_start_replication_term")
-
-    endpoint.safe_psql("CREATE TABLE t(key int primary key, value text)")
-
-    # learn neon timeline from compute
-    tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
-    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
-
-    sk = env.safekeepers[0]
-    sk_http_cli = sk.http_client()
-    tli_status = sk_http_cli.timeline_status(tenant_id, timeline_id)
-    timeline_start_lsn = tli_status.timeline_start_lsn
-
-    conn_opts = {
-        "host": "127.0.0.1",
-        "options": f"-c timeline_id={timeline_id} tenant_id={tenant_id}",
-        "port": sk.port.pg,
-        "connection_factory": psycopg2.extras.PhysicalReplicationConnection,
-    }
-    sk_pg_conn = psycopg2.connect(**conn_opts)  # type: ignore
-    with sk_pg_conn.cursor() as cur:
-        # should fail, as first start has term 2
-        cur.start_replication_expert(f"START_REPLICATION {timeline_start_lsn} (term='3')")
-        dummy_consumer = DummyConsumer()
-        with pytest.raises(psycopg2.errors.InternalError_) as excinfo:
-            cur.consume_stream(dummy_consumer)
-        assert "failed to acquire term 3" in str(excinfo.value)
-
-
 # Test auth on WAL service (postgres protocol) ports.
 def test_sk_auth(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.auth_enabled = True