From e8ae37652bc548a197315e448550670698aef4e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Wed, 9 Oct 2024 01:33:39 +0200 Subject: [PATCH 01/12] Add timeline offload mechanism (#8907) Implements an initial mechanism for offloading of archived timelines. Offloading is implemented as specified in the RFC. For now, there is no persistence, so a restart of the pageserver will retrigger downloads until the timeline is offloaded again. We trigger offloading in the compaction loop because we need the signal for whether compaction is done and everything has been uploaded or not. Part of #8088 --- pageserver/src/http/routes.rs | 4 +- pageserver/src/tenant.rs | 336 ++++++++++++++++++---- pageserver/src/tenant/gc_block.rs | 4 +- pageserver/src/tenant/timeline.rs | 13 +- pageserver/src/tenant/timeline/delete.rs | 105 ++++--- pageserver/src/tenant/timeline/offload.rs | 69 +++++ 6 files changed, 436 insertions(+), 95 deletions(-) create mode 100644 pageserver/src/tenant/timeline/offload.rs diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 94375e62b6..5fc8272074 100644 --- a/pageserver/src/http/routes.rs +++ b/pageserver/src/http/routes.rs @@ -703,6 +703,8 @@ async fn timeline_archival_config_handler( let tenant_shard_id: TenantShardId = parse_request_param(&request, "tenant_shard_id")?; let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?; + let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Warn); + let request_data: TimelineArchivalConfigRequest = json_request(&mut request).await?; check_permission(&request, Some(tenant_shard_id.tenant_id))?; let state = get_state(&request); @@ -713,7 +715,7 @@ async fn timeline_archival_config_handler( .get_attached_tenant_shard(tenant_shard_id)?; tenant - .apply_timeline_archival_config(timeline_id, request_data.state) + .apply_timeline_archival_config(timeline_id, request_data.state, ctx) .await?; Ok::<_, ApiError>(()) } diff --git 
a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 29f682c62a..d2818d04dc 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -38,6 +38,7 @@ use std::future::Future; use std::sync::Weak; use std::time::SystemTime; use storage_broker::BrokerClientChannel; +use timeline::offload::offload_timeline; use tokio::io::BufReader; use tokio::sync::watch; use tokio::task::JoinSet; @@ -287,9 +288,13 @@ pub struct Tenant { /// During timeline creation, we first insert the TimelineId to the /// creating map, then `timelines`, then remove it from the creating map. - /// **Lock order**: if acquring both, acquire`timelines` before `timelines_creating` + /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_creating` timelines_creating: std::sync::Mutex>, + /// Possibly offloaded and archived timelines + /// **Lock order**: if acquiring both, acquire`timelines` before `timelines_offloaded` + timelines_offloaded: Mutex>>, + // This mutex prevents creation of new timelines during GC. // Adding yet another mutex (in addition to `timelines`) is needed because holding // `timelines` mutex during all GC iteration @@ -484,6 +489,65 @@ impl WalRedoManager { } } +pub struct OffloadedTimeline { + pub tenant_shard_id: TenantShardId, + pub timeline_id: TimelineId, + pub ancestor_timeline_id: Option, + + // TODO: once we persist offloaded state, make this lazily constructed + pub remote_client: Arc, + + /// Prevent two tasks from deleting the timeline at the same time. If held, the + /// timeline is being deleted. If 'true', the timeline has already been deleted. 
+ pub delete_progress: Arc>, +} + +impl OffloadedTimeline { + fn from_timeline(timeline: &Timeline) -> Self { + Self { + tenant_shard_id: timeline.tenant_shard_id, + timeline_id: timeline.timeline_id, + ancestor_timeline_id: timeline.get_ancestor_timeline_id(), + + remote_client: timeline.remote_client.clone(), + delete_progress: timeline.delete_progress.clone(), + } + } +} + +#[derive(Clone)] +pub enum TimelineOrOffloaded { + Timeline(Arc), + Offloaded(Arc), +} + +impl TimelineOrOffloaded { + pub fn tenant_shard_id(&self) -> TenantShardId { + match self { + TimelineOrOffloaded::Timeline(timeline) => timeline.tenant_shard_id, + TimelineOrOffloaded::Offloaded(offloaded) => offloaded.tenant_shard_id, + } + } + pub fn timeline_id(&self) -> TimelineId { + match self { + TimelineOrOffloaded::Timeline(timeline) => timeline.timeline_id, + TimelineOrOffloaded::Offloaded(offloaded) => offloaded.timeline_id, + } + } + pub fn delete_progress(&self) -> &Arc> { + match self { + TimelineOrOffloaded::Timeline(timeline) => &timeline.delete_progress, + TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.delete_progress, + } + } + pub fn remote_client(&self) -> &Arc { + match self { + TimelineOrOffloaded::Timeline(timeline) => &timeline.remote_client, + TimelineOrOffloaded::Offloaded(offloaded) => &offloaded.remote_client, + } + } +} + #[derive(Debug, thiserror::Error, PartialEq, Eq)] pub enum GetTimelineError { #[error("Timeline is shutting down")] @@ -1406,52 +1470,192 @@ impl Tenant { } } - pub(crate) async fn apply_timeline_archival_config( - &self, + fn check_to_be_archived_has_no_unarchived_children( timeline_id: TimelineId, - state: TimelineArchivalState, + timelines: &std::sync::MutexGuard<'_, HashMap>>, + ) -> Result<(), TimelineArchivalError> { + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() != Some(timeline_id) { + return None; + } + if entry.is_archived() == Some(true) { + return None; + } + Some(*id) + }) 
+ .collect(); + + if !children.is_empty() { + return Err(TimelineArchivalError::HasUnarchivedChildren(children)); + } + Ok(()) + } + + fn check_ancestor_of_to_be_unarchived_is_not_archived( + ancestor_timeline_id: TimelineId, + timelines: &std::sync::MutexGuard<'_, HashMap>>, + offloaded_timelines: &std::sync::MutexGuard< + '_, + HashMap>, + >, + ) -> Result<(), TimelineArchivalError> { + let has_archived_parent = + if let Some(ancestor_timeline) = timelines.get(&ancestor_timeline_id) { + ancestor_timeline.is_archived() == Some(true) + } else if offloaded_timelines.contains_key(&ancestor_timeline_id) { + true + } else { + error!("ancestor timeline {ancestor_timeline_id} not found"); + if cfg!(debug_assertions) { + panic!("ancestor timeline {ancestor_timeline_id} not found"); + } + return Err(TimelineArchivalError::NotFound); + }; + if has_archived_parent { + return Err(TimelineArchivalError::HasArchivedParent( + ancestor_timeline_id, + )); + } + Ok(()) + } + + fn check_to_be_unarchived_timeline_has_no_archived_parent( + timeline: &Arc, + ) -> Result<(), TimelineArchivalError> { + if let Some(ancestor_timeline) = timeline.ancestor_timeline() { + if ancestor_timeline.is_archived() == Some(true) { + return Err(TimelineArchivalError::HasArchivedParent( + ancestor_timeline.timeline_id, + )); + } + } + Ok(()) + } + + /// Loads the specified (offloaded) timeline from S3 and attaches it as a loaded timeline + async fn unoffload_timeline( + self: &Arc, + timeline_id: TimelineId, + ctx: RequestContext, + ) -> Result, TimelineArchivalError> { + let cancel = self.cancel.clone(); + let timeline_preload = self + .load_timeline_metadata(timeline_id, self.remote_storage.clone(), cancel) + .await; + + let index_part = match timeline_preload.index_part { + Ok(index_part) => { + debug!("remote index part exists for timeline {timeline_id}"); + index_part + } + Err(DownloadError::NotFound) => { + error!(%timeline_id, "index_part not found on remote"); + return 
Err(TimelineArchivalError::NotFound); + } + Err(e) => { + // Some (possibly ephemeral) error happened during index_part download. + warn!(%timeline_id, "Failed to load index_part from remote storage, failed creation? ({e})"); + return Err(TimelineArchivalError::Other( + anyhow::Error::new(e).context("downloading index_part from remote storage"), + )); + } + }; + let index_part = match index_part { + MaybeDeletedIndexPart::IndexPart(index_part) => index_part, + MaybeDeletedIndexPart::Deleted(_index_part) => { + info!("timeline is deleted according to index_part.json"); + return Err(TimelineArchivalError::NotFound); + } + }; + let remote_metadata = index_part.metadata.clone(); + let timeline_resources = self.build_timeline_resources(timeline_id); + self.load_remote_timeline( + timeline_id, + index_part, + remote_metadata, + timeline_resources, + &ctx, + ) + .await + .with_context(|| { + format!( + "failed to load remote timeline {} for tenant {}", + timeline_id, self.tenant_shard_id + ) + })?; + let timelines = self.timelines.lock().unwrap(); + if let Some(timeline) = timelines.get(&timeline_id) { + let mut offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + if offloaded_timelines.remove(&timeline_id).is_none() { + warn!("timeline already removed from offloaded timelines"); + } + Ok(Arc::clone(timeline)) + } else { + warn!("timeline not available directly after attach"); + Err(TimelineArchivalError::Other(anyhow::anyhow!( + "timeline not available directly after attach" + ))) + } + } + + pub(crate) async fn apply_timeline_archival_config( + self: &Arc, + timeline_id: TimelineId, + new_state: TimelineArchivalState, + ctx: RequestContext, ) -> Result<(), TimelineArchivalError> { info!("setting timeline archival config"); - let timeline = { + // First part: figure out what is needed to do, and do validation + let timeline_or_unarchive_offloaded = 'outer: { let timelines = self.timelines.lock().unwrap(); let Some(timeline) = timelines.get(&timeline_id) else 
{ - return Err(TimelineArchivalError::NotFound); + let offloaded_timelines = self.timelines_offloaded.lock().unwrap(); + let Some(offloaded) = offloaded_timelines.get(&timeline_id) else { + return Err(TimelineArchivalError::NotFound); + }; + if new_state == TimelineArchivalState::Archived { + // It's offloaded already, so nothing to do + return Ok(()); + } + if let Some(ancestor_timeline_id) = offloaded.ancestor_timeline_id { + Self::check_ancestor_of_to_be_unarchived_is_not_archived( + ancestor_timeline_id, + &timelines, + &offloaded_timelines, + )?; + } + break 'outer None; }; - if state == TimelineArchivalState::Unarchived { - if let Some(ancestor_timeline) = timeline.ancestor_timeline() { - if ancestor_timeline.is_archived() == Some(true) { - return Err(TimelineArchivalError::HasArchivedParent( - ancestor_timeline.timeline_id, - )); - } + // Do some validation. We release the timelines lock below, so there is potential + // for race conditions: these checks are more present to prevent misunderstandings of + // the API's capabilities, instead of serving as the sole way to defend their invariants. + match new_state { + TimelineArchivalState::Unarchived => { + Self::check_to_be_unarchived_timeline_has_no_archived_parent(timeline)? + } + TimelineArchivalState::Archived => { + Self::check_to_be_archived_has_no_unarchived_children(timeline_id, &timelines)? 
} } - - // Ensure that there are no non-archived child timelines - let children: Vec = timelines - .iter() - .filter_map(|(id, entry)| { - if entry.get_ancestor_timeline_id() != Some(timeline_id) { - return None; - } - if entry.is_archived() == Some(true) { - return None; - } - Some(*id) - }) - .collect(); - - if !children.is_empty() && state == TimelineArchivalState::Archived { - return Err(TimelineArchivalError::HasUnarchivedChildren(children)); - } - Arc::clone(timeline) + Some(Arc::clone(timeline)) }; + // Second part: unarchive timeline (if needed) + let timeline = if let Some(timeline) = timeline_or_unarchive_offloaded { + timeline + } else { + // Turn offloaded timeline into a non-offloaded one + self.unoffload_timeline(timeline_id, ctx).await? + }; + + // Third part: upload new timeline archival state and block until it is present in S3 let upload_needed = timeline .remote_client - .schedule_index_upload_for_timeline_archival_state(state)?; + .schedule_index_upload_for_timeline_archival_state(new_state)?; if upload_needed { info!("Uploading new state"); @@ -1884,7 +2088,7 @@ impl Tenant { /// /// Returns whether we have pending compaction task. async fn compaction_iteration( - &self, + self: &Arc, cancel: &CancellationToken, ctx: &RequestContext, ) -> Result { @@ -1905,21 +2109,28 @@ impl Tenant { // while holding the lock. Then drop the lock and actually perform the // compactions. We don't want to block everything else while the // compaction runs. 
- let timelines_to_compact = { + let timelines_to_compact_or_offload; + { let timelines = self.timelines.lock().unwrap(); - let timelines_to_compact = timelines + timelines_to_compact_or_offload = timelines .iter() .filter_map(|(timeline_id, timeline)| { - if timeline.is_active() { - Some((*timeline_id, timeline.clone())) - } else { + let (is_active, can_offload) = (timeline.is_active(), timeline.can_offload()); + let has_no_unoffloaded_children = { + !timelines + .iter() + .any(|(_id, tl)| tl.get_ancestor_timeline_id() == Some(*timeline_id)) + }; + let can_offload = can_offload && has_no_unoffloaded_children; + if (is_active, can_offload) == (false, false) { None + } else { + Some((*timeline_id, timeline.clone(), (is_active, can_offload))) } }) .collect::>(); drop(timelines); - timelines_to_compact - }; + } // Before doing any I/O work, check our circuit breaker if self.compaction_circuit_breaker.lock().unwrap().is_broken() { @@ -1929,20 +2140,34 @@ impl Tenant { let mut has_pending_task = false; - for (timeline_id, timeline) in &timelines_to_compact { - has_pending_task |= timeline - .compact(cancel, EnumSet::empty(), ctx) - .instrument(info_span!("compact_timeline", %timeline_id)) - .await - .inspect_err(|e| match e { - timeline::CompactionError::ShuttingDown => (), - timeline::CompactionError::Other(e) => { - self.compaction_circuit_breaker - .lock() - .unwrap() - .fail(&CIRCUIT_BREAKERS_BROKEN, e); - } - })?; + for (timeline_id, timeline, (can_compact, can_offload)) in &timelines_to_compact_or_offload + { + let pending_task_left = if *can_compact { + Some( + timeline + .compact(cancel, EnumSet::empty(), ctx) + .instrument(info_span!("compact_timeline", %timeline_id)) + .await + .inspect_err(|e| match e { + timeline::CompactionError::ShuttingDown => (), + timeline::CompactionError::Other(e) => { + self.compaction_circuit_breaker + .lock() + .unwrap() + .fail(&CIRCUIT_BREAKERS_BROKEN, e); + } + })?, + ) + } else { + None + }; + has_pending_task |= 
pending_task_left.unwrap_or(false); + if pending_task_left == Some(false) && *can_offload { + offload_timeline(self, timeline) + .instrument(info_span!("offload_timeline", %timeline_id)) + .await + .map_err(timeline::CompactionError::Other)?; + } } self.compaction_circuit_breaker @@ -2852,6 +3077,7 @@ impl Tenant { constructed_at: Instant::now(), timelines: Mutex::new(HashMap::new()), timelines_creating: Mutex::new(HashSet::new()), + timelines_offloaded: Mutex::new(HashMap::new()), gc_cs: tokio::sync::Mutex::new(()), walredo_mgr, remote_storage, diff --git a/pageserver/src/tenant/gc_block.rs b/pageserver/src/tenant/gc_block.rs index f7a7836a12..373779ddb8 100644 --- a/pageserver/src/tenant/gc_block.rs +++ b/pageserver/src/tenant/gc_block.rs @@ -141,14 +141,14 @@ impl GcBlock { Ok(()) } - pub(crate) fn before_delete(&self, timeline: &super::Timeline) { + pub(crate) fn before_delete(&self, timeline_id: &super::TimelineId) { let unblocked = { let mut g = self.reasons.lock().unwrap(); if g.is_empty() { return; } - g.remove(&timeline.timeline_id); + g.remove(timeline_id); BlockingReasons::clean_and_summarize(g).is_none() }; diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1d79b2b74b..2fd4e699cf 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -7,6 +7,7 @@ pub(crate) mod handle; mod init; pub mod layer_manager; pub(crate) mod logical_size; +pub mod offload; pub mod span; pub mod uninit; mod walreceiver; @@ -1556,6 +1557,17 @@ impl Timeline { } } + /// Checks if the internal state of the timeline is consistent with it being able to be offloaded. + /// This is neccessary but not sufficient for offloading of the timeline as it might have + /// child timelines that are not offloaded yet. + pub(crate) fn can_offload(&self) -> bool { + if self.remote_client.is_archived() != Some(true) { + return false; + } + + true + } + /// Outermost timeline compaction operation; downloads needed layers. 
Returns whether we have pending /// compaction tasks. pub(crate) async fn compact( @@ -1818,7 +1830,6 @@ impl Timeline { self.current_state() == TimelineState::Active } - #[allow(unused)] pub(crate) fn is_archived(&self) -> Option { self.remote_client.is_archived() } diff --git a/pageserver/src/tenant/timeline/delete.rs b/pageserver/src/tenant/timeline/delete.rs index 90db08ea81..305c5758cc 100644 --- a/pageserver/src/tenant/timeline/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -15,7 +15,7 @@ use crate::{ tenant::{ metadata::TimelineMetadata, remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, - CreateTimelineCause, DeleteTimelineError, Tenant, + CreateTimelineCause, DeleteTimelineError, Tenant, TimelineOrOffloaded, }, }; @@ -24,12 +24,14 @@ use super::{Timeline, TimelineResources}; /// Mark timeline as deleted in S3 so we won't pick it up next time /// during attach or pageserver restart. /// See comment in persist_index_part_with_deleted_flag. -async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { - match timeline - .remote_client +async fn set_deleted_in_remote_index( + timeline: &TimelineOrOffloaded, +) -> Result<(), DeleteTimelineError> { + let res = timeline + .remote_client() .persist_index_part_with_deleted_flag() - .await - { + .await; + match res { // If we (now, or already) marked it successfully as deleted, we can proceed Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), // Bail out otherwise @@ -127,9 +129,9 @@ pub(super) async fn delete_local_timeline_directory( } /// Removes remote layers and an index file after them. 
-async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { +async fn delete_remote_layers_and_index(timeline: &TimelineOrOffloaded) -> anyhow::Result<()> { timeline - .remote_client + .remote_client() .delete_all() .await .context("delete_all") @@ -137,27 +139,41 @@ async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<( /// It is important that this gets called when DeletionGuard is being held. /// For more context see comments in [`DeleteTimelineFlow::prepare`] -async fn remove_timeline_from_tenant( +async fn remove_maybe_offloaded_timeline_from_tenant( tenant: &Tenant, - timeline: &Timeline, + timeline: &TimelineOrOffloaded, _: &DeletionGuard, // using it as a witness ) -> anyhow::Result<()> { // Remove the timeline from the map. + // This observes the locking order between timelines and timelines_offloaded let mut timelines = tenant.timelines.lock().unwrap(); + let mut timelines_offloaded = tenant.timelines_offloaded.lock().unwrap(); + let offloaded_children_exist = timelines_offloaded + .iter() + .any(|(_, entry)| entry.ancestor_timeline_id == Some(timeline.timeline_id())); let children_exist = timelines .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id)); - // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. - // We already deleted the layer files, so it's probably best to panic. - // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) - if children_exist { + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id())); + // XXX this can happen because of race conditions with branch creation. + // We already deleted the remote layer files, so it's probably best to panic. 
+ if children_exist || offloaded_children_exist { panic!("Timeline grew children while we removed layer files"); } - timelines - .remove(&timeline.timeline_id) - .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + match timeline { + TimelineOrOffloaded::Timeline(timeline) => { + timelines.remove(&timeline.timeline_id).expect( + "timeline that we were deleting was concurrently removed from 'timelines' map", + ); + } + TimelineOrOffloaded::Offloaded(timeline) => { + timelines_offloaded + .remove(&timeline.timeline_id) + .expect("timeline that we were deleting was concurrently removed from 'timelines_offloaded' map"); + } + } + drop(timelines_offloaded); drop(timelines); Ok(()) @@ -207,9 +223,11 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; // Now that the Timeline is in Stopping state, request all the related tasks to shut down. - timeline.shutdown(super::ShutdownMode::Hard).await; + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.shutdown(super::ShutdownMode::Hard).await; + } - tenant.gc_block.before_delete(&timeline); + tenant.gc_block.before_delete(&timeline.timeline_id()); fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { Err(anyhow::anyhow!( @@ -285,15 +303,16 @@ impl DeleteTimelineFlow { guard.mark_in_progress()?; + let timeline = TimelineOrOffloaded::Timeline(timeline); Self::schedule_background(guard, tenant.conf, tenant, timeline); Ok(()) } - fn prepare( + pub(super) fn prepare( tenant: &Tenant, timeline_id: TimelineId, - ) -> Result<(Arc, DeletionGuard), DeleteTimelineError> { + ) -> Result<(TimelineOrOffloaded, DeletionGuard), DeleteTimelineError> { // Note the interaction between this guard and deletion guard. // Here we attempt to lock deletion guard when we're holding a lock on timelines. 
// This is important because when you take into account `remove_timeline_from_tenant` @@ -307,8 +326,14 @@ impl DeleteTimelineFlow { let timelines = tenant.timelines.lock().unwrap(); let timeline = match timelines.get(&timeline_id) { - Some(t) => t, - None => return Err(DeleteTimelineError::NotFound), + Some(t) => TimelineOrOffloaded::Timeline(Arc::clone(t)), + None => { + let offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); + match offloaded_timelines.get(&timeline_id) { + Some(t) => TimelineOrOffloaded::Offloaded(Arc::clone(t)), + None => return Err(DeleteTimelineError::NotFound), + } + } }; // Ensure that there are no child timelines **attached to that pageserver**, @@ -334,30 +359,32 @@ impl DeleteTimelineFlow { // to remove the timeline from it. // Always if you have two locks that are taken in different order this can result in a deadlock. - let delete_progress = Arc::clone(&timeline.delete_progress); + let delete_progress = Arc::clone(timeline.delete_progress()); let delete_lock_guard = match delete_progress.try_lock_owned() { Ok(guard) => DeletionGuard(guard), Err(_) => { // Unfortunately if lock fails arc is consumed. 
return Err(DeleteTimelineError::AlreadyInProgress(Arc::clone( - &timeline.delete_progress, + timeline.delete_progress(), ))); } }; - timeline.set_state(TimelineState::Stopping); + if let TimelineOrOffloaded::Timeline(timeline) = &timeline { + timeline.set_state(TimelineState::Stopping); + } - Ok((Arc::clone(timeline), delete_lock_guard)) + Ok((timeline, delete_lock_guard)) } fn schedule_background( guard: DeletionGuard, conf: &'static PageServerConf, tenant: Arc, - timeline: Arc, + timeline: TimelineOrOffloaded, ) { - let tenant_shard_id = timeline.tenant_shard_id; - let timeline_id = timeline.timeline_id; + let tenant_shard_id = timeline.tenant_shard_id(); + let timeline_id = timeline.timeline_id(); task_mgr::spawn( task_mgr::BACKGROUND_RUNTIME.handle(), @@ -368,7 +395,9 @@ impl DeleteTimelineFlow { async move { if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { error!("Error: {err:#}"); - timeline.set_broken(format!("{err:#}")) + if let TimelineOrOffloaded::Timeline(timeline) = timeline { + timeline.set_broken(format!("{err:#}")) + } }; Ok(()) } @@ -380,15 +409,19 @@ impl DeleteTimelineFlow { mut guard: DeletionGuard, conf: &PageServerConf, tenant: &Tenant, - timeline: &Timeline, + timeline: &TimelineOrOffloaded, ) -> Result<(), DeleteTimelineError> { - delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; + // Offloaded timelines have no local state + // TODO: once we persist offloaded information, delete the timeline from there, too + if let TimelineOrOffloaded::Timeline(timeline) = timeline { + delete_local_timeline_directory(conf, tenant.tenant_shard_id, timeline).await?; + } delete_remote_layers_and_index(timeline).await?; pausable_failpoint!("in_progress_delete"); - remove_timeline_from_tenant(tenant, timeline, &guard).await?; + remove_maybe_offloaded_timeline_from_tenant(tenant, timeline, &guard).await?; *guard = Self::Finished; @@ -400,7 +433,7 @@ impl DeleteTimelineFlow { } } -struct 
DeletionGuard(OwnedMutexGuard); +pub(super) struct DeletionGuard(OwnedMutexGuard); impl Deref for DeletionGuard { type Target = DeleteTimelineFlow; diff --git a/pageserver/src/tenant/timeline/offload.rs b/pageserver/src/tenant/timeline/offload.rs new file mode 100644 index 0000000000..fb906d906b --- /dev/null +++ b/pageserver/src/tenant/timeline/offload.rs @@ -0,0 +1,69 @@ +use std::sync::Arc; + +use crate::tenant::{OffloadedTimeline, Tenant, TimelineOrOffloaded}; + +use super::{ + delete::{delete_local_timeline_directory, DeleteTimelineFlow, DeletionGuard}, + Timeline, +}; + +pub(crate) async fn offload_timeline( + tenant: &Tenant, + timeline: &Arc, +) -> anyhow::Result<()> { + tracing::info!("offloading archived timeline"); + let (timeline, guard) = DeleteTimelineFlow::prepare(tenant, timeline.timeline_id)?; + + let TimelineOrOffloaded::Timeline(timeline) = timeline else { + tracing::error!("timeline already offloaded, but given timeline object"); + return Ok(()); + }; + + // TODO extend guard mechanism above with method + // to make deletions possible while offloading is in progress + + // TODO mark timeline as offloaded in S3 + + let conf = &tenant.conf; + delete_local_timeline_directory(conf, tenant.tenant_shard_id, &timeline).await?; + + remove_timeline_from_tenant(tenant, &timeline, &guard).await?; + + { + let mut offloaded_timelines = tenant.timelines_offloaded.lock().unwrap(); + offloaded_timelines.insert( + timeline.timeline_id, + Arc::new(OffloadedTimeline::from_timeline(&timeline)), + ); + } + + Ok(()) +} + +/// It is important that this gets called when DeletionGuard is being held. +/// For more context see comments in [`DeleteTimelineFlow::prepare`] +async fn remove_timeline_from_tenant( + tenant: &Tenant, + timeline: &Timeline, + _: &DeletionGuard, // using it as a witness +) -> anyhow::Result<()> { + // Remove the timeline from the map. 
+ let mut timelines = tenant.timelines.lock().unwrap(); + let children_exist = timelines + .iter() + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline.timeline_id)); + // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. + // We already deleted the layer files, so it's probably best to panic. + // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) + if children_exist { + panic!("Timeline grew children while we removed layer files"); + } + + timelines + .remove(&timeline.timeline_id) + .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + + drop(timelines); + + Ok(()) +} From f87f5a383e2c56a3dfe93665bf768ba80e7ae22b Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Oct 2024 11:58:50 +0300 Subject: [PATCH 02/12] tests: Remove redundant log lines when starting an endpoint (#9316) The "Starting postgres endpoint " message is not needed, because the neon_cli.py prints the neon_local command line used to start the endpoint. That contains the same information. The "Postgres startup took XX seconds" message is not very useful because no one pays attention to those in the python test logs when things are going smoothly, and if you do wonder about the startup speed, the same information and more can be found in the compute log. 
Before: 2024-10-07 22:32:27.794 INFO [neon_fixtures.py:3492] Starting postgres endpoint ep-1 2024-10-07 22:32:27.794 INFO [neon_cli.py:73] Running command "/tmp/neon/bin/neon_local endpoint start --safekeepers 1 ep-1" 2024-10-07 22:32:27.901 INFO [neon_fixtures.py:3690] Postgres startup took 0.11398935317993164 seconds After: 2024-10-07 22:32:27.794 INFO [neon_cli.py:73] Running command "/tmp/neon/bin/neon_local endpoint start --safekeepers 1 ep-1" --- test_runner/fixtures/neon_fixtures.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5a7f6e46d3..9a51899d5d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3485,8 +3485,6 @@ class Endpoint(PgProtocol, LogUtils): if safekeepers is not None: self.active_safekeepers = safekeepers - log.info(f"Starting postgres endpoint {self.endpoint_id}") - self.env.neon_cli.endpoint_start( self.endpoint_id, safekeepers=self.active_safekeepers, @@ -3666,8 +3664,6 @@ class Endpoint(PgProtocol, LogUtils): Returns self. """ - started_at = time.time() - self.create( branch_name=branch_name, endpoint_id=endpoint_id, @@ -3683,8 +3679,6 @@ class Endpoint(PgProtocol, LogUtils): basebackup_request_tries=basebackup_request_tries, ) - log.info(f"Postgres startup took {time.time() - started_at} seconds") - return self def __enter__(self) -> Endpoint: From 211970f0e0ce1f1a85b165309efdb8121bc0f1ce Mon Sep 17 00:00:00 2001 From: Erik Grinaker Date: Wed, 9 Oct 2024 10:29:06 +0100 Subject: [PATCH 03/12] remote_storage: add `DownloadOpts::byte_(start|end)` (#9293) `download_byte_range()` is basically a copy of `download()` with an additional option passed to the backend SDKs. This can cause these code paths to diverge, and prevents combining various options. This patch adds `DownloadOpts::byte_(start|end)` and move byte range handling into `download()`. 
--- libs/remote_storage/src/azure_blob.rs | 26 +-- libs/remote_storage/src/lib.rs | 180 ++++++++++++------ libs/remote_storage/src/local_fs.rs | 167 ++++++---------- libs/remote_storage/src/s3_bucket.rs | 29 +-- libs/remote_storage/src/simulate_failures.rs | 19 +- libs/remote_storage/tests/common/tests.rs | 49 ++++- pageserver/src/tenant/secondary/downloader.rs | 1 + safekeeper/src/wal_backup.rs | 10 +- 8 files changed, 240 insertions(+), 241 deletions(-) diff --git a/libs/remote_storage/src/azure_blob.rs b/libs/remote_storage/src/azure_blob.rs index e113a987a5..f98d16789c 100644 --- a/libs/remote_storage/src/azure_blob.rs +++ b/libs/remote_storage/src/azure_blob.rs @@ -496,26 +496,12 @@ impl RemoteStorage for AzureBlobStorage { builder = builder.if_match(IfMatchCondition::NotMatch(etag.to_string())) } - self.download_for_builder(builder, cancel).await - } - - async fn download_byte_range( - &self, - from: &RemotePath, - start_inclusive: u64, - end_exclusive: Option, - cancel: &CancellationToken, - ) -> Result { - let blob_client = self.client.blob_client(self.relative_path_to_name(from)); - - let mut builder = blob_client.get(); - - let range: Range = if let Some(end_exclusive) = end_exclusive { - (start_inclusive..end_exclusive).into() - } else { - (start_inclusive..).into() - }; - builder = builder.range(range); + if let Some((start, end)) = opts.byte_range() { + builder = builder.range(match end { + Some(end) => Range::Range(start..end), + None => Range::RangeFrom(start..), + }); + } self.download_for_builder(builder, cancel).await } diff --git a/libs/remote_storage/src/lib.rs b/libs/remote_storage/src/lib.rs index 0ff0f1c878..c6466237bf 100644 --- a/libs/remote_storage/src/lib.rs +++ b/libs/remote_storage/src/lib.rs @@ -19,7 +19,8 @@ mod simulate_failures; mod support; use std::{ - collections::HashMap, fmt::Debug, num::NonZeroU32, pin::Pin, sync::Arc, time::SystemTime, + collections::HashMap, fmt::Debug, num::NonZeroU32, ops::Bound, pin::Pin, sync::Arc, + 
time::SystemTime, }; use anyhow::Context; @@ -162,11 +163,60 @@ pub struct Listing { } /// Options for downloads. The default value is a plain GET. -#[derive(Default)] pub struct DownloadOpts { /// If given, returns [`DownloadError::Unmodified`] if the object still has /// the same ETag (using If-None-Match). pub etag: Option, + /// The start of the byte range to download, or unbounded. + pub byte_start: Bound, + /// The end of the byte range to download, or unbounded. Must be after the + /// start bound. + pub byte_end: Bound, +} + +impl Default for DownloadOpts { + fn default() -> Self { + Self { + etag: Default::default(), + byte_start: Bound::Unbounded, + byte_end: Bound::Unbounded, + } + } +} + +impl DownloadOpts { + /// Returns the byte range with inclusive start and exclusive end, or None + /// if unbounded. + pub fn byte_range(&self) -> Option<(u64, Option)> { + if self.byte_start == Bound::Unbounded && self.byte_end == Bound::Unbounded { + return None; + } + let start = match self.byte_start { + Bound::Excluded(i) => i + 1, + Bound::Included(i) => i, + Bound::Unbounded => 0, + }; + let end = match self.byte_end { + Bound::Excluded(i) => Some(i), + Bound::Included(i) => Some(i + 1), + Bound::Unbounded => None, + }; + if let Some(end) = end { + assert!(start < end, "range end {end} at or before start {start}"); + } + Some((start, end)) + } + + /// Returns the byte range as an RFC 2616 Range header value with inclusive + /// bounds, or None if unbounded. + pub fn byte_range_header(&self) -> Option { + self.byte_range() + .map(|(start, end)| (start, end.map(|end| end - 1))) // make end inclusive + .map(|(start, end)| match end { + Some(end) => format!("bytes={start}-{end}"), + None => format!("bytes={start}-"), + }) + } } /// Storage (potentially remote) API to manage its state. @@ -257,21 +307,6 @@ pub trait RemoteStorage: Send + Sync + 'static { cancel: &CancellationToken, ) -> Result; - /// Streams a given byte range of the remote storage entry contents. 
- /// - /// The returned download stream will obey initial timeout and cancellation signal by erroring - /// on whichever happens first. Only one of the reasons will fail the stream, which is usually - /// enough for `tokio::io::copy_buf` usage. If needed the error can be filtered out. - /// - /// Returns the metadata, if any was stored with the file previously. - async fn download_byte_range( - &self, - from: &RemotePath, - start_inclusive: u64, - end_exclusive: Option, - cancel: &CancellationToken, - ) -> Result; - /// Delete a single path from remote storage. /// /// If the operation fails because of timeout or cancellation, the root cause of the error will be @@ -425,33 +460,6 @@ impl GenericRemoteStorage> { } } - pub async fn download_byte_range( - &self, - from: &RemotePath, - start_inclusive: u64, - end_exclusive: Option, - cancel: &CancellationToken, - ) -> Result { - match self { - Self::LocalFs(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive, cancel) - .await - } - Self::AwsS3(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive, cancel) - .await - } - Self::AzureBlob(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive, cancel) - .await - } - Self::Unreliable(s) => { - s.download_byte_range(from, start_inclusive, end_exclusive, cancel) - .await - } - } - } - /// See [`RemoteStorage::delete`] pub async fn delete( &self, @@ -573,20 +581,6 @@ impl GenericRemoteStorage { }) } - /// Downloads the storage object into the `to_path` provided. - /// `byte_range` could be specified to dowload only a part of the file, if needed. - pub async fn download_storage_object( - &self, - byte_range: Option<(u64, Option)>, - from: &RemotePath, - cancel: &CancellationToken, - ) -> Result { - match byte_range { - Some((start, end)) => self.download_byte_range(from, start, end, cancel).await, - None => self.download(from, &DownloadOpts::default(), cancel).await, - } - } - /// The name of the bucket/container/etc. 
pub fn bucket_name(&self) -> Option<&str> { match self { @@ -660,6 +654,76 @@ impl ConcurrencyLimiter { mod tests { use super::*; + /// DownloadOpts::byte_range() should generate (inclusive, exclusive) ranges + /// with optional end bound, or None when unbounded. + #[test] + fn download_opts_byte_range() { + // Consider using test_case or a similar table-driven test framework. + let cases = [ + // (byte_start, byte_end, expected) + (Bound::Unbounded, Bound::Unbounded, None), + (Bound::Unbounded, Bound::Included(7), Some((0, Some(8)))), + (Bound::Unbounded, Bound::Excluded(7), Some((0, Some(7)))), + (Bound::Included(3), Bound::Unbounded, Some((3, None))), + (Bound::Included(3), Bound::Included(7), Some((3, Some(8)))), + (Bound::Included(3), Bound::Excluded(7), Some((3, Some(7)))), + (Bound::Excluded(3), Bound::Unbounded, Some((4, None))), + (Bound::Excluded(3), Bound::Included(7), Some((4, Some(8)))), + (Bound::Excluded(3), Bound::Excluded(7), Some((4, Some(7)))), + // 1-sized ranges are fine, 0 aren't and will panic (separate test). + (Bound::Included(3), Bound::Included(3), Some((3, Some(4)))), + (Bound::Included(3), Bound::Excluded(4), Some((3, Some(4)))), + ]; + + for (byte_start, byte_end, expect) in cases { + let opts = DownloadOpts { + byte_start, + byte_end, + ..Default::default() + }; + let result = opts.byte_range(); + assert_eq!( + result, expect, + "byte_start={byte_start:?} byte_end={byte_end:?}" + ); + + // Check generated HTTP header, which uses an inclusive range. + let expect_header = expect.map(|(start, end)| match end { + Some(end) => format!("bytes={start}-{}", end - 1), // inclusive end + None => format!("bytes={start}-"), + }); + assert_eq!( + opts.byte_range_header(), + expect_header, + "byte_start={byte_start:?} byte_end={byte_end:?}" + ); + } + } + + /// DownloadOpts::byte_range() zero-sized byte range should panic. 
+ #[test] + #[should_panic] + fn download_opts_byte_range_zero() { + DownloadOpts { + byte_start: Bound::Included(3), + byte_end: Bound::Excluded(3), + ..Default::default() + } + .byte_range(); + } + + /// DownloadOpts::byte_range() negative byte range should panic. + #[test] + #[should_panic] + fn download_opts_byte_range_negative() { + DownloadOpts { + byte_start: Bound::Included(3), + byte_end: Bound::Included(2), + ..Default::default() + } + .byte_range(); + } + #[test] fn test_object_name() { let k = RemotePath::new(Utf8Path::new("a/b/c")).unwrap(); diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs index d912b94c74..93a052139b 100644 --- a/libs/remote_storage/src/local_fs.rs +++ b/libs/remote_storage/src/local_fs.rs @@ -506,54 +506,7 @@ impl RemoteStorage for LocalFs { return Err(DownloadError::Unmodified); } - let source = ReaderStream::new( - fs::OpenOptions::new() - .read(true) - .open(&target_path) - .await - .with_context(|| { - format!("Failed to open source file {target_path:?} to use in the download") - }) - .map_err(DownloadError::Other)?, - ); - - let metadata = self - .read_storage_metadata(&target_path) - .await - .map_err(DownloadError::Other)?; - - let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); - let source = crate::support::DownloadStream::new(cancel_or_timeout, source); - - Ok(Download { - metadata, - last_modified: file_metadata - .modified() - .map_err(|e| DownloadError::Other(anyhow::anyhow!(e).context("Reading mtime")))?, - etag, - download_stream: Box::pin(source), - }) - } - - async fn download_byte_range( - &self, - from: &RemotePath, - start_inclusive: u64, - end_exclusive: Option, - cancel: &CancellationToken, - ) -> Result { - if let Some(end_exclusive) = end_exclusive { - if end_exclusive <= start_inclusive { - return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) is not less than end_exclusive ({end_exclusive:?})"))); - 
}; - if start_inclusive == end_exclusive.saturating_sub(1) { - return Err(DownloadError::Other(anyhow::anyhow!("Invalid range, start ({start_inclusive}) and end_exclusive ({end_exclusive:?}) difference is zero bytes"))); - } - } - - let target_path = from.with_base(&self.storage_root); - let file_metadata = file_metadata(&target_path).await?; - let mut source = tokio::fs::OpenOptions::new() + let mut file = fs::OpenOptions::new() .read(true) .open(&target_path) .await @@ -562,31 +515,29 @@ impl RemoteStorage for LocalFs { }) .map_err(DownloadError::Other)?; - let len = source - .metadata() - .await - .context("query file length") - .map_err(DownloadError::Other)? - .len(); + let mut take = file_metadata.len(); + if let Some((start, end)) = opts.byte_range() { + if start > 0 { + file.seek(io::SeekFrom::Start(start)) + .await + .context("Failed to seek to the range start in a local storage file") + .map_err(DownloadError::Other)?; + } + if let Some(end) = end { + take = end - start; + } + } - source - .seek(io::SeekFrom::Start(start_inclusive)) - .await - .context("Failed to seek to the range start in a local storage file") - .map_err(DownloadError::Other)?; + let source = ReaderStream::new(file.take(take)); let metadata = self .read_storage_metadata(&target_path) .await .map_err(DownloadError::Other)?; - let source = source.take(end_exclusive.unwrap_or(len) - start_inclusive); - let source = ReaderStream::new(source); - let cancel_or_timeout = crate::support::cancel_or_timeout(self.timeout, cancel.clone()); let source = crate::support::DownloadStream::new(cancel_or_timeout, source); - let etag = mock_etag(&file_metadata); Ok(Download { metadata, last_modified: file_metadata @@ -688,7 +639,7 @@ mod fs_tests { use super::*; use camino_tempfile::tempdir; - use std::{collections::HashMap, io::Write}; + use std::{collections::HashMap, io::Write, ops::Bound}; async fn read_and_check_metadata( storage: &LocalFs, @@ -804,10 +755,12 @@ mod fs_tests { let (first_part_local, 
second_part_local) = uploaded_bytes.split_at(3); let first_part_download = storage - .download_byte_range( + .download( &upload_target, - 0, - Some(first_part_local.len() as u64), + &DownloadOpts { + byte_end: Bound::Excluded(first_part_local.len() as u64), + ..Default::default() + }, &cancel, ) .await?; @@ -823,10 +776,15 @@ mod fs_tests { ); let second_part_download = storage - .download_byte_range( + .download( &upload_target, - first_part_local.len() as u64, - Some((first_part_local.len() + second_part_local.len()) as u64), + &DownloadOpts { + byte_start: Bound::Included(first_part_local.len() as u64), + byte_end: Bound::Excluded( + (first_part_local.len() + second_part_local.len()) as u64, + ), + ..Default::default() + }, &cancel, ) .await?; @@ -842,7 +800,14 @@ mod fs_tests { ); let suffix_bytes = storage - .download_byte_range(&upload_target, 13, None, &cancel) + .download( + &upload_target, + &DownloadOpts { + byte_start: Bound::Included(13), + ..Default::default() + }, + &cancel, + ) .await? .download_stream; let suffix_bytes = aggregate(suffix_bytes).await?; @@ -850,7 +815,7 @@ mod fs_tests { assert_eq!(upload_name, suffix); let all_bytes = storage - .download_byte_range(&upload_target, 0, None, &cancel) + .download(&upload_target, &DownloadOpts::default(), &cancel) .await? 
.download_stream; let all_bytes = aggregate(all_bytes).await?; @@ -861,48 +826,26 @@ mod fs_tests { } #[tokio::test] - async fn download_file_range_negative() -> anyhow::Result<()> { - let (storage, cancel) = create_storage()?; + #[should_panic(expected = "at or before start")] + async fn download_file_range_negative() { + let (storage, cancel) = create_storage().unwrap(); let upload_name = "upload_1"; - let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel).await?; + let upload_target = upload_dummy_file(&storage, upload_name, None, &cancel) + .await + .unwrap(); - let start = 1_000_000_000; - let end = start + 1; - match storage - .download_byte_range( + storage + .download( &upload_target, - start, - Some(end), // exclusive end + &DownloadOpts { + byte_start: Bound::Included(10), + byte_end: Bound::Excluded(10), + ..Default::default() + }, &cancel, ) .await - { - Ok(_) => panic!("Should not allow downloading wrong ranges"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("zero bytes")); - assert!(error_string.contains(&start.to_string())); - assert!(error_string.contains(&end.to_string())); - } - } - - let start = 10000; - let end = 234; - assert!(start > end, "Should test an incorrect range"); - match storage - .download_byte_range(&upload_target, start, Some(end), &cancel) - .await - { - Ok(_) => panic!("Should not allow downloading wrong ranges"), - Err(e) => { - let error_string = e.to_string(); - assert!(error_string.contains("Invalid range")); - assert!(error_string.contains(&start.to_string())); - assert!(error_string.contains(&end.to_string())); - } - } - - Ok(()) + .unwrap(); } #[tokio::test] @@ -945,10 +888,12 @@ mod fs_tests { let (first_part_local, _) = uploaded_bytes.split_at(3); let partial_download_with_metadata = storage - .download_byte_range( + .download( &upload_target, - 0, - Some(first_part_local.len() as u64), + &DownloadOpts { + byte_end: Bound::Excluded(first_part_local.len() as u64), 
+ ..Default::default() + }, &cancel, ) .await?; diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index ec7c047565..f950f2886c 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -804,34 +804,7 @@ impl RemoteStorage for S3Bucket { bucket: self.bucket_name.clone(), key: self.relative_path_to_s3_object(from), etag: opts.etag.as_ref().map(|e| e.to_string()), - range: None, - }, - cancel, - ) - .await - } - - async fn download_byte_range( - &self, - from: &RemotePath, - start_inclusive: u64, - end_exclusive: Option, - cancel: &CancellationToken, - ) -> Result { - // S3 accepts ranges as https://www.w3.org/Protocols/rfc2616/rfc2616-sec14.html#sec14.35 - // and needs both ends to be exclusive - let end_inclusive = end_exclusive.map(|end| end.saturating_sub(1)); - let range = Some(match end_inclusive { - Some(end_inclusive) => format!("bytes={start_inclusive}-{end_inclusive}"), - None => format!("bytes={start_inclusive}-"), - }); - - self.download_object( - GetObjectRequest { - bucket: self.bucket_name.clone(), - key: self.relative_path_to_s3_object(from), - etag: None, - range, + range: opts.byte_range_header(), }, cancel, ) diff --git a/libs/remote_storage/src/simulate_failures.rs b/libs/remote_storage/src/simulate_failures.rs index 05f82b5a5a..10db53971c 100644 --- a/libs/remote_storage/src/simulate_failures.rs +++ b/libs/remote_storage/src/simulate_failures.rs @@ -170,28 +170,13 @@ impl RemoteStorage for UnreliableWrapper { opts: &DownloadOpts, cancel: &CancellationToken, ) -> Result { + // Note: We treat any byte range as an "attempt" of the same operation. + // We don't pay attention to the ranges. That's good enough for now. 
self.attempt(RemoteOp::Download(from.clone())) .map_err(DownloadError::Other)?; self.inner.download(from, opts, cancel).await } - async fn download_byte_range( - &self, - from: &RemotePath, - start_inclusive: u64, - end_exclusive: Option, - cancel: &CancellationToken, - ) -> Result { - // Note: We treat any download_byte_range as an "attempt" of the same - // operation. We don't pay attention to the ranges. That's good enough - // for now. - self.attempt(RemoteOp::Download(from.clone())) - .map_err(DownloadError::Other)?; - self.inner - .download_byte_range(from, start_inclusive, end_exclusive, cancel) - .await - } - async fn delete(&self, path: &RemotePath, cancel: &CancellationToken) -> anyhow::Result<()> { self.delete_inner(path, true, cancel).await } diff --git a/libs/remote_storage/tests/common/tests.rs b/libs/remote_storage/tests/common/tests.rs index e38cfb3ef0..e6f33fc3f8 100644 --- a/libs/remote_storage/tests/common/tests.rs +++ b/libs/remote_storage/tests/common/tests.rs @@ -2,6 +2,7 @@ use anyhow::Context; use camino::Utf8Path; use futures::StreamExt; use remote_storage::{DownloadError, DownloadOpts, ListingMode, ListingObject, RemotePath}; +use std::ops::Bound; use std::sync::Arc; use std::{collections::HashSet, num::NonZeroU32}; use test_context::test_context; @@ -293,7 +294,15 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< // Full range (end specified) let dl = ctx .client - .download_byte_range(&path, 0, Some(len as u64), &cancel) + .download( + &path, + &DownloadOpts { + byte_start: Bound::Included(0), + byte_end: Bound::Excluded(len as u64), + ..Default::default() + }, + &cancel, + ) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); @@ -301,7 +310,15 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< // partial range (end specified) let dl = ctx .client - .download_byte_range(&path, 4, Some(10), &cancel) + .download( + &path, + &DownloadOpts { + byte_start: 
Bound::Included(4), + byte_end: Bound::Excluded(10), + ..Default::default() + }, + &cancel, + ) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..10]); @@ -309,7 +326,15 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< // partial range (end beyond real end) let dl = ctx .client - .download_byte_range(&path, 8, Some(len as u64 * 100), &cancel) + .download( + &path, + &DownloadOpts { + byte_start: Bound::Included(8), + byte_end: Bound::Excluded(len as u64 * 100), + ..Default::default() + }, + &cancel, + ) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[8..]); @@ -317,7 +342,14 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< // Partial range (end unspecified) let dl = ctx .client - .download_byte_range(&path, 4, None, &cancel) + .download( + &path, + &DownloadOpts { + byte_start: Bound::Included(4), + ..Default::default() + }, + &cancel, + ) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig[4..]); @@ -325,7 +357,14 @@ async fn upload_download_works(ctx: &mut MaybeEnabledStorage) -> anyhow::Result< // Full range (end unspecified) let dl = ctx .client - .download_byte_range(&path, 0, None, &cancel) + .download( + &path, + &DownloadOpts { + byte_start: Bound::Included(0), + ..Default::default() + }, + &cancel, + ) .await?; let buf = download_to_vec(dl).await?; assert_eq!(&buf, &orig); diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 9f7447a9ac..82c5702686 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -950,6 +950,7 @@ impl<'a> TenantDownloader<'a> { let cancel = &self.secondary_state.cancel; let opts = DownloadOpts { etag: prev_etag.cloned(), + ..Default::default() }; backoff::retry( diff --git a/safekeeper/src/wal_backup.rs b/safekeeper/src/wal_backup.rs index ef26ac99c5..6c87e5a926 100644 --- 
a/safekeeper/src/wal_backup.rs +++ b/safekeeper/src/wal_backup.rs @@ -17,7 +17,9 @@ use std::time::Duration; use postgres_ffi::v14::xlog_utils::XLogSegNoOffsetToRecPtr; use postgres_ffi::XLogFileName; use postgres_ffi::{XLogSegNo, PG_TLI}; -use remote_storage::{GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata}; +use remote_storage::{ + DownloadOpts, GenericRemoteStorage, ListingMode, RemotePath, StorageMetadata, +}; use tokio::fs::File; use tokio::select; @@ -503,8 +505,12 @@ pub async fn read_object( let cancel = CancellationToken::new(); + let opts = DownloadOpts { + byte_start: std::ops::Bound::Included(offset), + ..Default::default() + }; let download = storage - .download_storage_object(Some((offset, None)), file_path, &cancel) + .download(file_path, &opts, &cancel) .await .with_context(|| { format!("Failed to open WAL segment download stream for remote path {file_path:?}") From 8a138db8b7ed15cb3adfabcbc5d8ea4af23c1156 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Oct 2024 12:55:56 +0300 Subject: [PATCH 04/12] tests: Reduce noise from logging renamed files (#9315) Instead of printing the full absolute path for every file, print just the filenames. 
Before: 2024-10-08 13:19:39.98 INFO [test_pageserver_generations.py:669] Found file /home/heikki/git-sandbox/neon/test_output/test_upgrade_generationless_local_file_paths[debug-pg16]/repo/pageserver_1/tenants/0c04a8df7691a367ad0bb1cc1373ba4d/timelines/f41022551e5f96ce8dbefb9b5d35ab45/000000067F0000000100000A8D0100000000-000000067F0000000100000AC10000000002__00000000014F16F0-v1-00000001 2024-10-08 13:19:39.99 INFO [test_pageserver_generations.py:673] Renamed /home/heikki/git-sandbox/neon/test_output/test_upgrade_generationless_local_file_paths[debug-pg16]/repo/pageserver_1/tenants/0c04a8df7691a367ad0bb1cc1373ba4d/timelines/f41022551e5f96ce8dbefb9b5d35ab45/000000067F0000000100000A8D0100000000-000000067F0000000100000AC10000000002__00000000014F16F0-v1-00000001 -> /home/heikki/git-sandbox/neon/test_output/test_upgrade_generationless_local_file_paths[debug-pg16]/repo/pageserver_1/tenants/0c04a8df7691a367ad0bb1cc1373ba4d/timelines/f41022551e5f96ce8dbefb9b5d35ab45/000000067F0000000100000A8D0100000000-000000067F0000000100000AC10000000002__00000000014F16F0 After: 2024-10-08 13:24:39.726 INFO [test_pageserver_generations.py:667] Renaming files in /home/heikki/git-sandbox/neon/test_output/test_upgrade_generationless_local_file_paths[debug-pg16]/repo/pageserver_1/tenants/3439538816c520adecc541cc8b1de21c/timelines/6a7be8ee707b355de48dd91b326d6ae1 2024-10-08 13:24:39.728 INFO [test_pageserver_generations.py:673] Renamed 000000067F0000000100000A8D0100000000-000000067F0000000100000AC10000000002__00000000014F16F0-v1-00000001 -> 000000067F0000000100000A8D0100000000-000000067F0000000100000AC10000000002__00000000014F16F0 --- .../regress/test_pageserver_generations.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_pageserver_generations.py b/test_runner/regress/test_pageserver_generations.py index d13cf0019d..577a3a25ca 100644 --- a/test_runner/regress/test_pageserver_generations.py +++ 
b/test_runner/regress/test_pageserver_generations.py @@ -666,14 +666,17 @@ def test_upgrade_generationless_local_file_paths( pageserver.stop() timeline_dir = pageserver.timeline_dir(tenant_id, timeline_id) files_renamed = 0 + log.info(f"Renaming files in {timeline_dir}") for filename in os.listdir(timeline_dir): - path = os.path.join(timeline_dir, filename) - log.info(f"Found file {path}") - if path.endswith("-v1-00000001"): - new_path = path[:-12] - os.rename(path, new_path) - log.info(f"Renamed {path} -> {new_path}") + if filename.endswith("-v1-00000001"): + new_filename = filename[:-12] + os.rename( + os.path.join(timeline_dir, filename), os.path.join(timeline_dir, new_filename) + ) + log.info(f"Renamed {filename} -> {new_filename}") files_renamed += 1 + else: + log.info(f"Keeping {filename}") assert files_renamed > 0 From 54d1185789c3c99965df026e0e29e472f0c61e1f Mon Sep 17 00:00:00 2001 From: Folke Behrens Date: Wed, 9 Oct 2024 12:44:17 +0200 Subject: [PATCH 05/12] proxy: Unalias hyper1 and replace one use of hyper0 in test (#9324) Leaves one final use of hyper0 in proxy for the health service, which requires some coordinated effort with other services.
--- proxy/Cargo.toml | 2 +- proxy/src/auth/backend/jwt.rs | 4 +- proxy/src/http/health_server.rs | 6 +-- proxy/src/http/mod.rs | 2 +- proxy/src/lib.rs | 2 - proxy/src/proxy/wake_compute.rs | 2 +- proxy/src/serverless/backend.rs | 4 +- proxy/src/serverless/http_conn_pool.rs | 6 +-- proxy/src/serverless/http_util.rs | 8 +-- proxy/src/serverless/mod.rs | 8 +-- proxy/src/serverless/sql_over_http.rs | 28 +++++----- proxy/src/serverless/websocket.rs | 2 +- proxy/src/usage_metrics.rs | 74 +++++++++++++------------- 13 files changed, 74 insertions(+), 74 deletions(-) diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml index ae9b2531aa..c995ac3660 100644 --- a/proxy/Cargo.toml +++ b/proxy/Cargo.toml @@ -39,7 +39,7 @@ http.workspace = true humantime.workspace = true humantime-serde.workspace = true hyper0.workspace = true -hyper1 = { package = "hyper", version = "1.2", features = ["server"] } +hyper = { workspace = true, features = ["server", "http1", "http2"] } hyper-util = { version = "0.1", features = ["server", "http1", "http2", "tokio"] } http-body-util = { version = "0.1" } indexmap.workspace = true diff --git a/proxy/src/auth/backend/jwt.rs b/proxy/src/auth/backend/jwt.rs index b62a11ccb2..0c66fe5381 100644 --- a/proxy/src/auth/backend/jwt.rs +++ b/proxy/src/auth/backend/jwt.rs @@ -571,7 +571,7 @@ mod tests { use bytes::Bytes; use http::Response; use http_body_util::Full; - use hyper1::service::service_fn; + use hyper::service::service_fn; use hyper_util::rt::TokioIo; use rand::rngs::OsRng; use rsa::pkcs8::DecodePrivateKey; @@ -736,7 +736,7 @@ X0n5X2/pBLJzxZc62ccvZYVnctBiFs6HbSnxpuMQCfkt/BcR/ttIepBQQIW86wHL }); let listener = TcpListener::bind("0.0.0.0:0").await.unwrap(); - let server = hyper1::server::conn::http1::Builder::new(); + let server = hyper::server::conn::http1::Builder::new(); let addr = listener.local_addr().unwrap(); tokio::spawn(async move { loop { diff --git a/proxy/src/http/health_server.rs b/proxy/src/http/health_server.rs index cae9eb5b97..d0352351d5 
100644 --- a/proxy/src/http/health_server.rs +++ b/proxy/src/http/health_server.rs @@ -1,5 +1,5 @@ use anyhow::{anyhow, bail}; -use hyper::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; +use hyper0::{header::CONTENT_TYPE, Body, Request, Response, StatusCode}; use measured::{text::BufferedTextEncoder, MetricGroup}; use metrics::NeonMetrics; use std::{ @@ -21,7 +21,7 @@ async fn status_handler(_: Request) -> Result, ApiError> { json_response(StatusCode::OK, "") } -fn make_router(metrics: AppMetrics) -> RouterBuilder { +fn make_router(metrics: AppMetrics) -> RouterBuilder { let state = Arc::new(Mutex::new(PrometheusHandler { encoder: BufferedTextEncoder::new(), metrics, @@ -45,7 +45,7 @@ pub async fn task_main( let service = || RouterService::new(make_router(metrics).build()?); - hyper::Server::from_tcp(http_listener)? + hyper0::Server::from_tcp(http_listener)? .serve(service().map_err(|e| anyhow!(e))?) .await?; diff --git a/proxy/src/http/mod.rs b/proxy/src/http/mod.rs index 14720b5c6b..d8676d5b50 100644 --- a/proxy/src/http/mod.rs +++ b/proxy/src/http/mod.rs @@ -9,7 +9,7 @@ use std::time::Duration; use anyhow::bail; use bytes::Bytes; use http_body_util::BodyExt; -use hyper1::body::Body; +use hyper::body::Body; use serde::de::DeserializeOwned; pub(crate) use reqwest::{Request, Response}; diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs index 79f9760461..8d274baa10 100644 --- a/proxy/src/lib.rs +++ b/proxy/src/lib.rs @@ -90,8 +90,6 @@ use tokio::task::JoinError; use tokio_util::sync::CancellationToken; use tracing::warn; -extern crate hyper0 as hyper; - pub mod auth; pub mod cache; pub mod cancellation; diff --git a/proxy/src/proxy/wake_compute.rs b/proxy/src/proxy/wake_compute.rs index 4dfee0656d..ba674f5d0d 100644 --- a/proxy/src/proxy/wake_compute.rs +++ b/proxy/src/proxy/wake_compute.rs @@ -7,7 +7,7 @@ use crate::metrics::{ WakeupFailureKind, }; use crate::proxy::retry::{retry_after, should_retry}; -use hyper1::StatusCode; +use hyper::StatusCode; use 
tracing::{error, info, warn}; use super::connect_compute::ComputeConnectBackend; diff --git a/proxy/src/serverless/backend.rs b/proxy/src/serverless/backend.rs index 4e758e6eda..764b97fb7b 100644 --- a/proxy/src/serverless/backend.rs +++ b/proxy/src/serverless/backend.rs @@ -257,7 +257,7 @@ pub(crate) enum LocalProxyConnError { #[error("error with connection to local-proxy")] Io(#[source] std::io::Error), #[error("could not establish h2 connection")] - H2(#[from] hyper1::Error), + H2(#[from] hyper::Error), } impl ReportableError for HttpConnError { @@ -481,7 +481,7 @@ async fn connect_http2( }; }; - let (client, connection) = hyper1::client::conn::http2::Builder::new(TokioExecutor::new()) + let (client, connection) = hyper::client::conn::http2::Builder::new(TokioExecutor::new()) .timer(TokioTimer::new()) .keep_alive_interval(Duration::from_secs(20)) .keep_alive_while_idle(true) diff --git a/proxy/src/serverless/http_conn_pool.rs b/proxy/src/serverless/http_conn_pool.rs index 4e6f8cf55c..6d61536f1a 100644 --- a/proxy/src/serverless/http_conn_pool.rs +++ b/proxy/src/serverless/http_conn_pool.rs @@ -1,5 +1,5 @@ use dashmap::DashMap; -use hyper1::client::conn::http2; +use hyper::client::conn::http2; use hyper_util::rt::{TokioExecutor, TokioIo}; use parking_lot::RwLock; use rand::Rng; @@ -18,9 +18,9 @@ use tracing::{info, info_span, Instrument}; use super::conn_pool::ConnInfo; -pub(crate) type Send = http2::SendRequest; +pub(crate) type Send = http2::SendRequest; pub(crate) type Connect = - http2::Connection, hyper1::body::Incoming, TokioExecutor>; + http2::Connection, hyper::body::Incoming, TokioExecutor>; #[derive(Clone)] struct ConnPoolEntry { diff --git a/proxy/src/serverless/http_util.rs b/proxy/src/serverless/http_util.rs index d766a46577..87a72ec5f0 100644 --- a/proxy/src/serverless/http_util.rs +++ b/proxy/src/serverless/http_util.rs @@ -11,7 +11,7 @@ use serde::Serialize; use utils::http::error::ApiError; /// Like [`ApiError::into_response`] -pub(crate) fn 
api_error_into_response(this: ApiError) -> Response> { +pub(crate) fn api_error_into_response(this: ApiError) -> Response> { match this { ApiError::BadRequest(err) => HttpErrorBody::response_from_msg_and_status( format!("{err:#?}"), // use debug printing so that we give the cause @@ -67,12 +67,12 @@ impl HttpErrorBody { fn response_from_msg_and_status( msg: String, status: StatusCode, - ) -> Response> { + ) -> Response> { HttpErrorBody { msg }.to_response(status) } /// Same as [`utils::http::error::HttpErrorBody::to_response`] - fn to_response(&self, status: StatusCode) -> Response> { + fn to_response(&self, status: StatusCode) -> Response> { Response::builder() .status(status) .header(http::header::CONTENT_TYPE, "application/json") @@ -90,7 +90,7 @@ impl HttpErrorBody { pub(crate) fn json_response( status: StatusCode, data: T, -) -> Result>, ApiError> { +) -> Result>, ApiError> { let json = serde_json::to_string(&data) .context("Failed to serialize JSON response") .map_err(ApiError::InternalServerError)?; diff --git a/proxy/src/serverless/mod.rs b/proxy/src/serverless/mod.rs index a7e3fa709b..5987776c28 100644 --- a/proxy/src/serverless/mod.rs +++ b/proxy/src/serverless/mod.rs @@ -22,7 +22,7 @@ use futures::TryFutureExt; use http::{Method, Response, StatusCode}; use http_body_util::combinators::BoxBody; use http_body_util::{BodyExt, Empty}; -use hyper1::body::Incoming; +use hyper::body::Incoming; use hyper_util::rt::TokioExecutor; use hyper_util::server::conn::auto::Builder; use rand::rngs::StdRng; @@ -302,7 +302,7 @@ async fn connection_handler( let server = Builder::new(TokioExecutor::new()); let conn = server.serve_connection_with_upgrades( hyper_util::rt::TokioIo::new(conn), - hyper1::service::service_fn(move |req: hyper1::Request| { + hyper::service::service_fn(move |req: hyper::Request| { // First HTTP request shares the same session ID let session_id = session_id.take().unwrap_or_else(uuid::Uuid::new_v4); @@ -355,7 +355,7 @@ async fn connection_handler( 
#[allow(clippy::too_many_arguments)] async fn request_handler( - mut request: hyper1::Request, + mut request: hyper::Request, config: &'static ProxyConfig, backend: Arc, ws_connections: TaskTracker, @@ -365,7 +365,7 @@ async fn request_handler( // used to cancel in-flight HTTP requests. not used to cancel websockets http_cancellation_token: CancellationToken, endpoint_rate_limiter: Arc, -) -> Result>, ApiError> { +) -> Result>, ApiError> { let host = request .headers() .get("host") diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs index f3a7ed9329..34c19157e6 100644 --- a/proxy/src/serverless/sql_over_http.rs +++ b/proxy/src/serverless/sql_over_http.rs @@ -12,14 +12,14 @@ use http::Method; use http_body_util::combinators::BoxBody; use http_body_util::BodyExt; use http_body_util::Full; -use hyper1::body::Body; -use hyper1::body::Incoming; -use hyper1::header; -use hyper1::http::HeaderName; -use hyper1::http::HeaderValue; -use hyper1::Response; -use hyper1::StatusCode; -use hyper1::{HeaderMap, Request}; +use hyper::body::Body; +use hyper::body::Incoming; +use hyper::header; +use hyper::http::HeaderName; +use hyper::http::HeaderValue; +use hyper::Response; +use hyper::StatusCode; +use hyper::{HeaderMap, Request}; use pq_proto::StartupMessageParamsBuilder; use serde::Serialize; use serde_json::Value; @@ -272,7 +272,7 @@ pub(crate) async fn handle( request: Request, backend: Arc, cancel: CancellationToken, -) -> Result>, ApiError> { +) -> Result>, ApiError> { let result = handle_inner(cancel, config, &ctx, request, backend).await; let mut response = match result { @@ -435,7 +435,7 @@ impl UserFacingError for SqlOverHttpError { #[derive(Debug, thiserror::Error)] pub(crate) enum ReadPayloadError { #[error("could not read the HTTP request body: {0}")] - Read(#[from] hyper1::Error), + Read(#[from] hyper::Error), #[error("could not parse the HTTP request body: {0}")] Parse(#[from] serde_json::Error), } @@ -476,7 +476,7 @@ struct 
HttpHeaders { } impl HttpHeaders { - fn try_parse(headers: &hyper1::http::HeaderMap) -> Result { + fn try_parse(headers: &hyper::http::HeaderMap) -> Result { // Determine the output options. Default behaviour is 'false'. Anything that is not // strictly 'true' assumed to be false. let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE); @@ -529,7 +529,7 @@ async fn handle_inner( ctx: &RequestMonitoring, request: Request, backend: Arc, -) -> Result>, SqlOverHttpError> { +) -> Result>, SqlOverHttpError> { let _requeset_gauge = Metrics::get() .proxy .connection_requests @@ -577,7 +577,7 @@ async fn handle_db_inner( conn_info: ConnInfo, auth: AuthData, backend: Arc, -) -> Result>, SqlOverHttpError> { +) -> Result>, SqlOverHttpError> { // // Determine the destination and connection params // @@ -744,7 +744,7 @@ async fn handle_auth_broker_inner( conn_info: ConnInfo, jwt: String, backend: Arc, -) -> Result>, SqlOverHttpError> { +) -> Result>, SqlOverHttpError> { backend .authenticate_with_jwt( ctx, diff --git a/proxy/src/serverless/websocket.rs b/proxy/src/serverless/websocket.rs index 3d257223b8..08d5da9bef 100644 --- a/proxy/src/serverless/websocket.rs +++ b/proxy/src/serverless/websocket.rs @@ -12,7 +12,7 @@ use anyhow::Context as _; use bytes::{Buf, BufMut, Bytes, BytesMut}; use framed_websockets::{Frame, OpCode, WebSocketServer}; use futures::{Sink, Stream}; -use hyper1::upgrade::OnUpgrade; +use hyper::upgrade::OnUpgrade; use hyper_util::rt::TokioIo; use pin_project_lite::pin_project; diff --git a/proxy/src/usage_metrics.rs b/proxy/src/usage_metrics.rs index fd8599bcb3..bd3e62bc12 100644 --- a/proxy/src/usage_metrics.rs +++ b/proxy/src/usage_metrics.rs @@ -485,49 +485,51 @@ async fn upload_events_chunk( #[cfg(test)] mod tests { - use std::{ - net::TcpListener, - sync::{Arc, Mutex}, - }; + use super::*; + use crate::{http, BranchId, EndpointId}; use anyhow::Error; use chrono::Utc; use consumption_metrics::{Event, EventChunk}; - use hyper::{ - 
service::{make_service_fn, service_fn}, - Body, Response, - }; + use http_body_util::BodyExt; + use hyper::{body::Incoming, server::conn::http1, service::service_fn, Request, Response}; + use hyper_util::rt::TokioIo; + use std::sync::{Arc, Mutex}; + use tokio::net::TcpListener; use url::Url; - use super::*; - use crate::{http, BranchId, EndpointId}; - #[tokio::test] async fn metrics() { - let listener = TcpListener::bind("0.0.0.0:0").unwrap(); + type Report = EventChunk<'static, Event>; + let reports: Arc>> = Arc::default(); - let reports = Arc::new(Mutex::new(vec![])); - let reports2 = reports.clone(); - - let server = hyper::server::Server::from_tcp(listener) - .unwrap() - .serve(make_service_fn(move |_| { - let reports = reports.clone(); - async move { - Ok::<_, Error>(service_fn(move |req| { + let listener = TcpListener::bind("127.0.0.1:0").await.unwrap(); + let addr = listener.local_addr().unwrap(); + tokio::spawn({ + let reports = reports.clone(); + async move { + loop { + if let Ok((stream, _addr)) = listener.accept().await { let reports = reports.clone(); - async move { - let bytes = hyper::body::to_bytes(req.into_body()).await?; - let events: EventChunk<'static, Event> = - serde_json::from_slice(&bytes)?; - reports.lock().unwrap().push(events); - Ok::<_, Error>(Response::new(Body::from(vec![]))) - } - })) + http1::Builder::new() + .serve_connection( + TokioIo::new(stream), + service_fn(move |req: Request| { + let reports = reports.clone(); + async move { + let bytes = req.into_body().collect().await?.to_bytes(); + let events = serde_json::from_slice(&bytes)?; + reports.lock().unwrap().push(events); + Ok::<_, Error>(Response::new(String::new())) + } + }), + ) + .await + .unwrap(); + } } - })); - let addr = server.local_addr(); - tokio::spawn(server); + } + }); let metrics = Metrics::default(); let client = http::new_client(); @@ -536,7 +538,7 @@ mod tests { // no counters have been registered collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, 
"foo", now, now).await; - let r = std::mem::take(&mut *reports2.lock().unwrap()); + let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); // register a new counter @@ -548,7 +550,7 @@ mod tests { // the counter should be observed despite 0 egress collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; - let r = std::mem::take(&mut *reports2.lock().unwrap()); + let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); assert_eq!(r[0].events[0].value, 0); @@ -558,7 +560,7 @@ mod tests { // egress should be observered collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; - let r = std::mem::take(&mut *reports2.lock().unwrap()); + let r = std::mem::take(&mut *reports.lock().unwrap()); assert_eq!(r.len(), 1); assert_eq!(r[0].events.len(), 1); assert_eq!(r[0].events[0].value, 1); @@ -568,7 +570,7 @@ mod tests { // we do not observe the counter collect_metrics_iteration(&metrics.endpoints, &client, &endpoint, "foo", now, now).await; - let r = std::mem::take(&mut *reports2.lock().unwrap()); + let r = std::mem::take(&mut *reports.lock().unwrap()); assert!(r.is_empty()); // counter is unregistered From cc599e23c1d06c8311f8257ef9f7745a807dc3f1 Mon Sep 17 00:00:00 2001 From: Vlad Lazar Date: Wed, 9 Oct 2024 11:53:29 +0100 Subject: [PATCH 06/12] storcon: make observed state updates more granular (#9276) ## Problem Previously, observed state updates from the reconciler may have clobbered inline changes made to the observed state by other code paths. ## Summary of changes Model observed state changes from reconcilers as deltas. This means that we only update what has changed. Handling for node going off-line concurrently during the reconcile is also added: set observed state to None in such cases to respect the convention. 
Closes https://github.com/neondatabase/neon/issues/9124 --- storage_controller/src/reconciler.rs | 42 +++++++++++++- storage_controller/src/service.rs | 44 ++++++++------- storage_controller/src/tenant_shard.rs | 77 +++++++++++++++++++++++++- 3 files changed, 141 insertions(+), 22 deletions(-) diff --git a/storage_controller/src/reconciler.rs b/storage_controller/src/reconciler.rs index 4864a021fe..9d2182d44c 100644 --- a/storage_controller/src/reconciler.rs +++ b/storage_controller/src/reconciler.rs @@ -22,7 +22,7 @@ use utils::sync::gate::GateGuard; use crate::compute_hook::{ComputeHook, NotifyError}; use crate::node::Node; -use crate::tenant_shard::{IntentState, ObservedState, ObservedStateLocation}; +use crate::tenant_shard::{IntentState, ObservedState, ObservedStateDelta, ObservedStateLocation}; const DEFAULT_HEATMAP_PERIOD: &str = "60s"; @@ -45,8 +45,15 @@ pub(super) struct Reconciler { pub(crate) reconciler_config: ReconcilerConfig, pub(crate) config: TenantConfig, + + /// Observed state from the point of view of the reconciler. + /// This gets updated as the reconciliation makes progress. pub(crate) observed: ObservedState, + /// Snapshot of the observed state at the point when the reconciler + /// was spawned. + pub(crate) original_observed: ObservedState, + pub(crate) service_config: service::Config, /// A hook to notify the running postgres instances when we change the location @@ -846,6 +853,39 @@ impl Reconciler { } } + /// Compare the observed state snapshot from when the reconcile was created + /// with the final observed state in order to generate observed state deltas. 
+ pub(crate) fn observed_deltas(&self) -> Vec { + let mut deltas = Vec::default(); + + for (node_id, location) in &self.observed.locations { + let previous_location = self.original_observed.locations.get(node_id); + let do_upsert = match previous_location { + // Location config changed for node + Some(prev) if location.conf != prev.conf => true, + // New location config for node + None => true, + // Location config has not changed for node + _ => false, + }; + + if do_upsert { + deltas.push(ObservedStateDelta::Upsert(Box::new(( + *node_id, + location.clone(), + )))); + } + } + + for node_id in self.original_observed.locations.keys() { + if !self.observed.locations.contains_key(node_id) { + deltas.push(ObservedStateDelta::Delete(*node_id)); + } + } + + deltas + } + /// Keep trying to notify the compute indefinitely, only dropping out if: /// - the node `origin` becomes unavailable -> Ok(()) /// - the node `origin` no longer has our tenant shard attached -> Ok(()) diff --git a/storage_controller/src/service.rs b/storage_controller/src/service.rs index c349e2b9bf..cc735dc27e 100644 --- a/storage_controller/src/service.rs +++ b/storage_controller/src/service.rs @@ -28,8 +28,8 @@ use crate::{ reconciler::{ReconcileError, ReconcileUnits, ReconcilerConfig, ReconcilerConfigBuilder}, scheduler::{MaySchedule, ScheduleContext, ScheduleError, ScheduleMode}, tenant_shard::{ - MigrateAttachment, ReconcileNeeded, ReconcilerStatus, ScheduleOptimization, - ScheduleOptimizationAction, + MigrateAttachment, ObservedStateDelta, ReconcileNeeded, ReconcilerStatus, + ScheduleOptimization, ScheduleOptimizationAction, }, }; use anyhow::Context; @@ -1072,7 +1072,7 @@ impl Service { tenant_id=%result.tenant_shard_id.tenant_id, shard_id=%result.tenant_shard_id.shard_slug(), sequence=%result.sequence ))] - fn process_result(&self, mut result: ReconcileResult) { + fn process_result(&self, result: ReconcileResult) { let mut locked = self.inner.write().unwrap(); let (nodes, tenants, _scheduler) = 
locked.parts_mut(); let Some(tenant) = tenants.get_mut(&result.tenant_shard_id) else { @@ -1094,22 +1094,27 @@ impl Service { // In case a node was deleted while this reconcile is in flight, filter it out of the update we will // make to the tenant - result - .observed - .locations - .retain(|node_id, _loc| nodes.contains_key(node_id)); + let deltas = result.observed_deltas.into_iter().flat_map(|delta| { + // In case a node was deleted while this reconcile is in flight, filter it out of the update we will + // make to the tenant + let node = nodes.get(delta.node_id())?; + + if node.is_available() { + return Some(delta); + } + + // In case a node became unavailable concurrently with the reconcile, observed + // locations on it are now uncertain. By convention, set them to None in order + // for them to get refreshed when the node comes back online. + Some(ObservedStateDelta::Upsert(Box::new(( + node.get_id(), + ObservedStateLocation { conf: None }, + )))) + }); match result.result { Ok(()) => { - for (node_id, loc) in &result.observed.locations { - if let Some(conf) = &loc.conf { - tracing::info!("Updating observed location {}: {:?}", node_id, conf); - } else { - tracing::info!("Setting observed location {} to None", node_id,) - } - } - - tenant.observed = result.observed; + tenant.apply_observed_deltas(deltas); tenant.waiter.advance(result.sequence); } Err(e) => { @@ -1131,9 +1136,10 @@ impl Service { // so that waiters will see the correct error after waiting. 
tenant.set_last_error(result.sequence, e); - for (node_id, o) in result.observed.locations { - tenant.observed.locations.insert(node_id, o); - } + // Skip deletions on reconcile failures + let upsert_deltas = + deltas.filter(|delta| matches!(delta, ObservedStateDelta::Upsert(_))); + tenant.apply_observed_deltas(upsert_deltas); } } diff --git a/storage_controller/src/tenant_shard.rs b/storage_controller/src/tenant_shard.rs index 2e85580e08..8a7ff866e6 100644 --- a/storage_controller/src/tenant_shard.rs +++ b/storage_controller/src/tenant_shard.rs @@ -425,6 +425,22 @@ pub(crate) enum ReconcileNeeded { Yes, } +/// Pending modification to the observed state of a tenant shard. +/// Produced by [`Reconciler::observed_deltas`] and applied in [`crate::service::Service::process_result`]. +pub(crate) enum ObservedStateDelta { + Upsert(Box<(NodeId, ObservedStateLocation)>), + Delete(NodeId), +} + +impl ObservedStateDelta { + pub(crate) fn node_id(&self) -> &NodeId { + match self { + Self::Upsert(up) => &up.0, + Self::Delete(nid) => nid, + } + } +} + /// When a reconcile task completes, it sends this result object /// to be applied to the primary TenantShard. 
pub(crate) struct ReconcileResult { @@ -437,7 +453,7 @@ pub(crate) struct ReconcileResult { pub(crate) tenant_shard_id: TenantShardId, pub(crate) generation: Option, - pub(crate) observed: ObservedState, + pub(crate) observed_deltas: Vec, /// Set [`TenantShard::pending_compute_notification`] from this flag pub(crate) pending_compute_notification: bool, @@ -1123,7 +1139,7 @@ impl TenantShard { result, tenant_shard_id: reconciler.tenant_shard_id, generation: reconciler.generation, - observed: reconciler.observed, + observed_deltas: reconciler.observed_deltas(), pending_compute_notification: reconciler.compute_notify_failure, } } @@ -1177,6 +1193,7 @@ impl TenantShard { reconciler_config, config: self.config.clone(), observed: self.observed.clone(), + original_observed: self.observed.clone(), compute_hook: compute_hook.clone(), service_config: service_config.clone(), _gate_guard: gate_guard, @@ -1437,6 +1454,62 @@ impl TenantShard { .map(|(node_id, gen)| (node_id, Generation::new(gen))) .collect() } + + /// Update the observed state of the tenant by applying incremental deltas + /// + /// Deltas are generated by reconcilers via [`Reconciler::observed_deltas`]. + /// They are then filtered in [`crate::service::Service::process_result`]. + pub(crate) fn apply_observed_deltas( + &mut self, + deltas: impl Iterator, + ) { + for delta in deltas { + match delta { + ObservedStateDelta::Upsert(ups) => { + let (node_id, loc) = *ups; + + // If the generation of the observed location in the delta is lagging + // behind the current one, then we have a race condition and cannot + // be certain about the true observed state. Set the observed state + // to None in order to reflect this. 
+ let crnt_gen = self + .observed + .locations + .get(&node_id) + .and_then(|loc| loc.conf.as_ref()) + .and_then(|conf| conf.generation); + let new_gen = loc.conf.as_ref().and_then(|conf| conf.generation); + match (crnt_gen, new_gen) { + (Some(crnt), Some(new)) if crnt_gen > new_gen => { + tracing::warn!( + "Skipping observed state update {}: {:?} and using None due to stale generation ({} > {})", + node_id, loc, crnt, new + ); + + self.observed + .locations + .insert(node_id, ObservedStateLocation { conf: None }); + + continue; + } + _ => {} + } + + if let Some(conf) = &loc.conf { + tracing::info!("Updating observed location {}: {:?}", node_id, conf); + } else { + tracing::info!("Setting observed location {} to None", node_id,) + } + + self.observed.locations.insert(node_id, loc); + } + ObservedStateDelta::Delete(node_id) => { + tracing::info!("Deleting observed location {}", node_id); + self.observed.locations.remove(&node_id); + } + } + } + } } #[cfg(test)] From fc7397122cf300370e6ba62e091bdd127a53739e Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 9 Oct 2024 12:11:06 +0100 Subject: [PATCH 07/12] test_runner: fix path to tpc-h queries (#9327) ## Problem The path to TPC-H queries was incorrectly changed in #9306. 
This path is used for `test_tpch` parameterization, so all perf tests started to fail: ``` ==================================== ERRORS ==================================== __________ ERROR collecting test_runner/performance/test_perf_olap.py __________ test_runner/performance/test_perf_olap.py:205: in @pytest.mark.parametrize("query", tpch_queuies()) test_runner/performance/test_perf_olap.py:196: in tpch_queuies assert queries_dir.exists(), f"TPC-H queries dir not found: {queries_dir}" E AssertionError: TPC-H queries dir not found: /__w/neon/neon/test_runner/performance/performance/tpc-h/queries E assert False E + where False = () E + where = PosixPath('/__w/neon/neon/test_runner/performance/performance/tpc-h/queries').exists ``` ## Summary of changes - Fix the path to tpc-h queries --- test_runner/performance/test_perf_olap.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/performance/test_perf_olap.py b/test_runner/performance/test_perf_olap.py index 6dcde91b76..bc4ab64105 100644 --- a/test_runner/performance/test_perf_olap.py +++ b/test_runner/performance/test_perf_olap.py @@ -192,7 +192,7 @@ def tpch_queuies() -> tuple[ParameterSet, ...]: - querues in returning tuple are ordered by the query number - pytest parameters id is adjusted to match the query id (the numbering starts from 1) """ - queries_dir = Path(__file__).parent / "performance" / "tpc-h" / "queries" + queries_dir = Path(__file__).parent / "tpc-h" / "queries" assert queries_dir.exists(), f"TPC-H queries dir not found: {queries_dir}" return tuple( From a1813927385d2629dd608e1b16d428800ce64524 Mon Sep 17 00:00:00 2001 From: Arseny Sher Date: Wed, 9 Oct 2024 14:40:30 +0300 Subject: [PATCH 08/12] safekeeper: add evicted_timelines gauge. (#9318) showing total number of evicted timelines. 
--- safekeeper/src/metrics.rs | 12 +++++-- safekeeper/src/timeline.rs | 8 ++++- safekeeper/src/timeline_eviction.rs | 6 +++- safekeeper/src/timeline_manager.rs | 15 ++++++++- safekeeper/src/timelines_global_map.rs | 12 +++++-- test_runner/regress/test_wal_acceptor.py | 42 ++++++++++++++++++++---- 6 files changed, 81 insertions(+), 14 deletions(-) diff --git a/safekeeper/src/metrics.rs b/safekeeper/src/metrics.rs index aa2bafbe92..e8fdddcdc1 100644 --- a/safekeeper/src/metrics.rs +++ b/safekeeper/src/metrics.rs @@ -12,8 +12,8 @@ use metrics::{ core::{AtomicU64, Collector, Desc, GenericCounter, GenericGaugeVec, Opts}, proto::MetricFamily, register_histogram_vec, register_int_counter, register_int_counter_pair, - register_int_counter_pair_vec, register_int_counter_vec, Gauge, HistogramVec, IntCounter, - IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, + register_int_counter_pair_vec, register_int_counter_vec, register_int_gauge, Gauge, + HistogramVec, IntCounter, IntCounterPair, IntCounterPairVec, IntCounterVec, IntGaugeVec, }; use once_cell::sync::Lazy; @@ -231,6 +231,14 @@ pub(crate) static EVICTION_EVENTS_COMPLETED: Lazy = Lazy::new(|| .expect("Failed to register metric") }); +pub static NUM_EVICTED_TIMELINES: Lazy = Lazy::new(|| { + register_int_gauge!( + "safekeeper_evicted_timelines", + "Number of currently evicted timelines" + ) + .expect("Failed to register metric") +}); + pub const LABEL_UNKNOWN: &str = "unknown"; /// Labels for traffic metrics. diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs index fb98534768..3494b0b764 100644 --- a/safekeeper/src/timeline.rs +++ b/safekeeper/src/timeline.rs @@ -631,13 +631,19 @@ impl Timeline { return Err(e); } - self.bootstrap(conf, broker_active_set, partial_backup_rate_limiter); + self.bootstrap( + shared_state, + conf, + broker_active_set, + partial_backup_rate_limiter, + ); Ok(()) } /// Bootstrap new or existing timeline starting background tasks. 
pub fn bootstrap( self: &Arc, + _shared_state: &mut WriteGuardSharedState<'_>, conf: &SafeKeeperConf, broker_active_set: Arc, partial_backup_rate_limiter: RateLimiter, diff --git a/safekeeper/src/timeline_eviction.rs b/safekeeper/src/timeline_eviction.rs index 5aa4921a92..fae6571277 100644 --- a/safekeeper/src/timeline_eviction.rs +++ b/safekeeper/src/timeline_eviction.rs @@ -15,7 +15,9 @@ use tracing::{debug, info, instrument, warn}; use utils::crashsafe::durable_rename; use crate::{ - metrics::{EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED}, + metrics::{ + EvictionEvent, EVICTION_EVENTS_COMPLETED, EVICTION_EVENTS_STARTED, NUM_EVICTED_TIMELINES, + }, rate_limit::rand_duration, timeline_manager::{Manager, StateSnapshot}, wal_backup, @@ -93,6 +95,7 @@ impl Manager { } info!("successfully evicted timeline"); + NUM_EVICTED_TIMELINES.inc(); } /// Attempt to restore evicted timeline from remote storage; it must be @@ -128,6 +131,7 @@ impl Manager { tokio::time::Instant::now() + rand_duration(&self.conf.eviction_min_resident); info!("successfully restored evicted timeline"); + NUM_EVICTED_TIMELINES.dec(); } } diff --git a/safekeeper/src/timeline_manager.rs b/safekeeper/src/timeline_manager.rs index f5535c0cea..2129e86baa 100644 --- a/safekeeper/src/timeline_manager.rs +++ b/safekeeper/src/timeline_manager.rs @@ -25,7 +25,10 @@ use utils::lsn::Lsn; use crate::{ control_file::{FileStorage, Storage}, - metrics::{MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS}, + metrics::{ + MANAGER_ACTIVE_CHANGES, MANAGER_ITERATIONS_TOTAL, MISC_OPERATION_SECONDS, + NUM_EVICTED_TIMELINES, + }, rate_limit::{rand_duration, RateLimiter}, recovery::recovery_main, remove_wal::calc_horizon_lsn, @@ -251,6 +254,11 @@ pub async fn main_task( mgr.recovery_task = Some(tokio::spawn(recovery_main(tli, mgr.conf.clone()))); } + // If timeline is evicted, reflect that in the metric. 
+ if mgr.is_offloaded { + NUM_EVICTED_TIMELINES.inc(); + } + let last_state = 'outer: loop { MANAGER_ITERATIONS_TOTAL.inc(); @@ -367,6 +375,11 @@ pub async fn main_task( mgr.update_wal_removal_end(res); } + // If timeline is deleted while evicted decrement the gauge. + if mgr.tli.is_cancelled() && mgr.is_offloaded { + NUM_EVICTED_TIMELINES.dec(); + } + mgr.set_status(Status::Finished); } diff --git a/safekeeper/src/timelines_global_map.rs b/safekeeper/src/timelines_global_map.rs index 6662e18817..866cde3339 100644 --- a/safekeeper/src/timelines_global_map.rs +++ b/safekeeper/src/timelines_global_map.rs @@ -165,12 +165,14 @@ impl GlobalTimelines { match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); + let mut shared_state = tli.write_shared_state().await; TIMELINES_STATE .lock() .unwrap() .timelines .insert(ttid, tli.clone()); tli.bootstrap( + &mut shared_state, &conf, broker_active_set.clone(), partial_backup_rate_limiter.clone(), @@ -213,6 +215,7 @@ impl GlobalTimelines { match Timeline::load_timeline(&conf, ttid) { Ok(timeline) => { let tli = Arc::new(timeline); + let mut shared_state = tli.write_shared_state().await; // TODO: prevent concurrent timeline creation/loading { @@ -227,8 +230,13 @@ impl GlobalTimelines { state.timelines.insert(ttid, tli.clone()); } - tli.bootstrap(&conf, broker_active_set, partial_backup_rate_limiter); - + tli.bootstrap( + &mut shared_state, + &conf, + broker_active_set, + partial_backup_rate_limiter, + ); + drop(shared_state); Ok(tli) } // If we can't load a timeline, it's bad. Caller will figure it out. 
diff --git a/test_runner/regress/test_wal_acceptor.py b/test_runner/regress/test_wal_acceptor.py index d372e2d461..d803cd7c78 100644 --- a/test_runner/regress/test_wal_acceptor.py +++ b/test_runner/regress/test_wal_acceptor.py @@ -2314,12 +2314,12 @@ def test_s3_eviction( ] if delete_offloaded_wal: neon_env_builder.safekeeper_extra_opts.append("--delete-offloaded-wal") - - env = neon_env_builder.init_start( - initial_tenant_conf={ - "checkpoint_timeout": "100ms", - } - ) + # make lagging_wal_timeout small to force pageserver quickly forget about + # safekeeper after it stops sending updates (timeline is deactivated) to + # make test faster. Won't be needed with + # https://github.com/neondatabase/neon/issues/8148 fixed. + initial_tenant_conf = {"lagging_wal_timeout": "1s", "checkpoint_timeout": "100ms"} + env = neon_env_builder.init_start(initial_tenant_conf=initial_tenant_conf) n_timelines = 5 @@ -2407,9 +2407,37 @@ def test_s3_eviction( and sk.log_contains("successfully restored evicted timeline") for sk in env.safekeepers ) - assert event_metrics_seen + # test safekeeper_evicted_timelines metric + log.info("testing safekeeper_evicted_timelines metric") + # checkpoint pageserver to force remote_consistent_lsn update + for i in range(n_timelines): + ps_client.timeline_checkpoint(env.initial_tenant, timelines[i], wait_until_uploaded=True) + for ep in endpoints: + log.info(ep.is_running()) + sk = env.safekeepers[0] + + # all timelines must be evicted eventually + def all_evicted(): + n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines") + assert n_evicted # make mypy happy + assert int(n_evicted) == n_timelines + + wait_until(60, 0.5, all_evicted) + # restart should preserve the metric value + sk.stop().start() + wait_until(60, 0.5, all_evicted) + # and endpoint start should reduce is + endpoints[0].start() + + def one_unevicted(): + n_evicted = sk.http_client().get_metric_value("safekeeper_evicted_timelines") + assert n_evicted # make mypy 
happy + assert int(n_evicted) < n_timelines + + wait_until(60, 0.5, one_unevicted) + # Test resetting uploaded partial segment state. def test_backup_partial_reset(neon_env_builder: NeonEnvBuilder): From 63e7fab990602382eab6a9e38d555905d5142ffe Mon Sep 17 00:00:00 2001 From: Anastasia Lubennikova Date: Wed, 9 Oct 2024 13:32:13 +0100 Subject: [PATCH 09/12] Add /installed_extensions endpoint to collect statistics about extension usage. (#8917) Add /installed_extensions endpoint to collect statistics about extension usage. It returns a list of installed extensions in the format: ```json { "extensions": [ { "extname": "extension_name", "versions": ["1.0", "1.1"], "n_databases": 5, } ] } ``` --------- Co-authored-by: Heikki Linnakangas --- compute_tools/src/compute.rs | 22 +++++ compute_tools/src/http/api.rs | 26 ++++++ compute_tools/src/http/openapi_spec.yaml | 32 +++++++ compute_tools/src/installed_extensions.rs | 80 +++++++++++++++++ compute_tools/src/lib.rs | 1 + libs/compute_api/src/responses.rs | 13 +++ test_runner/fixtures/endpoint/http.py | 5 ++ .../regress/test_installed_extensions.py | 87 +++++++++++++++++++ 8 files changed, 266 insertions(+) create mode 100644 compute_tools/src/installed_extensions.rs create mode 100644 test_runner/regress/test_installed_extensions.py diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 3e558a7d3c..285be56264 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -1484,6 +1484,28 @@ LIMIT 100", info!("Pageserver config changed"); } } + + // Gather info about installed extensions + pub fn get_installed_extensions(&self) -> Result<()> { + let connstr = self.connstr.clone(); + + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create runtime"); + let result = rt + .block_on(crate::installed_extensions::get_installed_extensions( + connstr, + )) + .expect("failed to get installed extensions"); + + info!( + "{}", + 
serde_json::to_string(&result).expect("failed to serialize extensions list") + ); + + Ok(()) + } } pub fn forward_termination_signal() { diff --git a/compute_tools/src/http/api.rs b/compute_tools/src/http/api.rs index fade3bbe6d..79e6158081 100644 --- a/compute_tools/src/http/api.rs +++ b/compute_tools/src/http/api.rs @@ -165,6 +165,32 @@ async fn routes(req: Request, compute: &Arc) -> Response { + info!("serving /installed_extensions GET request"); + let status = compute.get_status(); + if status != ComputeStatus::Running { + let msg = format!( + "invalid compute status for extensions request: {:?}", + status + ); + error!(msg); + return Response::new(Body::from(msg)); + } + + let connstr = compute.connstr.clone(); + let res = crate::installed_extensions::get_installed_extensions(connstr).await; + match res { + Ok(res) => render_json(Body::from(serde_json::to_string(&res).unwrap())), + Err(e) => render_json_error( + &format!("could not get list of installed extensions: {}", e), + StatusCode::INTERNAL_SERVER_ERROR, + ), + } + } + // download extension files from remote extension storage on demand (&Method::POST, route) if route.starts_with("/extension_server/") => { info!("serving {:?} POST request", route); diff --git a/compute_tools/src/http/openapi_spec.yaml b/compute_tools/src/http/openapi_spec.yaml index b0ddaeae2b..e9fa66b323 100644 --- a/compute_tools/src/http/openapi_spec.yaml +++ b/compute_tools/src/http/openapi_spec.yaml @@ -53,6 +53,20 @@ paths: schema: $ref: "#/components/schemas/ComputeInsights" + /installed_extensions: + get: + tags: + - Info + summary: Get installed extensions. 
+ description: "" + operationId: getInstalledExtensions + responses: + 200: + description: List of installed extensions + content: + application/json: + schema: + $ref: "#/components/schemas/InstalledExtensions" /info: get: tags: @@ -395,6 +409,24 @@ components: - configuration example: running + InstalledExtensions: + type: object + properties: + extensions: + description: Contains list of installed extensions. + type: array + items: + type: object + properties: + extname: + type: string + versions: + type: array + items: + type: string + n_databases: + type: integer + # # Errors # diff --git a/compute_tools/src/installed_extensions.rs b/compute_tools/src/installed_extensions.rs new file mode 100644 index 0000000000..3d8b22a8a3 --- /dev/null +++ b/compute_tools/src/installed_extensions.rs @@ -0,0 +1,80 @@ +use compute_api::responses::{InstalledExtension, InstalledExtensions}; +use std::collections::HashMap; +use std::collections::HashSet; +use url::Url; + +use anyhow::Result; +use postgres::{Client, NoTls}; +use tokio::task; + +/// We don't reuse get_existing_dbs() just for code clarity +/// and to make database listing query here more explicit. +/// +/// Limit the number of databases to 500 to avoid excessive load. +fn list_dbs(client: &mut Client) -> Result> { + // `pg_database.datconnlimit = -2` means that the database is in the + // invalid state + let databases = client + .query( + "SELECT datname FROM pg_catalog.pg_database + WHERE datallowconn + AND datconnlimit <> - 2 + LIMIT 500", + &[], + )? + .iter() + .map(|row| { + let db: String = row.get("datname"); + db + }) + .collect(); + + Ok(databases) +} + +/// Connect to every database (see list_dbs above) and get the list of installed extensions. +/// Same extension can be installed in multiple databases with different versions, +/// we only keep the highest and lowest version across all databases. 
+pub async fn get_installed_extensions(connstr: Url) -> Result { + let mut connstr = connstr.clone(); + + task::spawn_blocking(move || { + let mut client = Client::connect(connstr.as_str(), NoTls)?; + let databases: Vec = list_dbs(&mut client)?; + + let mut extensions_map: HashMap = HashMap::new(); + for db in databases.iter() { + connstr.set_path(db); + let mut db_client = Client::connect(connstr.as_str(), NoTls)?; + let extensions: Vec<(String, String)> = db_client + .query( + "SELECT extname, extversion FROM pg_catalog.pg_extension;", + &[], + )? + .iter() + .map(|row| (row.get("extname"), row.get("extversion"))) + .collect(); + + for (extname, v) in extensions.iter() { + let version = v.to_string(); + extensions_map + .entry(extname.to_string()) + .and_modify(|e| { + e.versions.insert(version.clone()); + // count the number of databases where the extension is installed + e.n_databases += 1; + }) + .or_insert(InstalledExtension { + extname: extname.to_string(), + versions: HashSet::from([version.clone()]), + n_databases: 1, + }); + } + } + + Ok(InstalledExtensions { + extensions: extensions_map.values().cloned().collect(), + }) + }) + .await? +} diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 477f423aa2..d27ae58fa2 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -15,6 +15,7 @@ pub mod catalog; pub mod compute; pub mod disk_quota; pub mod extension_server; +pub mod installed_extensions; pub mod local_proxy; pub mod lsn_lease; mod migration; diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 3f055b914a..5023fce003 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -1,5 +1,6 @@ //! Structs representing the JSON formats used in the compute_ctl's HTTP API. +use std::collections::HashSet; use std::fmt::Display; use chrono::{DateTime, Utc}; @@ -155,3 +156,15 @@ pub enum ControlPlaneComputeStatus { // should be able to start with provided spec. 
Attached, } + +#[derive(Clone, Debug, Default, Serialize)] +pub struct InstalledExtension { + pub extname: String, + pub versions: HashSet, + pub n_databases: u32, // Number of databases using this extension +} + +#[derive(Clone, Debug, Default, Serialize)] +pub struct InstalledExtensions { + pub extensions: Vec, +} diff --git a/test_runner/fixtures/endpoint/http.py b/test_runner/fixtures/endpoint/http.py index aedd711dbd..26895df8a6 100644 --- a/test_runner/fixtures/endpoint/http.py +++ b/test_runner/fixtures/endpoint/http.py @@ -23,3 +23,8 @@ class EndpointHttpClient(requests.Session): res = self.get(f"http://localhost:{self.port}/database_schema?database={database}") res.raise_for_status() return res.text + + def installed_extensions(self): + res = self.get(f"http://localhost:{self.port}/installed_extensions") + res.raise_for_status() + return res.json() diff --git a/test_runner/regress/test_installed_extensions.py b/test_runner/regress/test_installed_extensions.py new file mode 100644 index 0000000000..4700db85ee --- /dev/null +++ b/test_runner/regress/test_installed_extensions.py @@ -0,0 +1,87 @@ +from logging import info + +from fixtures.neon_fixtures import NeonEnv + + +def test_installed_extensions(neon_simple_env: NeonEnv): + """basic test for the endpoint that returns the list of installed extensions""" + + env = neon_simple_env + + env.create_branch("test_installed_extensions") + + endpoint = env.endpoints.create_start("test_installed_extensions") + + endpoint.safe_psql("CREATE DATABASE test_installed_extensions") + endpoint.safe_psql("CREATE DATABASE test_installed_extensions_2") + + client = endpoint.http_client() + res = client.installed_extensions() + + info("Extensions list: %s", res) + info("Extensions: %s", res["extensions"]) + # 'plpgsql' is a default extension that is always installed. 
+ assert any( + ext["extname"] == "plpgsql" and ext["versions"] == ["1.0"] for ext in res["extensions"] + ), "The 'plpgsql' extension is missing" + + # check that the neon_test_utils extension is not installed + assert not any( + ext["extname"] == "neon_test_utils" for ext in res["extensions"] + ), "The 'neon_test_utils' extension is installed" + + pg_conn = endpoint.connect(dbname="test_installed_extensions") + with pg_conn.cursor() as cur: + cur.execute("CREATE EXTENSION neon_test_utils") + cur.execute( + "SELECT default_version FROM pg_available_extensions WHERE name = 'neon_test_utils'" + ) + res = cur.fetchone() + neon_test_utils_version = res[0] + + with pg_conn.cursor() as cur: + cur.execute("CREATE EXTENSION neon version '1.1'") + + pg_conn_2 = endpoint.connect(dbname="test_installed_extensions_2") + with pg_conn_2.cursor() as cur: + cur.execute("CREATE EXTENSION neon version '1.2'") + + res = client.installed_extensions() + + info("Extensions list: %s", res) + info("Extensions: %s", res["extensions"]) + + # check that the neon_test_utils extension is installed only in 1 database + # and has the expected version + assert any( + ext["extname"] == "neon_test_utils" + and ext["versions"] == [neon_test_utils_version] + and ext["n_databases"] == 1 + for ext in res["extensions"] + ) + + # check that the plpgsql extension is installed in all databases + # this is a default extension that is always installed + assert any(ext["extname"] == "plpgsql" and ext["n_databases"] == 4 for ext in res["extensions"]) + + # check that the neon extension is installed and has expected versions + for ext in res["extensions"]: + if ext["extname"] == "neon": + assert ext["n_databases"] == 2 + ext["versions"].sort() + assert ext["versions"] == ["1.1", "1.2"] + + with pg_conn.cursor() as cur: + cur.execute("ALTER EXTENSION neon UPDATE TO '1.3'") + + res = client.installed_extensions() + + info("Extensions list: %s", res) + info("Extensions: %s", res["extensions"]) + + # check that the 
neon extension is updated + for ext in res["extensions"]: + if ext["extname"] == "neon": + assert ext["n_databases"] == 2 + ext["versions"].sort() + assert ext["versions"] == ["1.2", "1.3"] From bee04b8a6991277f58ec1a10ba8a020e1d326462 Mon Sep 17 00:00:00 2001 From: Yuchen Liang <70461588+yliang412@users.noreply.github.com> Date: Wed, 9 Oct 2024 08:33:07 -0400 Subject: [PATCH 10/12] pageserver: add direct io config to virtual file (#9214) ## Problem We need a way to incrementally switch to direct IO. During the rollout we might want to switch to O_DIRECT on image and delta layer read path first before others. ## Summary of changes - Revisited and simplified direct io config in `PageserverConf`. - We could add a fallback mode for open, but for read there isn't a reasonable alternative (without creating another buffered virtual file). - Added a wrapper around `VirtualFile`, current implementation becomes `VirtualFileInner` - Use `open_v2`, `create_v2`, `open_with_options_v2` when we want to use the IO mode specified in PS config. - Once we onboard all IO through VirtualFile using this new API, we will delete the old code path. - Make io mode live configurable for benchmarking. - Only guaranteed for files opened after the config change, so do it before the experiment. As an example, we are using `open_v2` with `virtual_file::IoMode::Direct` in https://github.com/neondatabase/neon/pull/9169 We also remove `io_buffer_alignment` config in a04cfd754b2b49099e32f6f24ac3d78acbaffd71 and use it as a compile-time constant. This way we don't have to carry the alignment around or make frequent calls to retrieve this information from the static variable.
Signed-off-by: Yuchen Liang --- libs/pageserver_api/src/config.rs | 8 +- libs/pageserver_api/src/models.rs | 75 +++-- pageserver/benches/bench_ingest.rs | 6 +- pageserver/client/src/mgmt_api.rs | 11 +- pageserver/ctl/src/layer_map_analyzer.rs | 6 +- pageserver/ctl/src/layers.rs | 8 +- pageserver/ctl/src/main.rs | 8 +- .../pagebench/src/cmd/getpage_latest_lsn.rs | 8 +- pageserver/src/bin/pageserver.rs | 9 +- pageserver/src/config.rs | 10 +- pageserver/src/http/routes.rs | 15 +- pageserver/src/tenant/ephemeral_file.rs | 6 +- .../src/tenant/storage_layer/delta_layer.rs | 12 +- .../src/tenant/storage_layer/image_layer.rs | 6 +- pageserver/src/tenant/vectored_blob_io.rs | 112 +++----- pageserver/src/virtual_file.rs | 264 ++++++++++++++---- test_runner/fixtures/neon_fixtures.py | 17 +- test_runner/fixtures/parametrize.py | 4 +- 18 files changed, 335 insertions(+), 250 deletions(-) diff --git a/libs/pageserver_api/src/config.rs b/libs/pageserver_api/src/config.rs index 105c8a50d3..24474d4840 100644 --- a/libs/pageserver_api/src/config.rs +++ b/libs/pageserver_api/src/config.rs @@ -104,8 +104,7 @@ pub struct ConfigToml { pub image_compression: ImageCompressionAlgorithm, pub ephemeral_bytes_per_memory_kb: usize, pub l0_flush: Option, - pub virtual_file_direct_io: crate::models::virtual_file::DirectIoMode, - pub io_buffer_alignment: usize, + pub virtual_file_io_mode: Option, } #[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)] @@ -388,10 +387,7 @@ impl Default for ConfigToml { image_compression: (DEFAULT_IMAGE_COMPRESSION), ephemeral_bytes_per_memory_kb: (DEFAULT_EPHEMERAL_BYTES_PER_MEMORY_KB), l0_flush: None, - virtual_file_direct_io: crate::models::virtual_file::DirectIoMode::default(), - - io_buffer_alignment: DEFAULT_IO_BUFFER_ALIGNMENT, - + virtual_file_io_mode: None, tenant_config: TenantConfigToml::default(), } } diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs index 45abda0ad8..3ec9cac2c3 100644 --- 
a/libs/pageserver_api/src/models.rs +++ b/libs/pageserver_api/src/models.rs @@ -972,8 +972,6 @@ pub struct TopTenantShardsResponse { } pub mod virtual_file { - use std::path::PathBuf; - #[derive( Copy, Clone, @@ -994,50 +992,45 @@ pub mod virtual_file { } /// Direct IO modes for a pageserver. - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(tag = "mode", rename_all = "kebab-case", deny_unknown_fields)] - pub enum DirectIoMode { - /// Direct IO disabled (uses usual buffered IO). - #[default] - Disabled, - /// Direct IO disabled (performs checks and perf simulations). - Evaluate { - /// Alignment check level - alignment_check: DirectIoAlignmentCheckLevel, - /// Latency padded for performance simulation. - latency_padding: DirectIoLatencyPadding, - }, - /// Direct IO enabled. - Enabled { - /// Actions to perform on alignment error. - on_alignment_error: DirectIoOnAlignmentErrorAction, - }, + #[derive( + Copy, + Clone, + PartialEq, + Eq, + Hash, + strum_macros::EnumString, + strum_macros::Display, + serde_with::DeserializeFromStr, + serde_with::SerializeDisplay, + Debug, + )] + #[strum(serialize_all = "kebab-case")] + #[repr(u8)] + pub enum IoMode { + /// Uses buffered IO. + Buffered, + /// Uses direct IO, error out if the operation fails. 
+ #[cfg(target_os = "linux")] + Direct, } - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(rename_all = "kebab-case")] - pub enum DirectIoAlignmentCheckLevel { - #[default] - Error, - Log, - None, + impl IoMode { + pub const fn preferred() -> Self { + Self::Buffered + } } - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(rename_all = "kebab-case")] - pub enum DirectIoOnAlignmentErrorAction { - Error, - #[default] - FallbackToBuffered, - } + impl TryFrom for IoMode { + type Error = u8; - #[derive(Debug, PartialEq, Eq, Clone, serde::Deserialize, serde::Serialize, Default)] - #[serde(tag = "type", rename_all = "kebab-case")] - pub enum DirectIoLatencyPadding { - /// Pad virtual file operations with IO to a fake file. - FakeFileRW { path: PathBuf }, - #[default] - None, + fn try_from(value: u8) -> Result { + Ok(match value { + v if v == (IoMode::Buffered as u8) => IoMode::Buffered, + #[cfg(target_os = "linux")] + v if v == (IoMode::Direct as u8) => IoMode::Direct, + x => return Err(x), + }) + } } } diff --git a/pageserver/benches/bench_ingest.rs b/pageserver/benches/bench_ingest.rs index 72cbb6beab..821c8008a9 100644 --- a/pageserver/benches/bench_ingest.rs +++ b/pageserver/benches/bench_ingest.rs @@ -164,11 +164,7 @@ fn criterion_benchmark(c: &mut Criterion) { let conf: &'static PageServerConf = Box::leak(Box::new( pageserver::config::PageServerConf::dummy_conf(temp_dir.path().to_path_buf()), )); - virtual_file::init( - 16384, - virtual_file::io_engine_for_bench(), - pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - ); + virtual_file::init(16384, virtual_file::io_engine_for_bench()); page_cache::init(conf.page_cache_size); { diff --git a/pageserver/client/src/mgmt_api.rs b/pageserver/client/src/mgmt_api.rs index 592f1ded0d..4d76c66905 100644 --- a/pageserver/client/src/mgmt_api.rs +++ b/pageserver/client/src/mgmt_api.rs @@ -540,10 +540,13 @@ impl Client { 
.map_err(Error::ReceiveBody) } - /// Configs io buffer alignment at runtime. - pub async fn put_io_alignment(&self, align: usize) -> Result<()> { - let uri = format!("{}/v1/io_alignment", self.mgmt_api_endpoint); - self.request(Method::PUT, uri, align) + /// Configs io mode at runtime. + pub async fn put_io_mode( + &self, + mode: &pageserver_api::models::virtual_file::IoMode, + ) -> Result<()> { + let uri = format!("{}/v1/io_mode", self.mgmt_api_endpoint); + self.request(Method::PUT, uri, mode) .await? .json() .await diff --git a/pageserver/ctl/src/layer_map_analyzer.rs b/pageserver/ctl/src/layer_map_analyzer.rs index adc090823d..151b94cf62 100644 --- a/pageserver/ctl/src/layer_map_analyzer.rs +++ b/pageserver/ctl/src/layer_map_analyzer.rs @@ -152,11 +152,7 @@ pub(crate) async fn main(cmd: &AnalyzeLayerMapCmd) -> Result<()> { let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); // Initialize virtual_file (file desriptor cache) and page cache which are needed to access layer persistent B-Tree. 
- pageserver::virtual_file::init( - 10, - virtual_file::api::IoEngineKind::StdFs, - pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - ); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let mut total_delta_layers = 0usize; diff --git a/pageserver/ctl/src/layers.rs b/pageserver/ctl/src/layers.rs index dd753398e2..fd948bf2ef 100644 --- a/pageserver/ctl/src/layers.rs +++ b/pageserver/ctl/src/layers.rs @@ -59,7 +59,7 @@ pub(crate) enum LayerCmd { async fn read_delta_file(path: impl AsRef, ctx: &RequestContext) -> Result<()> { let path = Utf8Path::from_path(path.as_ref()).expect("non-Unicode path"); - virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs, 1); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let file = VirtualFile::open(path, ctx).await?; let file_id = page_cache::next_file_id(); @@ -190,11 +190,7 @@ pub(crate) async fn main(cmd: &LayerCmd) -> Result<()> { new_tenant_id, new_timeline_id, } => { - pageserver::virtual_file::init( - 10, - virtual_file::api::IoEngineKind::StdFs, - pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, - ); + pageserver::virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); pageserver::page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); diff --git a/pageserver/ctl/src/main.rs b/pageserver/ctl/src/main.rs index cf001ef0d5..c96664d346 100644 --- a/pageserver/ctl/src/main.rs +++ b/pageserver/ctl/src/main.rs @@ -26,7 +26,7 @@ use pageserver::{ tenant::{dump_layerfile_from_path, metadata::TimelineMetadata}, virtual_file, }; -use pageserver_api::{config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT, shard::TenantShardId}; +use pageserver_api::shard::TenantShardId; use postgres_ffi::ControlFileData; use remote_storage::{RemotePath, RemoteStorageConfig}; use tokio_util::sync::CancellationToken; @@ -205,11 +205,7 @@ fn 
read_pg_control_file(control_file_path: &Utf8Path) -> anyhow::Result<()> { async fn print_layerfile(path: &Utf8Path) -> anyhow::Result<()> { // Basic initialization of things that don't change after startup - virtual_file::init( - 10, - virtual_file::api::IoEngineKind::StdFs, - DEFAULT_IO_BUFFER_ALIGNMENT, - ); + virtual_file::init(10, virtual_file::api::IoEngineKind::StdFs); page_cache::init(100); let ctx = RequestContext::new(TaskKind::DebugTool, DownloadBehavior::Error); dump_layerfile_from_path(path, true, &ctx).await diff --git a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs index ac4a732377..b2df01714d 100644 --- a/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs +++ b/pageserver/pagebench/src/cmd/getpage_latest_lsn.rs @@ -59,9 +59,9 @@ pub(crate) struct Args { #[clap(long)] set_io_engine: Option, - /// Before starting the benchmark, live-reconfigure the pageserver to use specified alignment for io buffers. + /// Before starting the benchmark, live-reconfigure the pageserver to use specified io mode (buffered vs. direct). 
#[clap(long)] - set_io_alignment: Option, + set_io_mode: Option, targets: Option>, } @@ -129,8 +129,8 @@ async fn main_impl( mgmt_api_client.put_io_engine(engine_str).await?; } - if let Some(align) = args.set_io_alignment { - mgmt_api_client.put_io_alignment(align).await?; + if let Some(mode) = &args.set_io_mode { + mgmt_api_client.put_io_mode(mode).await?; } // discover targets diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 593ca6db2d..f71a3d2653 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -125,8 +125,7 @@ fn main() -> anyhow::Result<()> { // after setting up logging, log the effective IO engine choice and read path implementations info!(?conf.virtual_file_io_engine, "starting with virtual_file IO engine"); - info!(?conf.virtual_file_direct_io, "starting with virtual_file Direct IO settings"); - info!(?conf.io_buffer_alignment, "starting with setting for IO buffer alignment"); + info!(?conf.virtual_file_io_mode, "starting with virtual_file IO mode"); // The tenants directory contains all the pageserver local disk state. // Create if not exists and make sure all the contents are durable before proceeding. 
@@ -168,11 +167,7 @@ fn main() -> anyhow::Result<()> { let scenario = failpoint_support::init(); // Basic initialization of things that don't change after startup - virtual_file::init( - conf.max_file_descriptors, - conf.virtual_file_io_engine, - conf.io_buffer_alignment, - ); + virtual_file::init(conf.max_file_descriptors, conf.virtual_file_io_engine); page_cache::init(conf.page_cache_size); start_pageserver(launch_ts, conf).context("Failed to start pageserver")?; diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index e15f1c791b..8db78285e4 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -174,9 +174,7 @@ pub struct PageServerConf { pub l0_flush: crate::l0_flush::L0FlushConfig, /// Direct IO settings - pub virtual_file_direct_io: virtual_file::DirectIoMode, - - pub io_buffer_alignment: usize, + pub virtual_file_io_mode: virtual_file::IoMode, } /// Token for authentication to safekeepers @@ -325,11 +323,10 @@ impl PageServerConf { image_compression, ephemeral_bytes_per_memory_kb, l0_flush, - virtual_file_direct_io, + virtual_file_io_mode, concurrent_tenant_warmup, concurrent_tenant_size_logical_size_queries, virtual_file_io_engine, - io_buffer_alignment, tenant_config, } = config_toml; @@ -368,8 +365,6 @@ impl PageServerConf { max_vectored_read_bytes, image_compression, ephemeral_bytes_per_memory_kb, - virtual_file_direct_io, - io_buffer_alignment, // ------------------------------------------------------------ // fields that require additional validation or custom handling @@ -408,6 +403,7 @@ impl PageServerConf { l0_flush: l0_flush .map(crate::l0_flush::L0FlushConfig::from) .unwrap_or_default(), + virtual_file_io_mode: virtual_file_io_mode.unwrap_or(virtual_file::IoMode::preferred()), }; // ------------------------------------------------------------ diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs index 5fc8272074..2985ab1efb 100644 --- a/pageserver/src/http/routes.rs +++ 
b/pageserver/src/http/routes.rs @@ -17,6 +17,7 @@ use hyper::header; use hyper::StatusCode; use hyper::{Body, Request, Response, Uri}; use metrics::launch_timestamp::LaunchTimestamp; +use pageserver_api::models::virtual_file::IoMode; use pageserver_api::models::AuxFilePolicy; use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest; use pageserver_api::models::IngestAuxFilesRequest; @@ -2381,17 +2382,13 @@ async fn put_io_engine_handler( json_response(StatusCode::OK, ()) } -async fn put_io_alignment_handler( +async fn put_io_mode_handler( mut r: Request, _cancel: CancellationToken, ) -> Result, ApiError> { check_permission(&r, None)?; - let align: usize = json_request(&mut r).await?; - crate::virtual_file::set_io_buffer_alignment(align).map_err(|align| { - ApiError::PreconditionFailed( - format!("Requested io alignment ({align}) is not a power of two").into(), - ) - })?; + let mode: IoMode = json_request(&mut r).await?; + crate::virtual_file::set_io_mode(mode); json_response(StatusCode::OK, ()) } @@ -3082,9 +3079,7 @@ pub fn make_router( |r| api_handler(r, timeline_collect_keyspace), ) .put("/v1/io_engine", |r| api_handler(r, put_io_engine_handler)) - .put("/v1/io_alignment", |r| { - api_handler(r, put_io_alignment_handler) - }) + .put("/v1/io_mode", |r| api_handler(r, put_io_mode_handler)) .put( "/v1/tenant/:tenant_shard_id/timeline/:timeline_id/force_aux_policy_switch", |r| api_handler(r, force_aux_policy_switch_handler), diff --git a/pageserver/src/tenant/ephemeral_file.rs b/pageserver/src/tenant/ephemeral_file.rs index 5324e1807d..a62a47f9a7 100644 --- a/pageserver/src/tenant/ephemeral_file.rs +++ b/pageserver/src/tenant/ephemeral_file.rs @@ -84,7 +84,7 @@ impl Drop for EphemeralFile { fn drop(&mut self) { // unlink the file // we are clear to do this, because we have entered a gate - let path = &self.buffered_writer.as_inner().as_inner().path; + let path = self.buffered_writer.as_inner().as_inner().path(); let res = std::fs::remove_file(path); if let 
Err(e) = res { if e.kind() != std::io::ErrorKind::NotFound { @@ -356,7 +356,7 @@ mod tests { } let file_contents = - std::fs::read(&file.buffered_writer.as_inner().as_inner().path).unwrap(); + std::fs::read(file.buffered_writer.as_inner().as_inner().path()).unwrap(); assert_eq!(file_contents, &content[0..cap]); let buffer_contents = file.buffered_writer.inspect_buffer(); @@ -392,7 +392,7 @@ mod tests { .buffered_writer .as_inner() .as_inner() - .path + .path() .metadata() .unwrap(); assert_eq!( diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 2acad666b8..8be7d7876f 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -573,7 +573,7 @@ impl DeltaLayerWriterInner { ensure!( metadata.len() <= S3_UPLOAD_LIMIT, "Created delta layer file at {} of size {} above limit {S3_UPLOAD_LIMIT}!", - file.path, + file.path(), metadata.len() ); @@ -791,7 +791,7 @@ impl DeltaLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open(path, ctx) + let file = VirtualFile::open_v2(path, ctx) .await .context("open layer file")?; @@ -1022,7 +1022,7 @@ impl DeltaLayerInner { blob_meta.key, PageReconstructError::Other(anyhow!( "Failed to read blobs from virtual file {}: {}", - self.file.path, + self.file.path(), kind )), ); @@ -1048,7 +1048,7 @@ impl DeltaLayerInner { meta.meta.key, PageReconstructError::Other(anyhow!(e).context(format!( "Failed to decompress blob from virtual file {}", - self.file.path, + self.file.path(), ))), ); @@ -1066,7 +1066,7 @@ impl DeltaLayerInner { meta.meta.key, PageReconstructError::Other(anyhow!(e).context(format!( "Failed to deserialize blob from virtual file {}", - self.file.path, + self.file.path(), ))), ); @@ -1198,7 +1198,6 @@ impl DeltaLayerInner { let mut prev: Option<(Key, Lsn, BlobRef)> = None; let mut read_builder: Option = None; - let align = 
virtual_file::get_io_buffer_alignment(); let max_read_size = self .max_vectored_read_bytes @@ -1247,7 +1246,6 @@ impl DeltaLayerInner { offsets.end.pos(), meta, max_read_size, - align, )) } } else { diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 9b53fa9e18..de8155f455 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -389,7 +389,7 @@ impl ImageLayerInner { max_vectored_read_bytes: Option, ctx: &RequestContext, ) -> anyhow::Result { - let file = VirtualFile::open(path, ctx) + let file = VirtualFile::open_v2(path, ctx) .await .context("open layer file")?; let file_id = page_cache::next_file_id(); @@ -626,7 +626,7 @@ impl ImageLayerInner { meta.meta.key, PageReconstructError::Other(anyhow!(e).context(format!( "Failed to decompress blob from virtual file {}", - self.file.path, + self.file.path(), ))), ); @@ -647,7 +647,7 @@ impl ImageLayerInner { blob_meta.key, PageReconstructError::from(anyhow!( "Failed to read blobs from virtual file {}: {}", - self.file.path, + self.file.path(), kind )), ); diff --git a/pageserver/src/tenant/vectored_blob_io.rs b/pageserver/src/tenant/vectored_blob_io.rs index 1faa6bab99..792c769b4f 100644 --- a/pageserver/src/tenant/vectored_blob_io.rs +++ b/pageserver/src/tenant/vectored_blob_io.rs @@ -194,8 +194,6 @@ pub(crate) struct ChunkedVectoredReadBuilder { /// Start offset and metadata for each blob in this read blobs_at: VecMap, max_read_size: Option, - /// Chunk size reads are coalesced into. - chunk_size: usize, } /// Computes x / d rounded up. @@ -204,6 +202,7 @@ fn div_round_up(x: usize, d: usize) -> usize { } impl ChunkedVectoredReadBuilder { + const CHUNK_SIZE: usize = virtual_file::get_io_buffer_alignment(); /// Start building a new vectored read. 
/// /// Note that by design, this does not check against reading more than `max_read_size` to @@ -214,21 +213,19 @@ impl ChunkedVectoredReadBuilder { end_offset: u64, meta: BlobMeta, max_read_size: Option, - chunk_size: usize, ) -> Self { let mut blobs_at = VecMap::default(); blobs_at .append(start_offset, meta) .expect("First insertion always succeeds"); - let start_blk_no = start_offset as usize / chunk_size; - let end_blk_no = div_round_up(end_offset as usize, chunk_size); + let start_blk_no = start_offset as usize / Self::CHUNK_SIZE; + let end_blk_no = div_round_up(end_offset as usize, Self::CHUNK_SIZE); Self { start_blk_no, end_blk_no, blobs_at, max_read_size, - chunk_size, } } @@ -237,18 +234,12 @@ impl ChunkedVectoredReadBuilder { end_offset: u64, meta: BlobMeta, max_read_size: usize, - align: usize, ) -> Self { - Self::new_impl(start_offset, end_offset, meta, Some(max_read_size), align) + Self::new_impl(start_offset, end_offset, meta, Some(max_read_size)) } - pub(crate) fn new_streaming( - start_offset: u64, - end_offset: u64, - meta: BlobMeta, - align: usize, - ) -> Self { - Self::new_impl(start_offset, end_offset, meta, None, align) + pub(crate) fn new_streaming(start_offset: u64, end_offset: u64, meta: BlobMeta) -> Self { + Self::new_impl(start_offset, end_offset, meta, None) } /// Attempts to extend the current read with a new blob if the new blob resides in the same or the immediate next chunk. @@ -256,12 +247,12 @@ impl ChunkedVectoredReadBuilder { /// The resulting size also must be below the max read size. 
pub(crate) fn extend(&mut self, start: u64, end: u64, meta: BlobMeta) -> VectoredReadExtended { tracing::trace!(start, end, "trying to extend"); - let start_blk_no = start as usize / self.chunk_size; - let end_blk_no = div_round_up(end as usize, self.chunk_size); + let start_blk_no = start as usize / Self::CHUNK_SIZE; + let end_blk_no = div_round_up(end as usize, Self::CHUNK_SIZE); let not_limited_by_max_read_size = { if let Some(max_read_size) = self.max_read_size { - let coalesced_size = (end_blk_no - self.start_blk_no) * self.chunk_size; + let coalesced_size = (end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE; coalesced_size <= max_read_size } else { true @@ -292,12 +283,12 @@ impl ChunkedVectoredReadBuilder { } pub(crate) fn size(&self) -> usize { - (self.end_blk_no - self.start_blk_no) * self.chunk_size + (self.end_blk_no - self.start_blk_no) * Self::CHUNK_SIZE } pub(crate) fn build(self) -> VectoredRead { - let start = (self.start_blk_no * self.chunk_size) as u64; - let end = (self.end_blk_no * self.chunk_size) as u64; + let start = (self.start_blk_no * Self::CHUNK_SIZE) as u64; + let end = (self.end_blk_no * Self::CHUNK_SIZE) as u64; VectoredRead { start, end, @@ -328,18 +319,14 @@ pub struct VectoredReadPlanner { prev: Option<(Key, Lsn, u64, BlobFlag)>, max_read_size: usize, - - align: usize, } impl VectoredReadPlanner { pub fn new(max_read_size: usize) -> Self { - let align = virtual_file::get_io_buffer_alignment(); Self { blobs: BTreeMap::new(), prev: None, max_read_size, - align, } } @@ -418,7 +405,6 @@ impl VectoredReadPlanner { end_offset, BlobMeta { key, lsn }, self.max_read_size, - self.align, ); let prev_read_builder = current_read_builder.replace(next_read_builder); @@ -472,13 +458,13 @@ impl<'a> VectoredBlobReader<'a> { ); if cfg!(debug_assertions) { - let align = virtual_file::get_io_buffer_alignment() as u64; + const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; debug_assert_eq!( - read.start % align, + read.start % ALIGN, 0, 
"Read start at {} does not satisfy the required io buffer alignment ({} bytes)", read.start, - align + ALIGN ); } @@ -553,22 +539,18 @@ pub struct StreamingVectoredReadPlanner { max_cnt: usize, /// Size of the current batch cnt: usize, - - align: usize, } impl StreamingVectoredReadPlanner { pub fn new(max_read_size: u64, max_cnt: usize) -> Self { assert!(max_cnt > 0); assert!(max_read_size > 0); - let align = virtual_file::get_io_buffer_alignment(); Self { read_builder: None, prev: None, max_cnt, max_read_size, cnt: 0, - align, } } @@ -621,7 +603,6 @@ impl StreamingVectoredReadPlanner { start_offset, end_offset, BlobMeta { key, lsn }, - self.align, )) }; } @@ -656,9 +637,9 @@ mod tests { use super::*; fn validate_read(read: &VectoredRead, offset_range: &[(Key, Lsn, u64, BlobFlag)]) { - let align = virtual_file::get_io_buffer_alignment() as u64; - assert_eq!(read.start % align, 0); - assert_eq!(read.start / align, offset_range.first().unwrap().2 / align); + const ALIGN: u64 = virtual_file::get_io_buffer_alignment() as u64; + assert_eq!(read.start % ALIGN, 0); + assert_eq!(read.start / ALIGN, offset_range.first().unwrap().2 / ALIGN); let expected_offsets_in_read: Vec<_> = offset_range.iter().map(|o| o.2).collect(); @@ -676,32 +657,27 @@ mod tests { fn planner_chunked_coalesce_all_test() { use crate::virtual_file; - let chunk_size = virtual_file::get_io_buffer_alignment() as u64; + const CHUNK_SIZE: u64 = virtual_file::get_io_buffer_alignment() as u64; - // The test explicitly does not check chunk size < 512 - if chunk_size < 512 { - return; - } - - let max_read_size = chunk_size as usize * 8; + let max_read_size = CHUNK_SIZE as usize * 8; let key = Key::MIN; let lsn = Lsn(0); let blob_descriptions = [ - (key, lsn, chunk_size / 8, BlobFlag::None), // Read 1 BEGIN - (key, lsn, chunk_size / 4, BlobFlag::Ignore), // Gap - (key, lsn, chunk_size / 2, BlobFlag::None), - (key, lsn, chunk_size - 2, BlobFlag::Ignore), // Gap - (key, lsn, chunk_size, BlobFlag::None), - (key, 
lsn, chunk_size * 2 - 1, BlobFlag::None), - (key, lsn, chunk_size * 2 + 1, BlobFlag::Ignore), // Gap - (key, lsn, chunk_size * 3 + 1, BlobFlag::None), - (key, lsn, chunk_size * 5 + 1, BlobFlag::None), - (key, lsn, chunk_size * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. - (key, lsn, chunk_size * 7 + 1, BlobFlag::None), - (key, lsn, chunk_size * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) - (key, lsn, chunk_size * 9, BlobFlag::Ignore), // ==== skipped a chunk - (key, lsn, chunk_size * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) + (key, lsn, CHUNK_SIZE / 8, BlobFlag::None), // Read 1 BEGIN + (key, lsn, CHUNK_SIZE / 4, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE / 2, BlobFlag::None), + (key, lsn, CHUNK_SIZE - 2, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 2 - 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 2 + 1, BlobFlag::Ignore), // Gap + (key, lsn, CHUNK_SIZE * 3 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 5 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 6 + 1, BlobFlag::Ignore), // skipped chunk size, but not a chunk: should coalesce. 
+ (key, lsn, CHUNK_SIZE * 7 + 1, BlobFlag::None), + (key, lsn, CHUNK_SIZE * 8, BlobFlag::None), // Read 2 BEGIN (b/c max_read_size) + (key, lsn, CHUNK_SIZE * 9, BlobFlag::Ignore), // ==== skipped a chunk + (key, lsn, CHUNK_SIZE * 10, BlobFlag::None), // Read 3 BEGIN (cannot coalesce) ]; let ranges = [ @@ -780,19 +756,19 @@ mod tests { #[test] fn planner_replacement_test() { - let chunk_size = virtual_file::get_io_buffer_alignment() as u64; - let max_read_size = 128 * chunk_size as usize; + const CHUNK_SIZE: u64 = virtual_file::get_io_buffer_alignment() as u64; + let max_read_size = 128 * CHUNK_SIZE as usize; let first_key = Key::MIN; let second_key = first_key.next(); let lsn = Lsn(0); let blob_descriptions = vec![ (first_key, lsn, 0, BlobFlag::None), // First in read 1 - (first_key, lsn, chunk_size, BlobFlag::None), // Last in read 1 - (second_key, lsn, 2 * chunk_size, BlobFlag::ReplaceAll), - (second_key, lsn, 3 * chunk_size, BlobFlag::None), - (second_key, lsn, 4 * chunk_size, BlobFlag::ReplaceAll), // First in read 2 - (second_key, lsn, 5 * chunk_size, BlobFlag::None), // Last in read 2 + (first_key, lsn, CHUNK_SIZE, BlobFlag::None), // Last in read 1 + (second_key, lsn, 2 * CHUNK_SIZE, BlobFlag::ReplaceAll), + (second_key, lsn, 3 * CHUNK_SIZE, BlobFlag::None), + (second_key, lsn, 4 * CHUNK_SIZE, BlobFlag::ReplaceAll), // First in read 2 + (second_key, lsn, 5 * CHUNK_SIZE, BlobFlag::None), // Last in read 2 ]; let ranges = [&blob_descriptions[0..2], &blob_descriptions[4..]]; @@ -802,7 +778,7 @@ mod tests { planner.handle(key, lsn, offset, flag); } - planner.handle_range_end(6 * chunk_size); + planner.handle_range_end(6 * CHUNK_SIZE); let reads = planner.finish(); assert_eq!(reads.len(), 2); @@ -947,7 +923,6 @@ mod tests { let reserved_bytes = blobs.iter().map(|bl| bl.len()).max().unwrap() * 2 + 16; let mut buf = BytesMut::with_capacity(reserved_bytes); - let align = virtual_file::get_io_buffer_alignment(); let vectored_blob_reader = 
VectoredBlobReader::new(&file); let meta = BlobMeta { key: Key::MIN, @@ -959,8 +934,7 @@ mod tests { if idx + 1 == offsets.len() { continue; } - let read_builder = - ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096, align); + let read_builder = ChunkedVectoredReadBuilder::new(*offset, *end, meta, 16 * 4096); let read = read_builder.build(); let result = vectored_blob_reader.read_blobs(&read, buf, &ctx).await?; assert_eq!(result.blobs.len(), 1); diff --git a/pageserver/src/virtual_file.rs b/pageserver/src/virtual_file.rs index 5b7b279888..d260116b38 100644 --- a/pageserver/src/virtual_file.rs +++ b/pageserver/src/virtual_file.rs @@ -23,10 +23,12 @@ use pageserver_api::config::defaults::DEFAULT_IO_BUFFER_ALIGNMENT; use pageserver_api::shard::TenantShardId; use std::fs::File; use std::io::{Error, ErrorKind, Seek, SeekFrom}; +#[cfg(target_os = "linux")] +use std::os::unix::fs::OpenOptionsExt; use tokio_epoll_uring::{BoundedBuf, IoBuf, IoBufMut, Slice}; use std::os::fd::{AsRawFd, FromRawFd, IntoRawFd, OwnedFd, RawFd}; -use std::sync::atomic::{AtomicBool, AtomicUsize, Ordering}; +use std::sync::atomic::{AtomicBool, AtomicU8, AtomicUsize, Ordering}; use tokio::sync::{RwLock, RwLockReadGuard, RwLockWriteGuard}; use tokio::time::Instant; @@ -38,7 +40,7 @@ pub use io_engine::FeatureTestResult as IoEngineFeatureTestResult; mod metadata; mod open_options; use self::owned_buffers_io::write::OwnedAsyncWriter; -pub(crate) use api::DirectIoMode; +pub(crate) use api::IoMode; pub(crate) use io_engine::IoEngineKind; pub(crate) use metadata::Metadata; pub(crate) use open_options::*; @@ -61,6 +63,171 @@ pub(crate) mod owned_buffers_io { } } +#[derive(Debug)] +pub struct VirtualFile { + inner: VirtualFileInner, + _mode: IoMode, +} + +impl VirtualFile { + /// Open a file in read-only mode. Like File::open. 
+ pub async fn open>( + path: P, + ctx: &RequestContext, + ) -> Result { + let inner = VirtualFileInner::open(path, ctx).await?; + Ok(VirtualFile { + inner, + _mode: IoMode::Buffered, + }) + } + + /// Open a file in read-only mode. Like File::open. + /// + /// `O_DIRECT` will be enabled base on `virtual_file_io_mode`. + pub async fn open_v2>( + path: P, + ctx: &RequestContext, + ) -> Result { + Self::open_with_options_v2(path.as_ref(), OpenOptions::new().read(true), ctx).await + } + + pub async fn create>( + path: P, + ctx: &RequestContext, + ) -> Result { + let inner = VirtualFileInner::create(path, ctx).await?; + Ok(VirtualFile { + inner, + _mode: IoMode::Buffered, + }) + } + + pub async fn create_v2>( + path: P, + ctx: &RequestContext, + ) -> Result { + VirtualFile::open_with_options_v2( + path.as_ref(), + OpenOptions::new().write(true).create(true).truncate(true), + ctx, + ) + .await + } + + pub async fn open_with_options>( + path: P, + open_options: &OpenOptions, + ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ) -> Result { + let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; + Ok(VirtualFile { + inner, + _mode: IoMode::Buffered, + }) + } + + pub async fn open_with_options_v2>( + path: P, + open_options: &OpenOptions, + ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ + ) -> Result { + let file = match get_io_mode() { + IoMode::Buffered => { + let inner = VirtualFileInner::open_with_options(path, open_options, ctx).await?; + VirtualFile { + inner, + _mode: IoMode::Buffered, + } + } + #[cfg(target_os = "linux")] + IoMode::Direct => { + let inner = VirtualFileInner::open_with_options( + path, + open_options.clone().custom_flags(nix::libc::O_DIRECT), + ctx, + ) + .await?; + VirtualFile { + inner, + _mode: 
IoMode::Direct, + } + } + }; + Ok(file) + } + + pub fn path(&self) -> &Utf8Path { + self.inner.path.as_path() + } + + pub async fn crashsafe_overwrite + Send, Buf: IoBuf + Send>( + final_path: Utf8PathBuf, + tmp_path: Utf8PathBuf, + content: B, + ) -> std::io::Result<()> { + VirtualFileInner::crashsafe_overwrite(final_path, tmp_path, content).await + } + + pub async fn sync_all(&self) -> Result<(), Error> { + self.inner.sync_all().await + } + + pub async fn sync_data(&self) -> Result<(), Error> { + self.inner.sync_data().await + } + + pub async fn metadata(&self) -> Result { + self.inner.metadata().await + } + + pub fn remove(self) { + self.inner.remove(); + } + + pub async fn seek(&mut self, pos: SeekFrom) -> Result { + self.inner.seek(pos).await + } + + pub async fn read_exact_at( + &self, + slice: Slice, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> + where + Buf: IoBufMut + Send, + { + self.inner.read_exact_at(slice, offset, ctx).await + } + + pub async fn read_exact_at_page( + &self, + page: PageWriteGuard<'static>, + offset: u64, + ctx: &RequestContext, + ) -> Result, Error> { + self.inner.read_exact_at_page(page, offset, ctx).await + } + + pub async fn write_all_at( + &self, + buf: FullSlice, + offset: u64, + ctx: &RequestContext, + ) -> (FullSlice, Result<(), Error>) { + self.inner.write_all_at(buf, offset, ctx).await + } + + pub async fn write_all( + &mut self, + buf: FullSlice, + ctx: &RequestContext, + ) -> (FullSlice, Result) { + self.inner.write_all(buf, ctx).await + } +} + /// /// A virtual file descriptor. You can use this just like std::fs::File, but internally /// the underlying file is closed if the system is low on file descriptors, @@ -77,7 +244,7 @@ pub(crate) mod owned_buffers_io { /// 'tag' field is used to detect whether the handle still is valid or not. /// #[derive(Debug)] -pub struct VirtualFile { +pub struct VirtualFileInner { /// Lazy handle to the global file descriptor cache. 
The slot that this points to /// might contain our File, or it may be empty, or it may contain a File that /// belongs to a different VirtualFile. @@ -350,12 +517,12 @@ macro_rules! with_file { }}; } -impl VirtualFile { +impl VirtualFileInner { /// Open a file in read-only mode. Like File::open. pub async fn open>( path: P, ctx: &RequestContext, - ) -> Result { + ) -> Result { Self::open_with_options(path.as_ref(), OpenOptions::new().read(true), ctx).await } @@ -364,7 +531,7 @@ impl VirtualFile { pub async fn create>( path: P, ctx: &RequestContext, - ) -> Result { + ) -> Result { Self::open_with_options( path.as_ref(), OpenOptions::new().write(true).create(true).truncate(true), @@ -382,7 +549,7 @@ impl VirtualFile { path: P, open_options: &OpenOptions, _ctx: &RequestContext, /* TODO: carry a pointer to the metrics in the RequestContext instead of the parsing https://github.com/neondatabase/neon/issues/6107 */ - ) -> Result { + ) -> Result { let path_ref = path.as_ref(); let path_str = path_ref.to_string(); let parts = path_str.split('/').collect::>(); @@ -423,7 +590,7 @@ impl VirtualFile { reopen_options.create_new(false); reopen_options.truncate(false); - let vfile = VirtualFile { + let vfile = VirtualFileInner { handle: RwLock::new(handle), pos: 0, path: path_ref.to_path_buf(), @@ -1034,6 +1201,21 @@ impl tokio_epoll_uring::IoFd for FileGuard { #[cfg(test)] impl VirtualFile { + pub(crate) async fn read_blk( + &self, + blknum: u32, + ctx: &RequestContext, + ) -> Result, std::io::Error> { + self.inner.read_blk(blknum, ctx).await + } + + async fn read_to_end(&mut self, buf: &mut Vec, ctx: &RequestContext) -> Result<(), Error> { + self.inner.read_to_end(buf, ctx).await + } +} + +#[cfg(test)] +impl VirtualFileInner { pub(crate) async fn read_blk( &self, blknum: u32, @@ -1067,7 +1249,7 @@ impl VirtualFile { } } -impl Drop for VirtualFile { +impl Drop for VirtualFileInner { /// If a VirtualFile is dropped, close the underlying file if it was open. 
fn drop(&mut self) { let handle = self.handle.get_mut(); @@ -1143,15 +1325,10 @@ impl OpenFiles { /// server startup. /// #[cfg(not(test))] -pub fn init(num_slots: usize, engine: IoEngineKind, io_buffer_alignment: usize) { +pub fn init(num_slots: usize, engine: IoEngineKind) { if OPEN_FILES.set(OpenFiles::new(num_slots)).is_err() { panic!("virtual_file::init called twice"); } - if set_io_buffer_alignment(io_buffer_alignment).is_err() { - panic!( - "IO buffer alignment needs to be a power of two and greater than 512, got {io_buffer_alignment}" - ); - } io_engine::init(engine); crate::metrics::virtual_file_descriptor_cache::SIZE_MAX.set(num_slots as u64); } @@ -1175,47 +1352,20 @@ fn get_open_files() -> &'static OpenFiles { } } -static IO_BUFFER_ALIGNMENT: AtomicUsize = AtomicUsize::new(DEFAULT_IO_BUFFER_ALIGNMENT); - -/// Returns true if the alignment is a power of two and is greater or equal to 512. -fn is_valid_io_buffer_alignment(align: usize) -> bool { - align.is_power_of_two() && align >= 512 -} - -/// Sets IO buffer alignment requirement. Returns error if the alignment requirement is -/// not a power of two or less than 512 bytes. -#[allow(unused)] -pub(crate) fn set_io_buffer_alignment(align: usize) -> Result<(), usize> { - if is_valid_io_buffer_alignment(align) { - IO_BUFFER_ALIGNMENT.store(align, std::sync::atomic::Ordering::Relaxed); - Ok(()) - } else { - Err(align) - } -} - /// Gets the io buffer alignment. -/// -/// This function should be used for getting the actual alignment value to use. 
-pub(crate) fn get_io_buffer_alignment() -> usize { - let align = IO_BUFFER_ALIGNMENT.load(std::sync::atomic::Ordering::Relaxed); - - if cfg!(test) { - let env_var_name = "NEON_PAGESERVER_UNIT_TEST_IO_BUFFER_ALIGNMENT"; - if let Some(test_align) = utils::env::var(env_var_name) { - if is_valid_io_buffer_alignment(test_align) { - test_align - } else { - panic!("IO buffer alignment needs to be a power of two and greater than 512, got {test_align}"); - } - } else { - align - } - } else { - align - } +pub(crate) const fn get_io_buffer_alignment() -> usize { + DEFAULT_IO_BUFFER_ALIGNMENT } +static IO_MODE: AtomicU8 = AtomicU8::new(IoMode::preferred() as u8); + +pub(crate) fn set_io_mode(mode: IoMode) { + IO_MODE.store(mode as u8, std::sync::atomic::Ordering::Relaxed); +} + +pub(crate) fn get_io_mode() -> IoMode { + IoMode::try_from(IO_MODE.load(Ordering::Relaxed)).unwrap() +} #[cfg(test)] mod tests { use crate::context::DownloadBehavior; @@ -1524,7 +1674,7 @@ mod tests { // Open the file many times. 
let mut files = Vec::new(); for _ in 0..VIRTUAL_FILES { - let f = VirtualFile::open_with_options( + let f = VirtualFileInner::open_with_options( &test_file_path, OpenOptions::new().read(true), &ctx, @@ -1576,7 +1726,7 @@ mod tests { let path = testdir.join("myfile"); let tmp_path = testdir.join("myfile.tmp"); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) + VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); @@ -1585,7 +1735,7 @@ mod tests { assert!(!tmp_path.exists()); drop(file); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) + VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"bar".to_vec()) .await .unwrap(); let mut file = MaybeVirtualFile::from(VirtualFile::open(&path, &ctx).await.unwrap()); @@ -1608,7 +1758,7 @@ mod tests { std::fs::write(&tmp_path, "some preexisting junk that should be removed").unwrap(); assert!(tmp_path.exists()); - VirtualFile::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) + VirtualFileInner::crashsafe_overwrite(path.clone(), tmp_path.clone(), b"foo".to_vec()) .await .unwrap(); diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 9a51899d5d..e6de72752f 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -395,7 +395,7 @@ class NeonEnvBuilder: pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]] = None, safekeeper_extra_opts: Optional[list[str]] = None, storage_controller_port_override: Optional[int] = None, - pageserver_io_buffer_alignment: Optional[int] = None, + pageserver_virtual_file_io_mode: Optional[str] = None, ): self.repo_dir = repo_dir self.rust_log_override = rust_log_override @@ -449,7 +449,7 @@ class NeonEnvBuilder: self.storage_controller_port_override = 
storage_controller_port_override - self.pageserver_io_buffer_alignment = pageserver_io_buffer_alignment + self.pageserver_virtual_file_io_mode = pageserver_virtual_file_io_mode assert test_name.startswith( "test_" @@ -1038,7 +1038,7 @@ class NeonEnv: self.pageserver_virtual_file_io_engine = config.pageserver_virtual_file_io_engine self.pageserver_aux_file_policy = config.pageserver_aux_file_policy - self.pageserver_io_buffer_alignment = config.pageserver_io_buffer_alignment + self.pageserver_virtual_file_io_mode = config.pageserver_virtual_file_io_mode # Create the neon_local's `NeonLocalInitConf` cfg: dict[str, Any] = { @@ -1102,7 +1102,8 @@ class NeonEnv: for key, value in override.items(): ps_cfg[key] = value - ps_cfg["io_buffer_alignment"] = self.pageserver_io_buffer_alignment + if self.pageserver_virtual_file_io_mode is not None: + ps_cfg["virtual_file_io_mode"] = self.pageserver_virtual_file_io_mode # Create a corresponding NeonPageserver object self.pageservers.append( @@ -1407,7 +1408,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine: str, pageserver_aux_file_policy: Optional[AuxFileStore], pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], - pageserver_io_buffer_alignment: Optional[int], + pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnv]: """ Simple Neon environment, with no authentication and no safekeepers. 
@@ -1433,7 +1434,7 @@ def neon_simple_env( pageserver_virtual_file_io_engine=pageserver_virtual_file_io_engine, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, - pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, + pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: env = builder.init_start() @@ -1457,7 +1458,7 @@ def neon_env_builder( pageserver_default_tenant_config_compaction_algorithm: Optional[dict[str, Any]], pageserver_aux_file_policy: Optional[AuxFileStore], record_property: Callable[[str, object], None], - pageserver_io_buffer_alignment: Optional[int], + pageserver_virtual_file_io_mode: Optional[str], ) -> Iterator[NeonEnvBuilder]: """ Fixture to create a Neon environment for test. @@ -1492,7 +1493,7 @@ def neon_env_builder( test_overlay_dir=test_overlay_dir, pageserver_aux_file_policy=pageserver_aux_file_policy, pageserver_default_tenant_config_compaction_algorithm=pageserver_default_tenant_config_compaction_algorithm, - pageserver_io_buffer_alignment=pageserver_io_buffer_alignment, + pageserver_virtual_file_io_mode=pageserver_virtual_file_io_mode, ) as builder: yield builder # Propogate `preserve_database_files` to make it possible to use in other fixtures, diff --git a/test_runner/fixtures/parametrize.py b/test_runner/fixtures/parametrize.py index b408d83cf3..3bbac4b8ee 100644 --- a/test_runner/fixtures/parametrize.py +++ b/test_runner/fixtures/parametrize.py @@ -41,8 +41,8 @@ def pageserver_virtual_file_io_engine() -> Optional[str]: @pytest.fixture(scope="function", autouse=True) -def pageserver_io_buffer_alignment() -> Optional[int]: - return None +def pageserver_virtual_file_io_mode() -> Optional[str]: + return os.getenv("PAGESERVER_VIRTUAL_FILE_IO_MODE") @pytest.fixture(scope="function", autouse=True) From eb23d355a94cfbc8b0589f56354cc137bdc33233 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas 
Date: Wed, 9 Oct 2024 15:34:51 +0300 Subject: [PATCH 11/12] tests: Use ThreadedMotoServer python class to launch mock S3 server (#9313) This is simpler than using subprocess. One difference is in how moto's log output is now collected. Previously, moto's logs went to stderr, and were collected and printed at the end of the test by pytest, like this: 2024-10-07T22:45:12.3705222Z ----------------------------- Captured stderr call ----------------------------- 2024-10-07T22:45:12.3705577Z 127.0.0.1 - - [07/Oct/2024 22:35:14] "PUT /pageserver-test-deletion-queue-2e6efa8245ec92a37a07004569c29eb7 HTTP/1.1" 200 - 2024-10-07T22:45:12.3706181Z 127.0.0.1 - - [07/Oct/2024 22:35:15] "GET /pageserver-test-deletion-queue-2e6efa8245ec92a37a07004569c29eb7/?list-type=2&delimiter=/&prefix=/tenants/43da25eac0f41412696dd31b94dbb83c/timelines/ HTTP/1.1" 200 - 2024-10-07T22:45:12.3706894Z 127.0.0.1 - - [07/Oct/2024 22:35:16] "PUT /pageserver-test-deletion-queue-2e6efa8245ec92a37a07004569c29eb7//tenants/43da25eac0f41412696dd31b94dbb83c/timelines/eabba5f0c1c72c8656d3ef1d85b98c1d/initdb.tar.zst?x-id=PutObject HTTP/1.1" 200 - Note the timestamps: the timestamp at the beginning of the line is the time that the stderr was dumped, i.e. the end of the test, which makes those timestamps rather useless. The timestamp in the middle of the line is when the operation actually happened, but it has only 1 s granularity. 
With this change, moto's log lines are printed in the "live log call" section, as they happen, which makes the timestamps more useful: 2024-10-08 12:12:31.129 INFO [_internal.py:97] 127.0.0.1 - - [08/Oct/2024 12:12:31] "GET /pageserver-test-deletion-queue-e24e7525d437e1874d8a52030dcabb4f/?list-type=2&delimiter=/&prefix=/tenants/7b6a16b1460eda5204083fba78bc360f/timelines/ HTTP/1.1" 200 - 2024-10-08 12:12:32.612 INFO [_internal.py:97] 127.0.0.1 - - [08/Oct/2024 12:12:32] "PUT /pageserver-test-deletion-queue-e24e7525d437e1874d8a52030dcabb4f//tenants/7b6a16b1460eda5204083fba78bc360f/timelines/7ab4c2b67fa8c712cada207675139877/initdb.tar.zst?x-id=PutObject HTTP/1.1" 200 - --- poetry.lock | 29 +++++++++++++++++++------- test_runner/fixtures/remote_storage.py | 23 ++++---------------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/poetry.lock b/poetry.lock index 07f30d10e7..00fe2505c9 100644 --- a/poetry.lock +++ b/poetry.lock @@ -2095,6 +2095,7 @@ files = [ {file = "psycopg2_binary-2.9.9-cp311-cp311-win32.whl", hash = "sha256:dc4926288b2a3e9fd7b50dc6a1909a13bbdadfc67d93f3374d984e56f885579d"}, {file = "psycopg2_binary-2.9.9-cp311-cp311-win_amd64.whl", hash = "sha256:b76bedd166805480ab069612119ea636f5ab8f8771e640ae103e05a4aae3e417"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8532fd6e6e2dc57bcb3bc90b079c60de896d2128c5d9d6f24a63875a95a088cf"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:b0605eaed3eb239e87df0d5e3c6489daae3f7388d455d0c0b4df899519c6a38d"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:8f8544b092a29a6ddd72f3556a9fcf249ec412e10ad28be6a0c0d948924f2212"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:2d423c8d8a3c82d08fe8af900ad5b613ce3632a1249fd6a223941d0735fce493"}, {file = 
"psycopg2_binary-2.9.9-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:2e5afae772c00980525f6d6ecf7cbca55676296b580c0e6abb407f15f3706996"}, @@ -2103,6 +2104,8 @@ files = [ {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_i686.whl", hash = "sha256:cb16c65dcb648d0a43a2521f2f0a2300f40639f6f8c1ecbc662141e4e3e1ee07"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_ppc64le.whl", hash = "sha256:911dda9c487075abd54e644ccdf5e5c16773470a6a5d3826fda76699410066fb"}, {file = "psycopg2_binary-2.9.9-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:57fede879f08d23c85140a360c6a77709113efd1c993923c59fde17aa27599fe"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win32.whl", hash = "sha256:64cf30263844fa208851ebb13b0732ce674d8ec6a0c86a4e160495d299ba3c93"}, + {file = "psycopg2_binary-2.9.9-cp312-cp312-win_amd64.whl", hash = "sha256:81ff62668af011f9a48787564ab7eded4e9fb17a4a6a74af5ffa6a457400d2ab"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-macosx_10_9_x86_64.whl", hash = "sha256:2293b001e319ab0d869d660a704942c9e2cce19745262a8aba2115ef41a0a42a"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:03ef7df18daf2c4c07e2695e8cfd5ee7f748a1d54d802330985a78d2a5a6dca9"}, {file = "psycopg2_binary-2.9.9-cp37-cp37m-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0a602ea5aff39bb9fac6308e9c9d82b9a35c2bf288e184a816002c9fae930b77"}, @@ -2584,6 +2587,7 @@ files = [ {file = "PyYAML-6.0.1-cp311-cp311-win_amd64.whl", hash = "sha256:bf07ee2fef7014951eeb99f56f39c9bb4af143d8aa3c21b1677805985307da34"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:855fb52b0dc35af121542a76b9a84f8d1cd886ea97c84703eaa6d88e37a2ad28"}, {file = "PyYAML-6.0.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:40df9b996c2b73138957fe23a16a4f0ba614f4c0efce1e9406a184b6d07fa3a9"}, + {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:a08c6f0fe150303c1c6b71ebcd7213c2858041a7e01975da3a99aed1e7a378ef"}, {file = "PyYAML-6.0.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c22bec3fbe2524cde73d7ada88f6566758a8f7227bfbf93a408a9d86bcc12a0"}, {file = "PyYAML-6.0.1-cp312-cp312-musllinux_1_1_x86_64.whl", hash = "sha256:8d4e9c88387b0f5c7d5f281e55304de64cf7f9c0021a3525bd3b1c542da3b0e4"}, {file = "PyYAML-6.0.1-cp312-cp312-win32.whl", hash = "sha256:d483d2cdf104e7c9fa60c544d92981f12ad66a457afae824d146093b8c294c54"}, @@ -2729,21 +2733,22 @@ use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"] [[package]] name = "responses" -version = "0.21.0" +version = "0.25.3" description = "A utility library for mocking out the `requests` Python library." optional = false -python-versions = ">=3.7" +python-versions = ">=3.8" files = [ - {file = "responses-0.21.0-py3-none-any.whl", hash = "sha256:2dcc863ba63963c0c3d9ee3fa9507cbe36b7d7b0fccb4f0bdfd9e96c539b1487"}, - {file = "responses-0.21.0.tar.gz", hash = "sha256:b82502eb5f09a0289d8e209e7bad71ef3978334f56d09b444253d5ad67bf5253"}, + {file = "responses-0.25.3-py3-none-any.whl", hash = "sha256:521efcbc82081ab8daa588e08f7e8a64ce79b91c39f6e62199b19159bea7dbcb"}, + {file = "responses-0.25.3.tar.gz", hash = "sha256:617b9247abd9ae28313d57a75880422d55ec63c29d33d629697590a034358dba"}, ] [package.dependencies] -requests = ">=2.0,<3.0" -urllib3 = ">=1.25.10" +pyyaml = "*" +requests = ">=2.30.0,<3.0" +urllib3 = ">=1.25.10,<3.0" [package.extras] -tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-localserver", "types-mock", "types-requests"] +tests = ["coverage (>=6.0.0)", "flake8", "mypy", "pytest (>=7.0.0)", "pytest-asyncio", "pytest-cov", "pytest-httpserver", "tomli", "tomli-w", "types-PyYAML", "types-requests"] [[package]] name = "rfc3339-validator" @@ -3137,6 +3142,16 @@ files = [ {file = "wrapt-1.14.1-cp310-cp310-musllinux_1_1_x86_64.whl", hash = 
"sha256:8ad85f7f4e20964db4daadcab70b47ab05c7c1cf2a7c1e51087bfaa83831854c"}, {file = "wrapt-1.14.1-cp310-cp310-win32.whl", hash = "sha256:a9a52172be0b5aae932bef82a79ec0a0ce87288c7d132946d645eba03f0ad8a8"}, {file = "wrapt-1.14.1-cp310-cp310-win_amd64.whl", hash = "sha256:6d323e1554b3d22cfc03cd3243b5bb815a51f5249fdcbb86fda4bf62bab9e164"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:ecee4132c6cd2ce5308e21672015ddfed1ff975ad0ac8d27168ea82e71413f55"}, + {file = "wrapt-1.14.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:2020f391008ef874c6d9e208b24f28e31bcb85ccff4f335f15a3251d222b92d9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:2feecf86e1f7a86517cab34ae6c2f081fd2d0dac860cb0c0ded96d799d20b335"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:240b1686f38ae665d1b15475966fe0472f78e71b1b4903c143a842659c8e4cb9"}, + {file = "wrapt-1.14.1-cp311-cp311-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:a9008dad07d71f68487c91e96579c8567c98ca4c3881b9b113bc7b33e9fd78b8"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_aarch64.whl", hash = "sha256:6447e9f3ba72f8e2b985a1da758767698efa72723d5b59accefd716e9e8272bf"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_i686.whl", hash = "sha256:acae32e13a4153809db37405f5eba5bac5fbe2e2ba61ab227926a22901051c0a"}, + {file = "wrapt-1.14.1-cp311-cp311-musllinux_1_1_x86_64.whl", hash = "sha256:49ef582b7a1152ae2766557f0550a9fcbf7bbd76f43fbdc94dd3bf07cc7168be"}, + {file = "wrapt-1.14.1-cp311-cp311-win32.whl", hash = "sha256:358fe87cc899c6bb0ddc185bf3dbfa4ba646f05b1b0b9b5a27c2cb92c2cea204"}, + {file = "wrapt-1.14.1-cp311-cp311-win_amd64.whl", hash = "sha256:26046cd03936ae745a502abf44dac702a5e6880b2b01c29aea8ddf3353b68224"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_i686.whl", hash = 
"sha256:43ca3bbbe97af00f49efb06e352eae40434ca9d915906f77def219b88e85d907"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux1_x86_64.whl", hash = "sha256:6b1a564e6cb69922c7fe3a678b9f9a3c54e72b469875aa8018f18b4d1dd1adf3"}, {file = "wrapt-1.14.1-cp35-cp35m-manylinux2010_i686.whl", hash = "sha256:00b6d4ea20a906c0ca56d84f93065b398ab74b927a7a3dbd470f6fc503f95dc3"}, diff --git a/test_runner/fixtures/remote_storage.py b/test_runner/fixtures/remote_storage.py index a527b810df..20e6bd9318 100644 --- a/test_runner/fixtures/remote_storage.py +++ b/test_runner/fixtures/remote_storage.py @@ -5,13 +5,13 @@ import hashlib import json import os import re -import subprocess from dataclasses import dataclass from pathlib import Path from typing import TYPE_CHECKING, Union import boto3 import toml +from moto.server import ThreadedMotoServer from mypy_boto3_s3 import S3Client from fixtures.common_types import TenantId, TenantShardId, TimelineId @@ -43,7 +43,6 @@ class RemoteStorageUser(str, enum.Enum): class MockS3Server: """ Starts a mock S3 server for testing on a port given, errors if the server fails to start or exits prematurely. - Relies that `poetry` and `moto` server are installed, since it's the way the tests are run. Also provides a set of methods to derive the connection properties from and the method to kill the underlying server. """ @@ -53,22 +52,8 @@ class MockS3Server: port: int, ): self.port = port - - # XXX: do not use `shell=True` or add `exec ` to the command here otherwise. - # We use `self.subprocess.kill()` to shut down the server, which would not "just" work in Linux - # if a process is started from the shell process. - self.subprocess = subprocess.Popen(["poetry", "run", "moto_server", f"-p{port}"]) - error = None - try: - return_code = self.subprocess.poll() - if return_code is not None: - error = f"expected mock s3 server to run but it exited with code {return_code}. 
stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" - except Exception as e: - error = f"expected mock s3 server to start but it failed with exception: {e}. stdout: '{self.subprocess.stdout}', stderr: '{self.subprocess.stderr}'" - if error is not None: - log.error(error) - self.kill() - raise RuntimeError("failed to start s3 mock server") + self.server = ThreadedMotoServer(port=port) + self.server.start() def endpoint(self) -> str: return f"http://127.0.0.1:{self.port}" @@ -83,7 +68,7 @@ class MockS3Server: return "test" def kill(self): - self.subprocess.kill() + self.server.stop() @dataclass From 72ef0e0fa173e60916ceabd4a2d8d4fe1b03042c Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 9 Oct 2024 15:51:34 +0300 Subject: [PATCH 12/12] tests: Remove redundant log lines when stopping storage nodes (#9317) The neon_cli functions print the command that gets executed, which contains the same information. Before: 2024-10-07 22:32:28.884 INFO [neon_fixtures.py:3927] Stopping safekeeper 1 2024-10-07 22:32:28.884 INFO [neon_cli.py:73] Running command "/tmp/neon/bin/neon_local safekeeper stop 1" 2024-10-07 22:32:28.989 INFO [neon_fixtures.py:3927] Stopping safekeeper 2 2024-10-07 22:32:28.989 INFO [neon_cli.py:73] Running command "/tmp/neon/bin/neon_local safekeeper stop 2" 2024-10-07 22:32:29.93 INFO [neon_fixtures.py:3927] Stopping safekeeper 3 2024-10-07 22:32:29.94 INFO [neon_cli.py:73] Running command "/tmp/neon/bin/neon_local safekeeper stop 3" 2024-10-07 22:32:29.251 INFO [neon_cli.py:450] Stopping pageserver with ['pageserver', 'stop', '--id=1'] 2024-10-07 22:32:29.251 INFO [neon_cli.py:73] Running command "/tmp/neon/bin/neon_local pageserver stop --id=1" After: 2024-10-07 22:32:28.884 INFO [neon_cli.py:73] Running command "/tmp/neon/bin/neon_local safekeeper stop 1" 2024-10-07 22:32:28.989 INFO [neon_cli.py:73] Running command "/tmp/neon/bin/neon_local safekeeper stop 2" 2024-10-07 22:32:29.94 INFO [neon_cli.py:73] Running command 
"/tmp/neon/bin/neon_local safekeeper stop 3" 2024-10-07 22:32:29.251 INFO [neon_cli.py:73] Running command "/tmp/neon/bin/neon_local pageserver stop --id=1" --- test_runner/fixtures/neon_cli.py | 1 - test_runner/fixtures/neon_fixtures.py | 1 - 2 files changed, 2 deletions(-) diff --git a/test_runner/fixtures/neon_cli.py b/test_runner/fixtures/neon_cli.py index 2d499fd982..0d3dcd1671 100644 --- a/test_runner/fixtures/neon_cli.py +++ b/test_runner/fixtures/neon_cli.py @@ -446,7 +446,6 @@ class NeonLocalCli(AbstractNeonCli): if immediate: cmd.extend(["-m", "immediate"]) - log.info(f"Stopping pageserver with {cmd}") return self.raw_cli(cmd) def safekeeper_start( diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index e6de72752f..5cb9821476 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3915,7 +3915,6 @@ class Safekeeper(LogUtils): return self def stop(self, immediate: bool = False) -> Safekeeper: - log.info(f"Stopping safekeeper {self.id}") self.env.neon_cli.safekeeper_stop(self.id, immediate) self.running = False return self