From dc6a3828731b787f71cf172e5276ed341c6489c8 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 6 Jun 2023 15:16:54 +0400
Subject: [PATCH 01/14] Increase timeouts on compute -> sk connections.

context: https://github.com/neondatabase/neon/issues/4414

And improve messages/comments here and there.
---
 pgxn/neon/walproposer.c | 17 ++++++++++++-----
 1 file changed, 12 insertions(+), 5 deletions(-)

diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index a99be40955..64d980d2e4 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -254,20 +254,20 @@ nwp_register_gucs(void)
 
 	DefineCustomIntVariable(
 							"neon.safekeeper_reconnect_timeout",
-							"Timeout for reconnecting to offline wal acceptor.",
+							"Walproposer reconnects to offline safekeepers once in this interval.",
 							NULL,
 							&wal_acceptor_reconnect_timeout,
-							1000, 0, INT_MAX,	/* default, min, max */
+							5000, 0, INT_MAX,	/* default, min, max */
 							PGC_SIGHUP, /* context */
 							GUC_UNIT_MS,	/* flags */
 							NULL, NULL, NULL);
 
 	DefineCustomIntVariable(
 							"neon.safekeeper_connect_timeout",
-							"Timeout for connection establishement and it's maintenance against safekeeper",
+							"Connection or connection attempt to safekeeper is terminated if no message is received (or connection attempt doesn't finish) within this period.",
 							NULL,
 							&wal_acceptor_connection_timeout,
-							5000, 0, INT_MAX,
+							10000, 0, INT_MAX,
 							PGC_SIGHUP,
 							GUC_UNIT_MS,
 							NULL, NULL, NULL);
@@ -441,7 +441,7 @@ WalProposerPoll(void)
 				if (TimestampDifferenceExceeds(sk->latestMsgReceivedAt, now,
 											   wal_acceptor_connection_timeout))
 				{
-					elog(WARNING, "failed to connect to node '%s:%s' in '%s' state: exceeded connection timeout %dms",
+					elog(WARNING, "terminating connection to safekeeper '%s:%s' in '%s' state: no messages received during the last %dms or connection attempt took longer than that",
 						 sk->host, sk->port, FormatSafekeeperState(sk->state), wal_acceptor_connection_timeout);
 					ShutdownConnection(sk);
 				}
@@ -1035,9 +1035,16 @@ RecvAcceptorGreeting(Safekeeper *sk)
 	if (!AsyncReadMessage(sk, (AcceptorProposerMessage *) & sk->greetResponse))
 		return;
 
+	elog(LOG, "received AcceptorGreeting from safekeeper %s:%s", sk->host, sk->port);
+
 	/* Protocol is all good, move to voting. */
 	sk->state = SS_VOTING;
 
+	/* 
+	 * Note: it would be better to track the counter on per safekeeper basis,
+	 * but at worst walproposer would restart with 'term rejected', so leave as
+	 * is for now.
+	 */
 	++n_connected;
 	if (n_connected <= quorum)
 	{

From c058e1cec2df70505263b19ad4e1f337d9643285 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 6 Jun 2023 15:27:17 +0400
Subject: [PATCH 02/14] Quick exit in truncate_wal if nothing to do.

ref https://github.com/neondatabase/neon/issues/4414
---
 safekeeper/src/wal_storage.rs | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/safekeeper/src/wal_storage.rs b/safekeeper/src/wal_storage.rs
index 1b82bd754e..644c956fc1 100644
--- a/safekeeper/src/wal_storage.rs
+++ b/safekeeper/src/wal_storage.rs
@@ -379,6 +379,12 @@ impl Storage for PhysicalStorage {
             );
         }
 
+        // Quick exit if nothing to do to avoid writing up to 16 MiB of zeros on
+        // disk (this happens on each connect).
+        if end_pos == self.write_lsn {
+            return Ok(());
+        }
+
         // Close previously opened file, if any
         if let Some(mut unflushed_file) = self.file.take() {
             self.fdatasync_file(&mut unflushed_file)?;

From 6b3c020cd90aab8e1e8c015ece39ebf8b6898da2 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Tue, 6 Jun 2023 15:30:26 +0400
Subject: [PATCH 03/14] Don't warn on system id = 0 in walproposer greeting.

sync-safekeepers doesn't know it and sends 0.
---
 safekeeper/src/safekeeper.rs | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index 33da0c8e5a..eb434136d4 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -634,7 +634,8 @@ where
         }
 
         // system_id will be updated on mismatch
-        if self.state.server.system_id != msg.system_id {
+        // sync-safekeepers doesn't know sysid and sends 0, ignore it
+        if self.state.server.system_id != msg.system_id && msg.system_id != 0 {
             if self.state.server.system_id != 0 {
                 warn!(
                     "unexpected system ID arrived, got {}, expected {}",

From 88f0cfc5755cd226c7cfd40440cb03130a28b432 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim2404@users.noreply.github.com>
Date: Wed, 7 Jun 2023 11:41:53 +0200
Subject: [PATCH 04/14] Fix `pgx_ulid` extension (#4431)

The issue was in the wrong `control` file name
---
 Dockerfile.compute-node | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index f8429e72b8..44e13a6c73 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -531,7 +531,7 @@ RUN wget https://github.com/pksunkara/pgx_ulid/archive/refs/tags/v0.1.0.tar.gz -
     mkdir pgx_ulid-src && cd pgx_ulid-src && tar xvzf ../pgx_ulid.tar.gz --strip-components=1 -C . && \
     sed -i 's/pgx        = "=0.7.3"/pgx = { version = "0.7.3", features = [ "unsafe-postgres" ] }/g' Cargo.toml && \
     cargo pgx install --release && \
-    echo "trusted = true" >> /usr/local/pgsql/share/extension/pgx_ulid.control
+    echo "trusted = true" >> /usr/local/pgsql/share/extension/ulid.control
 
 #########################################################################################
 #

From 5761190e0d61618ab805dd3b6dff3ef7fb768aff Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Wed, 7 Jun 2023 14:29:23 +0300
Subject: [PATCH 05/14] feat: three phased startup order (#4399)

Initial logical size calculation could still hinder our fast startup
efforts in #4397. See #4183. In deployment of 2023-06-06
about a 200 initial logical sizes were calculated on hosts which
took the longest to complete initial load (12s).

Implements the three step/tier initialization ordering described in
#4397:
1. load local tenants
2. do initial logical sizes per walreceivers for 10s
3. background tasks

Ordering is controlled by:
- waiting on `utils::completion::Barrier`s on background tasks
- having one attempt for each Timeline to do initial logical size
calculation
- `pageserver/src/bin/pageserver.rs` releasing background jobs after
timeout or completion of initial logical size calculation

The timeout is there just to safeguard in case a legitimate non-broken
timeline initial logical size calculation goes long. The timeout is
configurable, by default 10s, which I think would be fine for production
systems. In the test cases I've been looking at, it seems that these
steps are completed as fast as possible.

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/bin/pageserver.rs              | 121 ++++++++++++++++--
 pageserver/src/config.rs                      |  34 ++++-
 pageserver/src/disk_usage_eviction_task.rs    |  21 ++-
 pageserver/src/lib.rs                         |  23 ++++
 pageserver/src/tenant.rs                      |  58 +++++----
 pageserver/src/tenant/mgr.rs                  |  13 +-
 pageserver/src/tenant/tasks.rs                |  34 +++--
 pageserver/src/tenant/timeline.rs             |  46 ++++++-
 .../src/tenant/timeline/eviction_task.rs      |  16 ++-
 .../regress/test_disk_usage_eviction.py       |   6 +
 10 files changed, 295 insertions(+), 77 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index e0731ba79b..1fa5e4ab3b 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -337,33 +337,114 @@ fn start_pageserver(
 
     // Startup staging or optimizing:
     //
-    // (init_done_tx, init_done_rx) are used to control when do background loops start. This is to
-    // avoid starving out the BACKGROUND_RUNTIME async worker threads doing heavy work, like
-    // initial repartitioning while we still have Loading tenants.
+    // We want to minimize downtime for `page_service` connections, and trying not to overload
+    // BACKGROUND_RUNTIME by doing initial compactions and initial logical sizes at the same time.
     //
-    // init_done_rx is a barrier which stops waiting once all init_done_tx clones are dropped.
+    // init_done_rx will notify when all initial load operations have completed.
+    //
+    // background_jobs_can_start (same name used to hold off background jobs from starting at
+    // consumer side) will be dropped once we can start the background jobs. Currently it is behind
+    // completing all initial logical size calculations (init_logical_size_done_rx) and a timeout
+    // (background_task_maximum_delay).
     let (init_done_tx, init_done_rx) = utils::completion::channel();
 
+    let (init_logical_size_done_tx, init_logical_size_done_rx) = utils::completion::channel();
+
+    let (background_jobs_can_start, background_jobs_barrier) = utils::completion::channel();
+
+    let order = pageserver::InitializationOrder {
+        initial_tenant_load: Some(init_done_tx),
+        initial_logical_size_can_start: init_done_rx.clone(),
+        initial_logical_size_attempt: init_logical_size_done_tx,
+        background_jobs_can_start: background_jobs_barrier.clone(),
+    };
+
     // Scan the local 'tenants/' directory and start loading the tenants
     let init_started_at = std::time::Instant::now();
+    let shutdown_pageserver = tokio_util::sync::CancellationToken::new();
+
     BACKGROUND_RUNTIME.block_on(mgr::init_tenant_mgr(
         conf,
         broker_client.clone(),
         remote_storage.clone(),
-        (init_done_tx, init_done_rx.clone()),
+        order,
     ))?;
 
     BACKGROUND_RUNTIME.spawn({
-        let init_done_rx = init_done_rx.clone();
-        async move {
-            init_done_rx.wait().await;
+        let init_done_rx = init_done_rx;
+        let shutdown_pageserver = shutdown_pageserver.clone();
+        let drive_init = async move {
+            // NOTE: unlike many futures in pageserver, this one is cancellation-safe
+            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial load completed"));
 
-            let elapsed = init_started_at.elapsed();
+            init_done_rx.wait().await;
+            // initial logical sizes can now start, as they were waiting on init_done_rx.
+
+            scopeguard::ScopeGuard::into_inner(guard);
+
+            let init_done = std::time::Instant::now();
+            let elapsed = init_done - init_started_at;
 
             tracing::info!(
                 elapsed_millis = elapsed.as_millis(),
-                "Initial load completed."
+                "Initial load completed"
             );
+
+            let mut init_sizes_done = std::pin::pin!(init_logical_size_done_rx.wait());
+
+            let timeout = conf.background_task_maximum_delay;
+
+            let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
+
+            let init_sizes_done = tokio::select! {
+                _ = &mut init_sizes_done => {
+                    let now = std::time::Instant::now();
+                    tracing::info!(
+                        from_init_done_millis = (now - init_done).as_millis(),
+                        from_init_millis = (now - init_started_at).as_millis(),
+                        "Initial logical sizes completed"
+                    );
+                    None
+                }
+                _ = tokio::time::sleep(timeout) => {
+                    tracing::info!(
+                        timeout_millis = timeout.as_millis(),
+                        "Initial logical size timeout elapsed; starting background jobs"
+                    );
+                    Some(init_sizes_done)
+                }
+            };
+
+            scopeguard::ScopeGuard::into_inner(guard);
+
+            // allow background jobs to start
+            drop(background_jobs_can_start);
+
+            if let Some(init_sizes_done) = init_sizes_done {
+                // ending up here is not a bug; at the latest logical sizes will be queried by
+                // consumption metrics.
+                let guard = scopeguard::guard_on_success((), |_| tracing::info!("Cancelled before initial logical sizes completed"));
+                init_sizes_done.await;
+
+                scopeguard::ScopeGuard::into_inner(guard);
+
+                let now = std::time::Instant::now();
+                tracing::info!(
+                    from_init_done_millis = (now - init_done).as_millis(),
+                    from_init_millis = (now - init_started_at).as_millis(),
+                    "Initial logical sizes completed after timeout (background jobs already started)"
+                );
+
+            }
+        };
+
+        async move {
+            let mut drive_init = std::pin::pin!(drive_init);
+            // just race these tasks
+            tokio::select! {
+                _ = shutdown_pageserver.cancelled() => {},
+                _ = &mut drive_init => {},
+            }
         }
     });
 
@@ -378,7 +459,7 @@ fn start_pageserver(
             conf,
             remote_storage.clone(),
             disk_usage_eviction_state.clone(),
-            init_done_rx.clone(),
+            background_jobs_barrier.clone(),
         )?;
     }
 
@@ -416,7 +497,7 @@ fn start_pageserver(
         );
 
         if let Some(metric_collection_endpoint) = &conf.metric_collection_endpoint {
-            let init_done_rx = init_done_rx;
+            let background_jobs_barrier = background_jobs_barrier;
             let metrics_ctx = RequestContext::todo_child(
                 TaskKind::MetricsCollection,
                 // This task itself shouldn't download anything.
@@ -432,12 +513,17 @@ fn start_pageserver(
                 "consumption metrics collection",
                 true,
                 async move {
-                    // first wait for initial load to complete before first iteration.
+                    // first wait until background jobs are cleared to launch.
                     //
                     // this is because we only process active tenants and timelines, and the
                     // Timeline::get_current_logical_size will spawn the logical size calculation,
                     // which will not be rate-limited.
-                    init_done_rx.wait().await;
+                    let cancel = task_mgr::shutdown_token();
+
+                    tokio::select! {
+                        _ = cancel.cancelled() => { return Ok(()); },
+                        _ = background_jobs_barrier.wait() => {}
+                    };
 
                     pageserver::consumption_metrics::collect_metrics(
                         metric_collection_endpoint,
@@ -487,6 +573,8 @@ fn start_pageserver(
         );
     }
 
+    let mut shutdown_pageserver = Some(shutdown_pageserver.drop_guard());
+
     // All started up! Now just sit and wait for shutdown signal.
     ShutdownSignals::handle(|signal| match signal {
         Signal::Quit => {
@@ -502,6 +590,11 @@ fn start_pageserver(
                 "Got {}. Terminating gracefully in fast shutdown mode",
                 signal.name()
             );
+
+            // This cancels the `shutdown_pageserver` cancellation tree.
+            // Right now that tree doesn't reach very far, and `task_mgr` is used instead.
+            // The plan is to change that over time.
+            shutdown_pageserver.take();
             BACKGROUND_RUNTIME.block_on(pageserver::shutdown_pageserver(0));
             unreachable!()
         }
diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index 02763c9b7d..17e6e3fb2a 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -63,6 +63,7 @@ pub mod defaults {
     pub const DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL: &str = "1 hour";
     pub const DEFAULT_METRIC_COLLECTION_ENDPOINT: Option<reqwest::Url> = None;
     pub const DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL: &str = "10 min";
+    pub const DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY: &str = "10s";
 
     ///
     /// Default built-in configuration file.
@@ -91,9 +92,10 @@ pub mod defaults {
 #cached_metric_collection_interval = '{DEFAULT_CACHED_METRIC_COLLECTION_INTERVAL}'
 #synthetic_size_calculation_interval = '{DEFAULT_SYNTHETIC_SIZE_CALCULATION_INTERVAL}'
 
-
 #disk_usage_based_eviction = {{ max_usage_pct = .., min_avail_bytes = .., period = "10s"}}
 
+#background_task_maximum_delay = '{DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY}'
+
 # [tenant_config]
 #checkpoint_distance = {DEFAULT_CHECKPOINT_DISTANCE} # in bytes
 #checkpoint_timeout = {DEFAULT_CHECKPOINT_TIMEOUT}
@@ -187,6 +189,15 @@ pub struct PageServerConf {
     pub test_remote_failures: u64,
 
     pub ondemand_download_behavior_treat_error_as_warn: bool,
+
+    /// How long will background tasks be delayed at most after initial load of tenants.
+    ///
+    /// Our largest initialization completions are in the range of 100-200s, so perhaps 10s works
+    /// as we now isolate initial loading, initial logical size calculation and background tasks.
+    /// Smaller nodes will have background tasks "not running" for this long unless every timeline
+    /// has it's initial logical size calculated. Not running background tasks for some seconds is
+    /// not terrible.
+    pub background_task_maximum_delay: Duration,
 }
 
 /// We do not want to store this in a PageServerConf because the latter may be logged
@@ -259,6 +270,8 @@ struct PageServerConfigBuilder {
     test_remote_failures: BuilderValue<u64>,
 
     ondemand_download_behavior_treat_error_as_warn: BuilderValue<bool>,
+
+    background_task_maximum_delay: BuilderValue<Duration>,
 }
 
 impl Default for PageServerConfigBuilder {
@@ -316,6 +329,11 @@ impl Default for PageServerConfigBuilder {
             test_remote_failures: Set(0),
 
             ondemand_download_behavior_treat_error_as_warn: Set(false),
+
+            background_task_maximum_delay: Set(humantime::parse_duration(
+                DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY,
+            )
+            .unwrap()),
         }
     }
 }
@@ -440,6 +458,10 @@ impl PageServerConfigBuilder {
             BuilderValue::Set(ondemand_download_behavior_treat_error_as_warn);
     }
 
+    pub fn background_task_maximum_delay(&mut self, delay: Duration) {
+        self.background_task_maximum_delay = BuilderValue::Set(delay);
+    }
+
     pub fn build(self) -> anyhow::Result<PageServerConf> {
         let concurrent_tenant_size_logical_size_queries = self
             .concurrent_tenant_size_logical_size_queries
@@ -522,6 +544,9 @@ impl PageServerConfigBuilder {
                 .ok_or(anyhow!(
                     "missing ondemand_download_behavior_treat_error_as_warn"
                 ))?,
+            background_task_maximum_delay: self
+                .background_task_maximum_delay
+                .ok_or(anyhow!("missing background_task_maximum_delay"))?,
         })
     }
 }
@@ -710,6 +735,7 @@ impl PageServerConf {
                     )
                 },
                 "ondemand_download_behavior_treat_error_as_warn" => builder.ondemand_download_behavior_treat_error_as_warn(parse_toml_bool(key, item)?),
+                "background_task_maximum_delay" => builder.background_task_maximum_delay(parse_toml_duration(key, item)?),
                 _ => bail!("unrecognized pageserver option '{key}'"),
             }
         }
@@ -877,6 +903,7 @@ impl PageServerConf {
             disk_usage_based_eviction: None,
             test_remote_failures: 0,
             ondemand_download_behavior_treat_error_as_warn: false,
+            background_task_maximum_delay: Duration::ZERO,
         }
     }
 }
@@ -1036,6 +1063,7 @@ metric_collection_endpoint = 'http://localhost:80/metrics'
 synthetic_size_calculation_interval = '333 s'
 
 log_format = 'json'
+background_task_maximum_delay = '334 s'
 
 "#;
 
@@ -1094,6 +1122,9 @@ log_format = 'json'
                 disk_usage_based_eviction: None,
                 test_remote_failures: 0,
                 ondemand_download_behavior_treat_error_as_warn: false,
+                background_task_maximum_delay: humantime::parse_duration(
+                    defaults::DEFAULT_BACKGROUND_TASK_MAXIMUM_DELAY
+                )?,
             },
             "Correct defaults should be used when no config values are provided"
         );
@@ -1148,6 +1179,7 @@ log_format = 'json'
                 disk_usage_based_eviction: None,
                 test_remote_failures: 0,
                 ondemand_download_behavior_treat_error_as_warn: false,
+                background_task_maximum_delay: Duration::from_secs(334),
             },
             "Should be able to parse all basic config values correctly"
         );
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 1a8886935c..ce5f81c44b 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -83,7 +83,7 @@ pub fn launch_disk_usage_global_eviction_task(
     conf: &'static PageServerConf,
     storage: GenericRemoteStorage,
     state: Arc<State>,
-    init_done: completion::Barrier,
+    background_jobs_barrier: completion::Barrier,
 ) -> anyhow::Result<()> {
     let Some(task_config) = &conf.disk_usage_based_eviction else {
         info!("disk usage based eviction task not configured");
@@ -100,17 +100,16 @@ pub fn launch_disk_usage_global_eviction_task(
         "disk usage based eviction",
         false,
         async move {
-            // wait until initial load is complete, because we cannot evict from loading tenants.
-            init_done.wait().await;
+            let cancel = task_mgr::shutdown_token();
 
-            disk_usage_eviction_task(
-                &state,
-                task_config,
-                storage,
-                &conf.tenants_path(),
-                task_mgr::shutdown_token(),
-            )
-            .await;
+            // wait until initial load is complete, because we cannot evict from loading tenants.
+            tokio::select! {
+                _ = cancel.cancelled() => { return Ok(()); },
+                _ = background_jobs_barrier.wait() => { }
+            };
+
+            disk_usage_eviction_task(&state, task_config, storage, &conf.tenants_path(), cancel)
+                .await;
             info!("disk usage based eviction task finishing");
             Ok(())
         },
diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs
index 40a672bee3..5831091098 100644
--- a/pageserver/src/lib.rs
+++ b/pageserver/src/lib.rs
@@ -132,6 +132,29 @@ pub fn is_uninit_mark(path: &Path) -> bool {
     }
 }
 
+/// During pageserver startup, we need to order operations not to exhaust tokio worker threads by
+/// blocking.
+///
+/// The instances of this value exist only during startup, otherwise `None` is provided, meaning no
+/// delaying is needed.
+#[derive(Clone)]
+pub struct InitializationOrder {
+    /// Each initial tenant load task carries this until completion.
+    pub initial_tenant_load: Option<utils::completion::Completion>,
+
+    /// Barrier for when we can start initial logical size calculations.
+    pub initial_logical_size_can_start: utils::completion::Barrier,
+
+    /// Each timeline owns a clone of this to be consumed on the initial logical size calculation
+    /// attempt. It is important to drop this once the attempt has completed.
+    pub initial_logical_size_attempt: utils::completion::Completion,
+
+    /// Barrier for when we can start any background jobs.
+    ///
+    /// This can be broken up later on, but right now there is just one class of a background job.
+    pub background_jobs_can_start: utils::completion::Barrier,
+}
+
 #[cfg(test)]
 mod backoff_defaults_tests {
     use super::*;
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 7ce0ed81bc..29086cae86 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -65,6 +65,7 @@ use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
+use crate::InitializationOrder;
 
 use crate::virtual_file::VirtualFile;
 use crate::walredo::PostgresRedoManager;
@@ -510,6 +511,7 @@ impl Tenant {
         local_metadata: Option<TimelineMetadata>,
         ancestor: Option<Arc<Timeline>>,
         first_save: bool,
+        init_order: Option<&InitializationOrder>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         let tenant_id = self.tenant_id;
@@ -535,6 +537,7 @@ impl Tenant {
                 up_to_date_metadata,
                 ancestor.clone(),
                 remote_client,
+                init_order,
             )?;
 
             let timeline = UninitializedTimeline {
@@ -560,6 +563,7 @@ impl Tenant {
                             up_to_date_metadata,
                             ancestor.clone(),
                             None,
+                            None,
                         )
                         .with_context(|| {
                             format!("creating broken timeline data for {tenant_id}/{timeline_id}")
@@ -858,6 +862,7 @@ impl Tenant {
             local_metadata,
             ancestor,
             true,
+            None,
             ctx,
         )
         .await
@@ -892,16 +897,13 @@ impl Tenant {
     ///
     /// If the loading fails for some reason, the Tenant will go into Broken
     /// state.
-    ///
-    /// `init_done` is an optional channel used during initial load to delay background task
-    /// start. It is not used later.
     #[instrument(skip_all, fields(tenant_id=%tenant_id))]
     pub fn spawn_load(
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         broker_client: storage_broker::BrokerClientChannel,
         remote_storage: Option<GenericRemoteStorage>,
-        init_done: Option<(completion::Completion, completion::Barrier)>,
+        init_order: Option<InitializationOrder>,
         ctx: &RequestContext,
     ) -> Arc<Tenant> {
         debug_assert_current_span_has_tenant_id();
@@ -937,17 +939,17 @@ impl Tenant {
             "initial tenant load",
             false,
             async move {
-                // keep the sender alive as long as we have the initial load ongoing; it will be
-                // None for loads spawned after init_tenant_mgr.
-                let (_tx, rx) = if let Some((tx, rx)) = init_done {
-                    (Some(tx), Some(rx))
-                } else {
-                    (None, None)
-                };
-                match tenant_clone.load(&ctx).await {
+                let mut init_order = init_order;
+
+                // take the completion because initial tenant loading will complete when all of
+                // these tasks complete.
+                let _completion = init_order.as_mut().and_then(|x| x.initial_tenant_load.take());
+
+                match tenant_clone.load(init_order.as_ref(), &ctx).await {
                     Ok(()) => {
                         debug!("load finished, activating");
-                        tenant_clone.activate(broker_client, rx.as_ref(), &ctx);
+                        let background_jobs_can_start = init_order.as_ref().map(|x| &x.background_jobs_can_start);
+                        tenant_clone.activate(broker_client, background_jobs_can_start, &ctx);
                     }
                     Err(err) => {
                         error!("load failed, setting tenant state to Broken: {err:?}");
@@ -974,7 +976,11 @@ impl Tenant {
     /// files on disk. Used at pageserver startup.
     ///
     /// No background tasks are started as part of this routine.
-    async fn load(self: &Arc<Tenant>, ctx: &RequestContext) -> anyhow::Result<()> {
+    async fn load(
+        self: &Arc<Tenant>,
+        init_order: Option<&InitializationOrder>,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_id();
 
         debug!("loading tenant task");
@@ -1094,7 +1100,7 @@ impl Tenant {
         //    1. "Timeline has no ancestor and no layer files"
 
         for (timeline_id, local_metadata) in sorted_timelines {
-            self.load_local_timeline(timeline_id, local_metadata, ctx)
+            self.load_local_timeline(timeline_id, local_metadata, init_order, ctx)
                 .await
                 .with_context(|| format!("load local timeline {timeline_id}"))?;
         }
@@ -1112,6 +1118,7 @@ impl Tenant {
         &self,
         timeline_id: TimelineId,
         local_metadata: TimelineMetadata,
+        init_order: Option<&InitializationOrder>,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
         debug_assert_current_span_has_tenant_id();
@@ -1181,6 +1188,7 @@ impl Tenant {
             Some(local_metadata),
             ancestor,
             false,
+            init_order,
             ctx,
         )
         .await
@@ -1724,12 +1732,12 @@ impl Tenant {
 
     /// Changes tenant status to active, unless shutdown was already requested.
     ///
-    /// `init_done` is an optional channel used during initial load to delay background task
-    /// start. It is not used later.
+    /// `background_jobs_can_start` is an optional barrier set to a value during pageserver startup
+    /// to delay background jobs. Background jobs can be started right away when None is given.
     fn activate(
         self: &Arc<Self>,
         broker_client: BrokerClientChannel,
-        init_done: Option<&completion::Barrier>,
+        background_jobs_can_start: Option<&completion::Barrier>,
         ctx: &RequestContext,
     ) {
         debug_assert_current_span_has_tenant_id();
@@ -1762,12 +1770,12 @@ impl Tenant {
 
             // Spawn gc and compaction loops. The loops will shut themselves
             // down when they notice that the tenant is inactive.
-            tasks::start_background_loops(self, init_done);
+            tasks::start_background_loops(self, background_jobs_can_start);
 
             let mut activated_timelines = 0;
 
             for timeline in not_broken_timelines {
-                timeline.activate(broker_client.clone(), init_done, ctx);
+                timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
                 activated_timelines += 1;
             }
 
@@ -2158,6 +2166,7 @@ impl Tenant {
         new_metadata: &TimelineMetadata,
         ancestor: Option<Arc<Timeline>>,
         remote_client: Option<RemoteTimelineClient>,
+        init_order: Option<&InitializationOrder>,
     ) -> anyhow::Result<Arc<Timeline>> {
         if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() {
             anyhow::ensure!(
@@ -2166,6 +2175,9 @@ impl Tenant {
             )
         }
 
+        let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start);
+        let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt);
+
         let pg_version = new_metadata.pg_version();
         Ok(Timeline::new(
             self.conf,
@@ -2177,6 +2189,8 @@ impl Tenant {
             Arc::clone(&self.walredo_mgr),
             remote_client,
             pg_version,
+            initial_logical_size_can_start.cloned(),
+            initial_logical_size_attempt.cloned(),
         ))
     }
 
@@ -2852,7 +2866,7 @@ impl Tenant {
         remote_client: Option<RemoteTimelineClient>,
     ) -> anyhow::Result<Arc<Timeline>> {
         let timeline_data = self
-            .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client)
+            .create_timeline_data(new_timeline_id, new_metadata, ancestor, remote_client, None)
             .context("Failed to create timeline data structure")?;
         crashsafe::create_dir_all(timeline_path).context("Failed to create timeline directory")?;
 
@@ -3420,7 +3434,7 @@ pub mod harness {
                 timelines_to_load.insert(timeline_id, timeline_metadata);
             }
             tenant
-                .load(ctx)
+                .load(None, ctx)
                 .instrument(info_span!("try_load", tenant_id=%self.tenant_id))
                 .await?;
             tenant.state.send_replace(TenantState::Active);
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 740f9621b6..a1638e4a95 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -21,9 +21,8 @@ use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind};
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::{create_tenant_files, CreateTenantFilesMode, Tenant, TenantState};
-use crate::IGNORED_TENANT_FILE_NAME;
+use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME};
 
-use utils::completion;
 use utils::fs_ext::PathExt;
 use utils::id::{TenantId, TimelineId};
 
@@ -65,7 +64,7 @@ pub async fn init_tenant_mgr(
     conf: &'static PageServerConf,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
-    init_done: (completion::Completion, completion::Barrier),
+    init_order: InitializationOrder,
 ) -> anyhow::Result<()> {
     // Scan local filesystem for attached tenants
     let tenants_dir = conf.tenants_path();
@@ -122,7 +121,7 @@ pub async fn init_tenant_mgr(
                         &tenant_dir_path,
                         broker_client.clone(),
                         remote_storage.clone(),
-                        Some(init_done.clone()),
+                        Some(init_order.clone()),
                         &ctx,
                     ) {
                         Ok(tenant) => {
@@ -153,14 +152,12 @@ pub async fn init_tenant_mgr(
     Ok(())
 }
 
-/// `init_done` is an optional channel used during initial load to delay background task
-/// start. It is not used later.
 pub fn schedule_local_tenant_processing(
     conf: &'static PageServerConf,
     tenant_path: &Path,
     broker_client: storage_broker::BrokerClientChannel,
     remote_storage: Option<GenericRemoteStorage>,
-    init_done: Option<(completion::Completion, completion::Barrier)>,
+    init_order: Option<InitializationOrder>,
     ctx: &RequestContext,
 ) -> anyhow::Result<Arc<Tenant>> {
     anyhow::ensure!(
@@ -219,7 +216,7 @@ pub fn schedule_local_tenant_processing(
             tenant_id,
             broker_client,
             remote_storage,
-            init_done,
+            init_order,
             ctx,
         )
     };
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 1bbc1b1c08..360818b5a7 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -15,10 +15,10 @@ use tracing::*;
 use utils::completion;
 
 /// Start per tenant background loops: compaction and gc.
-///
-/// `init_done` is an optional channel used during initial load to delay background task
-/// start. It is not used later.
-pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completion::Barrier>) {
+pub fn start_background_loops(
+    tenant: &Arc<Tenant>,
+    background_jobs_can_start: Option<&completion::Barrier>,
+) {
     let tenant_id = tenant.tenant_id;
     task_mgr::spawn(
         BACKGROUND_RUNTIME.handle(),
@@ -29,10 +29,14 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completio
         false,
         {
             let tenant = Arc::clone(tenant);
-            let init_done = init_done.cloned();
+            let background_jobs_can_start = background_jobs_can_start.cloned();
             async move {
-                completion::Barrier::maybe_wait(init_done).await;
-                compaction_loop(tenant)
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()) },
+                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                };
+                compaction_loop(tenant, cancel)
                     .instrument(info_span!("compaction_loop", tenant_id = %tenant_id))
                     .await;
                 Ok(())
@@ -48,10 +52,14 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completio
         false,
         {
             let tenant = Arc::clone(tenant);
-            let init_done = init_done.cloned();
+            let background_jobs_can_start = background_jobs_can_start.cloned();
             async move {
-                completion::Barrier::maybe_wait(init_done).await;
-                gc_loop(tenant)
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()) },
+                    _ = completion::Barrier::maybe_wait(background_jobs_can_start) => {}
+                };
+                gc_loop(tenant, cancel)
                     .instrument(info_span!("gc_loop", tenant_id = %tenant_id))
                     .await;
                 Ok(())
@@ -63,12 +71,11 @@ pub fn start_background_loops(tenant: &Arc<Tenant>, init_done: Option<&completio
 ///
 /// Compaction task's main loop
 ///
-async fn compaction_loop(tenant: Arc<Tenant>) {
+async fn compaction_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     let wait_duration = Duration::from_secs(2);
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
-        let cancel = task_mgr::shutdown_token();
         let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
         let mut first = true;
         loop {
@@ -133,12 +140,11 @@ async fn compaction_loop(tenant: Arc<Tenant>) {
 ///
 /// GC task's main loop
 ///
-async fn gc_loop(tenant: Arc<Tenant>) {
+async fn gc_loop(tenant: Arc<Tenant>, cancel: CancellationToken) {
     let wait_duration = Duration::from_secs(2);
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
-        let cancel = task_mgr::shutdown_token();
         // GC might require downloading, to find the cutoff LSN that corresponds to the
         // cutoff specified as time.
         let ctx =
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index fdaad58e16..507f0de4f3 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -242,6 +242,13 @@ pub struct Timeline {
     pub delete_lock: tokio::sync::Mutex<bool>,
 
     eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
+
+    /// Barrier to wait before doing initial logical size calculation. Used only during startup.
+    initial_logical_size_can_start: Option<completion::Barrier>,
+
+    /// Completion shared between all timelines loaded during startup; used to delay heavier
+    /// background tasks until some logical sizes have been calculated.
+    initial_logical_size_attempt: Mutex<Option<completion::Completion>>,
 }
 
 /// Internal structure to hold all data needed for logical size calculation.
@@ -932,12 +939,12 @@ impl Timeline {
     pub fn activate(
         self: &Arc<Self>,
         broker_client: BrokerClientChannel,
-        init_done: Option<&completion::Barrier>,
+        background_jobs_can_start: Option<&completion::Barrier>,
         ctx: &RequestContext,
     ) {
         self.launch_wal_receiver(ctx, broker_client);
         self.set_state(TimelineState::Active);
-        self.launch_eviction_task(init_done);
+        self.launch_eviction_task(background_jobs_can_start);
     }
 
     pub fn set_state(&self, new_state: TimelineState) {
@@ -955,6 +962,14 @@ impl Timeline {
                 error!("Not activating a Stopping timeline");
             }
             (_, new_state) => {
+                if matches!(new_state, TimelineState::Stopping | TimelineState::Broken) {
+                    // drop the copmletion guard, if any; it might be holding off the completion
+                    // forever needlessly
+                    self.initial_logical_size_attempt
+                        .lock()
+                        .unwrap_or_else(|e| e.into_inner())
+                        .take();
+                }
                 self.state.send_replace(new_state);
             }
         }
@@ -1345,6 +1360,8 @@ impl Timeline {
         walredo_mgr: Arc<dyn WalRedoManager + Send + Sync>,
         remote_client: Option<RemoteTimelineClient>,
         pg_version: u32,
+        initial_logical_size_can_start: Option<completion::Barrier>,
+        initial_logical_size_attempt: Option<completion::Completion>,
     ) -> Arc<Self> {
         let disk_consistent_lsn = metadata.disk_consistent_lsn();
         let (state, _) = watch::channel(TimelineState::Loading);
@@ -1439,6 +1456,9 @@ impl Timeline {
                     EvictionTaskTimelineState::default(),
                 ),
                 delete_lock: tokio::sync::Mutex::new(false),
+
+                initial_logical_size_can_start,
+                initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
             };
             result.repartition_threshold = result.get_checkpoint_distance() / 10;
             result
@@ -1927,7 +1947,27 @@ impl Timeline {
             false,
             // NB: don't log errors here, task_mgr will do that.
             async move {
-                // no cancellation here, because nothing really waits for this to complete compared
+
+                let cancel = task_mgr::shutdown_token();
+
+                // in case we were created during pageserver initialization, wait for
+                // initialization to complete before proceeding. startup time init runs on the same
+                // runtime.
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); },
+                    _ = completion::Barrier::maybe_wait(self_clone.initial_logical_size_can_start.clone()) => {}
+                };
+
+                // hold off background tasks from starting until all timelines get to try at least
+                // once initial logical size calculation; though retry will rarely be useful.
+                // holding off is done because heavier tasks execute blockingly on the same
+                // runtime.
+                //
+                // dropping this at every outcome is probably better than trying to cling on to it,
+                // delay will be terminated by a timeout regardless.
+                let _completion = { self_clone.initial_logical_size_attempt.lock().expect("unexpected initial_logical_size_attempt poisoned").take() };
+
+                // no extra cancellation here, because nothing really waits for this to complete compared
                 // to spawn_ondemand_logical_size_calculation.
                 let cancel = CancellationToken::new();
 
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 7029d75d63..1040dff63d 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -49,9 +49,12 @@ pub struct EvictionTaskTenantState {
 }
 
 impl Timeline {
-    pub(super) fn launch_eviction_task(self: &Arc<Self>, init_done: Option<&completion::Barrier>) {
+    pub(super) fn launch_eviction_task(
+        self: &Arc<Self>,
+        background_tasks_can_start: Option<&completion::Barrier>,
+    ) {
         let self_clone = Arc::clone(self);
-        let init_done = init_done.cloned();
+        let background_tasks_can_start = background_tasks_can_start.cloned();
         task_mgr::spawn(
             BACKGROUND_RUNTIME.handle(),
             TaskKind::Eviction,
@@ -60,8 +63,13 @@ impl Timeline {
             &format!("layer eviction for {}/{}", self.tenant_id, self.timeline_id),
             false,
             async move {
-                completion::Barrier::maybe_wait(init_done).await;
-                self_clone.eviction_task(task_mgr::shutdown_token()).await;
+                let cancel = task_mgr::shutdown_token();
+                tokio::select! {
+                    _ = cancel.cancelled() => { return Ok(()); }
+                    _ = completion::Barrier::maybe_wait(background_tasks_can_start) => {}
+                };
+
+                self_clone.eviction_task(cancel).await;
                 info!("eviction task finishing");
                 Ok(())
             },
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index ab67518092..0ec023b9e1 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -110,6 +110,12 @@ class EvictionEnv:
             overrides=(
                 "--pageserver-config-override=disk_usage_based_eviction="
                 + enc.dump_inline_table(disk_usage_config).replace("\n", " "),
+                # Disk usage based eviction runs as a background task.
+                # But pageserver startup delays launch of background tasks for some time, to prioritize initial logical size calculations during startup.
+                # But, initial logical size calculation may not be triggered if safekeepers don't publish new broker messages.
+                # But, we only have a 10-second-timeout in this test.
+                # So, disable the delay for this test.
+                "--pageserver-config-override=background_task_maximum_delay='0s'",
             ),
         )
 

From 37bf2cac4f6416ff22f30e5bdacf190363915e58 Mon Sep 17 00:00:00 2001
From: Arseny Sher <sher-ars@yandex.ru>
Date: Wed, 7 Jun 2023 12:53:27 +0400
Subject: [PATCH 06/14] Persist safekeeper control file once in a while.

It should make remote_consistent_lsn commonly up-to-date on non actively writing
projects, which removes spike or pageserver -> safekeeper reconnections on
storage nodes restart.
---
 safekeeper/src/control_file.rs | 12 ++++++++++++
 safekeeper/src/remove_wal.rs   |  3 +++
 safekeeper/src/safekeeper.rs   | 32 ++++++++++++++++++++++++++++----
 safekeeper/src/timeline.rs     | 12 +++++++++++-
 4 files changed, 54 insertions(+), 5 deletions(-)

diff --git a/safekeeper/src/control_file.rs b/safekeeper/src/control_file.rs
index ba5e453e41..b1b0c032d7 100644
--- a/safekeeper/src/control_file.rs
+++ b/safekeeper/src/control_file.rs
@@ -7,6 +7,7 @@ use std::fs::{self, File, OpenOptions};
 use std::io::{Read, Write};
 use std::ops::Deref;
 use std::path::{Path, PathBuf};
+use std::time::Instant;
 
 use crate::control_file_upgrade::upgrade_control_file;
 use crate::metrics::PERSIST_CONTROL_FILE_SECONDS;
@@ -28,6 +29,9 @@ pub const CHECKSUM_SIZE: usize = std::mem::size_of::<u32>();
 pub trait Storage: Deref<Target = SafeKeeperState> {
     /// Persist safekeeper state on disk and update internal state.
     fn persist(&mut self, s: &SafeKeeperState) -> Result<()>;
+
+    /// Timestamp of last persist.
+    fn last_persist_at(&self) -> Instant;
 }
 
 #[derive(Debug)]
@@ -38,6 +42,8 @@ pub struct FileStorage {
 
     /// Last state persisted to disk.
     state: SafeKeeperState,
+    /// Not preserved across restarts.
+    last_persist_at: Instant,
 }
 
 impl FileStorage {
@@ -51,6 +57,7 @@ impl FileStorage {
             timeline_dir,
             conf: conf.clone(),
             state,
+            last_persist_at: Instant::now(),
         })
     }
 
@@ -66,6 +73,7 @@ impl FileStorage {
             timeline_dir,
             conf: conf.clone(),
             state,
+            last_persist_at: Instant::now(),
         };
 
         Ok(store)
@@ -216,6 +224,10 @@ impl Storage for FileStorage {
         self.state = s.clone();
         Ok(())
     }
+
+    fn last_persist_at(&self) -> Instant {
+        self.last_persist_at
+    }
 }
 
 #[cfg(test)]
diff --git a/safekeeper/src/remove_wal.rs b/safekeeper/src/remove_wal.rs
index b6d497f34e..ad9d655fae 100644
--- a/safekeeper/src/remove_wal.rs
+++ b/safekeeper/src/remove_wal.rs
@@ -17,6 +17,9 @@ pub fn thread_main(conf: SafeKeeperConf) {
             let ttid = tli.ttid;
             let _enter =
                 info_span!("", tenant = %ttid.tenant_id, timeline = %ttid.timeline_id).entered();
+            if let Err(e) = tli.maybe_pesist_control_file() {
+                warn!("failed to persist control file: {e}");
+            }
             if let Err(e) = tli.remove_old_wal(conf.wal_backup_enabled) {
                 warn!("failed to remove WAL: {}", e);
             }
diff --git a/safekeeper/src/safekeeper.rs b/safekeeper/src/safekeeper.rs
index eb434136d4..7378ccb994 100644
--- a/safekeeper/src/safekeeper.rs
+++ b/safekeeper/src/safekeeper.rs
@@ -10,6 +10,7 @@ use std::cmp::max;
 use std::cmp::min;
 use std::fmt;
 use std::io::Read;
+use std::time::Duration;
 use storage_broker::proto::SafekeeperTimelineInfo;
 
 use tracing::*;
@@ -837,6 +838,26 @@ where
         self.state.persist(&state)
     }
 
+    /// Persist control file if there is something to save and enough time
+    /// passed after the last save.
+    pub fn maybe_persist_control_file(&mut self, inmem_remote_consistent_lsn: Lsn) -> Result<()> {
+        const CF_SAVE_INTERVAL: Duration = Duration::from_secs(300);
+        if self.state.last_persist_at().elapsed() < CF_SAVE_INTERVAL {
+            return Ok(());
+        }
+        let need_persist = self.inmem.commit_lsn > self.state.commit_lsn
+            || self.inmem.backup_lsn > self.state.backup_lsn
+            || self.inmem.peer_horizon_lsn > self.state.peer_horizon_lsn
+            || inmem_remote_consistent_lsn > self.state.remote_consistent_lsn;
+        if need_persist {
+            let mut state = self.state.clone();
+            state.remote_consistent_lsn = inmem_remote_consistent_lsn;
+            self.persist_control_file(state)?;
+            trace!("saved control file: {CF_SAVE_INTERVAL:?} passed");
+        }
+        Ok(())
+    }
+
     /// Handle request to append WAL.
     #[allow(clippy::comparison_chain)]
     fn handle_append_request(
@@ -949,9 +970,8 @@ where
 
         if sync_control_file {
             let mut state = self.state.clone();
-            // Note: we do not persist remote_consistent_lsn in other paths of
-            // persisting cf -- that is not much needed currently. We could do
-            // that by storing Arc to walsenders in Safekeeper.
+            // Note: we could make remote_consistent_lsn update in cf common by
+            // storing Arc to walsenders in Safekeeper.
             state.remote_consistent_lsn = new_remote_consistent_lsn;
             self.persist_control_file(state)?;
         }
@@ -981,7 +1001,7 @@ mod tests {
 
     use super::*;
     use crate::wal_storage::Storage;
-    use std::ops::Deref;
+    use std::{ops::Deref, time::Instant};
 
     // fake storage for tests
     struct InMemoryState {
@@ -993,6 +1013,10 @@ mod tests {
             self.persisted_state = s.clone();
             Ok(())
         }
+
+        fn last_persist_at(&self) -> Instant {
+            Instant::now()
+        }
     }
 
     impl Deref for InMemoryState {
diff --git a/safekeeper/src/timeline.rs b/safekeeper/src/timeline.rs
index 2dbf215998..941f8dae54 100644
--- a/safekeeper/src/timeline.rs
+++ b/safekeeper/src/timeline.rs
@@ -234,7 +234,6 @@ impl SharedState {
             flush_lsn: self.sk.wal_store.flush_lsn().0,
             // note: this value is not flushed to control file yet and can be lost
             commit_lsn: self.sk.inmem.commit_lsn.0,
-            // TODO: rework feedbacks to avoid max here
             remote_consistent_lsn: remote_consistent_lsn.0,
             peer_horizon_lsn: self.sk.inmem.peer_horizon_lsn.0,
             safekeeper_connstr: conf.listen_pg_addr.clone(),
@@ -673,6 +672,17 @@ impl Timeline {
         Ok(())
     }
 
+    /// Persist control file if there is something to save and enough time
+    /// passed after the last save. This helps to keep remote_consistent_lsn up
+    /// to date so that storage nodes restart doesn't cause many pageserver ->
+    /// safekeeper reconnections.
+    pub fn maybe_pesist_control_file(&self) -> Result<()> {
+        let remote_consistent_lsn = self.walsenders.get_remote_consistent_lsn();
+        self.write_shared_state()
+            .sk
+            .maybe_persist_control_file(remote_consistent_lsn)
+    }
+
     /// Returns full timeline info, required for the metrics. If the timeline is
     /// not active, returns None instead.
     pub fn info_for_metrics(&self) -> Option<FullTimelineInfo> {

From 1c200bd15f34d240d5b2bf55c31414229f10e70e Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 7 Jun 2023 10:51:13 -0400
Subject: [PATCH 07/14] fix: break dev dependencies between wal_craft and
 pg_ffi (#4424)

## Problem

close https://github.com/neondatabase/neon/issues/4266

## Summary of changes

With this PR, rust-analyzer should be able to give lints and auto
complete in `mod tests`, and this makes writing tests easier.
Previously, rust-analyzer cannot do auto completion.

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
---
 Cargo.lock                                    |   3 +-
 Cargo.toml                                    |  15 +-
 libs/postgres_ffi/Cargo.toml                  |   1 -
 libs/postgres_ffi/src/lib.rs                  |  12 +-
 .../postgres_ffi/src/wal_craft_test_export.rs |   6 +
 libs/postgres_ffi/src/xlog_utils.rs           | 218 +----------------
 libs/postgres_ffi/wal_craft/Cargo.toml        |   4 +
 libs/postgres_ffi/wal_craft/src/lib.rs        |  14 ++
 .../wal_craft/src/xlog_utils_test.rs          | 219 ++++++++++++++++++
 9 files changed, 270 insertions(+), 222 deletions(-)
 create mode 100644 libs/postgres_ffi/src/wal_craft_test_export.rs
 create mode 100644 libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs

diff --git a/Cargo.lock b/Cargo.lock
index d390df94e0..c078510129 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2874,7 +2874,6 @@ dependencies = [
  "serde",
  "thiserror",
  "utils",
- "wal_craft",
  "workspace_hack",
 ]
 
@@ -4894,7 +4893,9 @@ dependencies = [
  "once_cell",
  "postgres",
  "postgres_ffi",
+ "regex",
  "tempfile",
+ "utils",
  "workspace_hack",
 ]
 
diff --git a/Cargo.toml b/Cargo.toml
index 1cb8d65948..d7bffe67e1 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -9,7 +9,20 @@ members = [
     "storage_broker",
     "workspace_hack",
     "trace",
-    "libs/*",
+    "libs/compute_api",
+    "libs/pageserver_api",
+    "libs/postgres_ffi",
+    "libs/safekeeper_api",
+    "libs/utils",
+    "libs/consumption_metrics",
+    "libs/postgres_backend",
+    "libs/pq_proto",
+    "libs/tenant_size_model",
+    "libs/metrics",
+    "libs/postgres_connection",
+    "libs/remote_storage",
+    "libs/tracing-utils",
+    "libs/postgres_ffi/wal_craft",
 ]
 
 [workspace.package]
diff --git a/libs/postgres_ffi/Cargo.toml b/libs/postgres_ffi/Cargo.toml
index 159fc5946d..86e72f6bdd 100644
--- a/libs/postgres_ffi/Cargo.toml
+++ b/libs/postgres_ffi/Cargo.toml
@@ -24,7 +24,6 @@ workspace_hack.workspace = true
 [dev-dependencies]
 env_logger.workspace = true
 postgres.workspace = true
-wal_craft = { path = "wal_craft" }
 
 [build-dependencies]
 anyhow.workspace = true
diff --git a/libs/postgres_ffi/src/lib.rs b/libs/postgres_ffi/src/lib.rs
index b8eb469cb0..cc115664d5 100644
--- a/libs/postgres_ffi/src/lib.rs
+++ b/libs/postgres_ffi/src/lib.rs
@@ -33,6 +33,7 @@ macro_rules! postgres_ffi {
             }
             pub mod controlfile_utils;
             pub mod nonrelfile_utils;
+            pub mod wal_craft_test_export;
             pub mod waldecoder_handler;
             pub mod xlog_utils;
 
@@ -45,8 +46,15 @@ macro_rules! postgres_ffi {
     };
 }
 
-postgres_ffi!(v14);
-postgres_ffi!(v15);
+#[macro_export]
+macro_rules! for_all_postgres_versions {
+    ($macro:tt) => {
+        $macro!(v14);
+        $macro!(v15);
+    };
+}
+
+for_all_postgres_versions! { postgres_ffi }
 
 pub mod pg_constants;
 pub mod relfile_utils;
diff --git a/libs/postgres_ffi/src/wal_craft_test_export.rs b/libs/postgres_ffi/src/wal_craft_test_export.rs
new file mode 100644
index 0000000000..147567c442
--- /dev/null
+++ b/libs/postgres_ffi/src/wal_craft_test_export.rs
@@ -0,0 +1,6 @@
+//! This module is for WAL craft to test with postgres_ffi. Should not import any thing in normal usage.
+
+pub use super::PG_MAJORVERSION;
+pub use super::xlog_utils::*;
+pub use super::bindings::*;
+pub use crate::WAL_SEGMENT_SIZE;
diff --git a/libs/postgres_ffi/src/xlog_utils.rs b/libs/postgres_ffi/src/xlog_utils.rs
index 4d7bb61883..61a9c38a84 100644
--- a/libs/postgres_ffi/src/xlog_utils.rs
+++ b/libs/postgres_ffi/src/xlog_utils.rs
@@ -481,220 +481,4 @@ pub fn encode_logical_message(prefix: &str, message: &str) -> Vec<u8> {
     wal
 }
 
-#[cfg(test)]
-mod tests {
-    use super::super::PG_MAJORVERSION;
-    use super::*;
-    use regex::Regex;
-    use std::cmp::min;
-    use std::fs;
-    use std::{env, str::FromStr};
-    use utils::const_assert;
-
-    fn init_logging() {
-        let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
-            format!("wal_craft=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
-        ))
-        .is_test(true)
-        .try_init();
-    }
-
-    fn test_end_of_wal<C: wal_craft::Crafter>(test_name: &str) {
-        use wal_craft::*;
-
-        let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
-
-        // Craft some WAL
-        let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
-            .join("..")
-            .join("..");
-        let cfg = Conf {
-            pg_version,
-            pg_distrib_dir: top_path.join("pg_install"),
-            datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
-        };
-        if cfg.datadir.exists() {
-            fs::remove_dir_all(&cfg.datadir).unwrap();
-        }
-        cfg.initdb().unwrap();
-        let srv = cfg.start_server().unwrap();
-        let (intermediate_lsns, expected_end_of_wal_partial) =
-            C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
-        let intermediate_lsns: Vec<Lsn> = intermediate_lsns
-            .iter()
-            .map(|&lsn| u64::from(lsn).into())
-            .collect();
-        let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
-        srv.kill();
-
-        // Check find_end_of_wal on the initial WAL
-        let last_segment = cfg
-            .wal_dir()
-            .read_dir()
-            .unwrap()
-            .map(|f| f.unwrap().file_name().into_string().unwrap())
-            .filter(|fname| IsXLogFileName(fname))
-            .max()
-            .unwrap();
-        check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
-        for start_lsn in intermediate_lsns
-            .iter()
-            .chain(std::iter::once(&expected_end_of_wal))
-        {
-            // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
-            // We assume that `start_lsn` is non-decreasing.
-            info!(
-                "Checking with start_lsn={}, erasing WAL before it",
-                start_lsn
-            );
-            for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
-                let fname = file.file_name().into_string().unwrap();
-                if !IsXLogFileName(&fname) {
-                    continue;
-                }
-                let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
-                let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
-                if seg_start_lsn > u64::from(*start_lsn) {
-                    continue;
-                }
-                let mut f = File::options().write(true).open(file.path()).unwrap();
-                const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
-                f.write_all(
-                    &ZEROS[0..min(
-                        WAL_SEGMENT_SIZE,
-                        (u64::from(*start_lsn) - seg_start_lsn) as usize,
-                    )],
-                )
-                .unwrap();
-            }
-            check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
-        }
-    }
-
-    fn check_pg_waldump_end_of_wal(
-        cfg: &wal_craft::Conf,
-        last_segment: &str,
-        expected_end_of_wal: Lsn,
-    ) {
-        // Get the actual end of WAL by pg_waldump
-        let waldump_output = cfg
-            .pg_waldump("000000010000000000000001", last_segment)
-            .unwrap()
-            .stderr;
-        let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
-        let caps = match Regex::new(r"invalid record length at (.+):")
-            .unwrap()
-            .captures(waldump_output)
-        {
-            Some(caps) => caps,
-            None => {
-                error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
-                panic!();
-            }
-        };
-        let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
-        info!(
-            "waldump erred on {}, expected wal end at {}",
-            waldump_wal_end, expected_end_of_wal
-        );
-        assert_eq!(waldump_wal_end, expected_end_of_wal);
-    }
-
-    fn check_end_of_wal(
-        cfg: &wal_craft::Conf,
-        last_segment: &str,
-        start_lsn: Lsn,
-        expected_end_of_wal: Lsn,
-    ) {
-        // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
-        // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
-        // info!(
-        //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
-        //     wal_end
-        // );
-        // assert_eq!(wal_end, expected_end_of_wal_non_partial);
-
-        // Rename file to partial to actually find last valid lsn, then rename it back.
-        fs::rename(
-            cfg.wal_dir().join(last_segment),
-            cfg.wal_dir().join(format!("{}.partial", last_segment)),
-        )
-        .unwrap();
-        let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
-        info!(
-            "find_end_of_wal returned wal_end={} with partial WAL segment",
-            wal_end
-        );
-        assert_eq!(wal_end, expected_end_of_wal);
-        fs::rename(
-            cfg.wal_dir().join(format!("{}.partial", last_segment)),
-            cfg.wal_dir().join(last_segment),
-        )
-        .unwrap();
-    }
-
-    const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
-
-    #[test]
-    pub fn test_find_end_of_wal_simple() {
-        init_logging();
-        test_end_of_wal::<wal_craft::Simple>("test_find_end_of_wal_simple");
-    }
-
-    #[test]
-    pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
-        init_logging();
-        test_end_of_wal::<wal_craft::WalRecordCrossingSegmentFollowedBySmallOne>(
-            "test_find_end_of_wal_crossing_segment_followed_by_small_one",
-        );
-    }
-
-    #[test]
-    pub fn test_find_end_of_wal_last_crossing_segment() {
-        init_logging();
-        test_end_of_wal::<wal_craft::LastWalRecordCrossingSegment>(
-            "test_find_end_of_wal_last_crossing_segment",
-        );
-    }
-
-    /// Check the math in update_next_xid
-    ///
-    /// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
-    /// currently 1024.
-    #[test]
-    pub fn test_update_next_xid() {
-        let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
-        let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
-
-        checkpoint.nextXid = FullTransactionId { value: 10 };
-        assert_eq!(checkpoint.nextXid.value, 10);
-
-        // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
-        // boundary
-        checkpoint.update_next_xid(100);
-        assert_eq!(checkpoint.nextXid.value, 1024);
-
-        // No change
-        checkpoint.update_next_xid(500);
-        assert_eq!(checkpoint.nextXid.value, 1024);
-        checkpoint.update_next_xid(1023);
-        assert_eq!(checkpoint.nextXid.value, 1024);
-
-        // The function returns the *next* XID, given the highest XID seen so
-        // far. So when we pass 1024, the nextXid gets bumped up to the next
-        // XID_CHECKPOINT_INTERVAL boundary.
-        checkpoint.update_next_xid(1024);
-        assert_eq!(checkpoint.nextXid.value, 2048);
-    }
-
-    #[test]
-    pub fn test_encode_logical_message() {
-        let expected = [
-            64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
-            38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
-            101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
-        ];
-        let actual = encode_logical_message("prefix", "message");
-        assert_eq!(expected, actual[..]);
-    }
-}
+// If you need to craft WAL and write tests for this module, put it at wal_craft crate.
diff --git a/libs/postgres_ffi/wal_craft/Cargo.toml b/libs/postgres_ffi/wal_craft/Cargo.toml
index 992bf7460b..bea888b23e 100644
--- a/libs/postgres_ffi/wal_craft/Cargo.toml
+++ b/libs/postgres_ffi/wal_craft/Cargo.toml
@@ -15,3 +15,7 @@ postgres_ffi.workspace = true
 tempfile.workspace = true
 
 workspace_hack.workspace = true
+
+[dev-dependencies]
+regex.workspace = true
+utils.workspace = true
diff --git a/libs/postgres_ffi/wal_craft/src/lib.rs b/libs/postgres_ffi/wal_craft/src/lib.rs
index 9f3f4dc20d..d4aed88048 100644
--- a/libs/postgres_ffi/wal_craft/src/lib.rs
+++ b/libs/postgres_ffi/wal_craft/src/lib.rs
@@ -10,6 +10,20 @@ use std::process::Command;
 use std::time::{Duration, Instant};
 use tempfile::{tempdir, TempDir};
 
+macro_rules! xlog_utils_test {
+    ($version:ident) => {
+        #[path = "."]
+        mod $version {
+            pub use postgres_ffi::$version::wal_craft_test_export::*;
+            #[allow(clippy::duplicate_mod)]
+            #[cfg(test)]
+            mod xlog_utils_test;
+        }
+    };
+}
+
+postgres_ffi::for_all_postgres_versions! { xlog_utils_test }
+
 #[derive(Debug, Clone, PartialEq, Eq)]
 pub struct Conf {
     pub pg_version: u32,
diff --git a/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
new file mode 100644
index 0000000000..6ff4c563b2
--- /dev/null
+++ b/libs/postgres_ffi/wal_craft/src/xlog_utils_test.rs
@@ -0,0 +1,219 @@
+//! Tests for postgres_ffi xlog_utils module. Put it here to break cyclic dependency.
+
+use super::*;
+use crate::{error, info};
+use regex::Regex;
+use std::cmp::min;
+use std::fs::{self, File};
+use std::io::Write;
+use std::{env, str::FromStr};
+use utils::const_assert;
+use utils::lsn::Lsn;
+
+fn init_logging() {
+    let _ = env_logger::Builder::from_env(env_logger::Env::default().default_filter_or(
+        format!("crate=info,postgres_ffi::{PG_MAJORVERSION}::xlog_utils=trace"),
+    ))
+    .is_test(true)
+    .try_init();
+}
+
+fn test_end_of_wal<C: crate::Crafter>(test_name: &str) {
+    use crate::*;
+
+    let pg_version = PG_MAJORVERSION[1..3].parse::<u32>().unwrap();
+
+    // Craft some WAL
+    let top_path = PathBuf::from(env!("CARGO_MANIFEST_DIR"))
+        .join("..")
+        .join("..")
+        .join("..");
+    let cfg = Conf {
+        pg_version,
+        pg_distrib_dir: top_path.join("pg_install"),
+        datadir: top_path.join(format!("test_output/{}-{PG_MAJORVERSION}", test_name)),
+    };
+    if cfg.datadir.exists() {
+        fs::remove_dir_all(&cfg.datadir).unwrap();
+    }
+    cfg.initdb().unwrap();
+    let srv = cfg.start_server().unwrap();
+    let (intermediate_lsns, expected_end_of_wal_partial) =
+        C::craft(&mut srv.connect_with_timeout().unwrap()).unwrap();
+    let intermediate_lsns: Vec<Lsn> = intermediate_lsns
+        .iter()
+        .map(|&lsn| u64::from(lsn).into())
+        .collect();
+    let expected_end_of_wal: Lsn = u64::from(expected_end_of_wal_partial).into();
+    srv.kill();
+
+    // Check find_end_of_wal on the initial WAL
+    let last_segment = cfg
+        .wal_dir()
+        .read_dir()
+        .unwrap()
+        .map(|f| f.unwrap().file_name().into_string().unwrap())
+        .filter(|fname| IsXLogFileName(fname))
+        .max()
+        .unwrap();
+    check_pg_waldump_end_of_wal(&cfg, &last_segment, expected_end_of_wal);
+    for start_lsn in intermediate_lsns
+        .iter()
+        .chain(std::iter::once(&expected_end_of_wal))
+    {
+        // Erase all WAL before `start_lsn` to ensure it's not used by `find_end_of_wal`.
+        // We assume that `start_lsn` is non-decreasing.
+        info!(
+            "Checking with start_lsn={}, erasing WAL before it",
+            start_lsn
+        );
+        for file in fs::read_dir(cfg.wal_dir()).unwrap().flatten() {
+            let fname = file.file_name().into_string().unwrap();
+            if !IsXLogFileName(&fname) {
+                continue;
+            }
+            let (segno, _) = XLogFromFileName(&fname, WAL_SEGMENT_SIZE);
+            let seg_start_lsn = XLogSegNoOffsetToRecPtr(segno, 0, WAL_SEGMENT_SIZE);
+            if seg_start_lsn > u64::from(*start_lsn) {
+                continue;
+            }
+            let mut f = File::options().write(true).open(file.path()).unwrap();
+            const ZEROS: [u8; WAL_SEGMENT_SIZE] = [0u8; WAL_SEGMENT_SIZE];
+            f.write_all(
+                &ZEROS[0..min(
+                    WAL_SEGMENT_SIZE,
+                    (u64::from(*start_lsn) - seg_start_lsn) as usize,
+                )],
+            )
+            .unwrap();
+        }
+        check_end_of_wal(&cfg, &last_segment, *start_lsn, expected_end_of_wal);
+    }
+}
+
+fn check_pg_waldump_end_of_wal(
+    cfg: &crate::Conf,
+    last_segment: &str,
+    expected_end_of_wal: Lsn,
+) {
+    // Get the actual end of WAL by pg_waldump
+    let waldump_output = cfg
+        .pg_waldump("000000010000000000000001", last_segment)
+        .unwrap()
+        .stderr;
+    let waldump_output = std::str::from_utf8(&waldump_output).unwrap();
+    let caps = match Regex::new(r"invalid record length at (.+):")
+        .unwrap()
+        .captures(waldump_output)
+    {
+        Some(caps) => caps,
+        None => {
+            error!("Unable to parse pg_waldump's stderr:\n{}", waldump_output);
+            panic!();
+        }
+    };
+    let waldump_wal_end = Lsn::from_str(caps.get(1).unwrap().as_str()).unwrap();
+    info!(
+        "waldump erred on {}, expected wal end at {}",
+        waldump_wal_end, expected_end_of_wal
+    );
+    assert_eq!(waldump_wal_end, expected_end_of_wal);
+}
+
+fn check_end_of_wal(
+    cfg: &crate::Conf,
+    last_segment: &str,
+    start_lsn: Lsn,
+    expected_end_of_wal: Lsn,
+) {
+    // Check end_of_wal on non-partial WAL segment (we treat it as fully populated)
+    // let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
+    // info!(
+    //     "find_end_of_wal returned wal_end={} with non-partial WAL segment",
+    //     wal_end
+    // );
+    // assert_eq!(wal_end, expected_end_of_wal_non_partial);
+
+    // Rename file to partial to actually find last valid lsn, then rename it back.
+    fs::rename(
+        cfg.wal_dir().join(last_segment),
+        cfg.wal_dir().join(format!("{}.partial", last_segment)),
+    )
+    .unwrap();
+    let wal_end = find_end_of_wal(&cfg.wal_dir(), WAL_SEGMENT_SIZE, start_lsn).unwrap();
+    info!(
+        "find_end_of_wal returned wal_end={} with partial WAL segment",
+        wal_end
+    );
+    assert_eq!(wal_end, expected_end_of_wal);
+    fs::rename(
+        cfg.wal_dir().join(format!("{}.partial", last_segment)),
+        cfg.wal_dir().join(last_segment),
+    )
+    .unwrap();
+}
+
+const_assert!(WAL_SEGMENT_SIZE == 16 * 1024 * 1024);
+
+#[test]
+pub fn test_find_end_of_wal_simple() {
+    init_logging();
+    test_end_of_wal::<crate::Simple>("test_find_end_of_wal_simple");
+}
+
+#[test]
+pub fn test_find_end_of_wal_crossing_segment_followed_by_small_one() {
+    init_logging();
+    test_end_of_wal::<crate::WalRecordCrossingSegmentFollowedBySmallOne>(
+        "test_find_end_of_wal_crossing_segment_followed_by_small_one",
+    );
+}
+
+#[test]
+pub fn test_find_end_of_wal_last_crossing_segment() {
+    init_logging();
+    test_end_of_wal::<crate::LastWalRecordCrossingSegment>(
+        "test_find_end_of_wal_last_crossing_segment",
+    );
+}
+
+/// Check the math in update_next_xid
+///
+/// NOTE: These checks are sensitive to the value of XID_CHECKPOINT_INTERVAL,
+/// currently 1024.
+#[test]
+pub fn test_update_next_xid() {
+    let checkpoint_buf = [0u8; std::mem::size_of::<CheckPoint>()];
+    let mut checkpoint = CheckPoint::decode(&checkpoint_buf).unwrap();
+
+    checkpoint.nextXid = FullTransactionId { value: 10 };
+    assert_eq!(checkpoint.nextXid.value, 10);
+
+    // The input XID gets rounded up to the next XID_CHECKPOINT_INTERVAL
+    // boundary
+    checkpoint.update_next_xid(100);
+    assert_eq!(checkpoint.nextXid.value, 1024);
+
+    // No change
+    checkpoint.update_next_xid(500);
+    assert_eq!(checkpoint.nextXid.value, 1024);
+    checkpoint.update_next_xid(1023);
+    assert_eq!(checkpoint.nextXid.value, 1024);
+
+    // The function returns the *next* XID, given the highest XID seen so
+    // far. So when we pass 1024, the nextXid gets bumped up to the next
+    // XID_CHECKPOINT_INTERVAL boundary.
+    checkpoint.update_next_xid(1024);
+    assert_eq!(checkpoint.nextXid.value, 2048);
+}
+
+#[test]
+pub fn test_encode_logical_message() {
+    let expected = [
+        64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 21, 0, 0, 170, 34, 166, 227, 255,
+        38, 0, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 7, 0, 0, 0, 0, 0, 0, 0, 112, 114,
+        101, 102, 105, 120, 0, 109, 101, 115, 115, 97, 103, 101,
+    ];
+    let actual = encode_logical_message("prefix", "message");
+    assert_eq!(expected, actual[..]);
+}

From 1a1019990a59aa69675e4329d1e5e1e08bbddb71 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Wed, 7 Jun 2023 18:25:30 +0300
Subject: [PATCH 08/14] map TenantState::Broken to
 TenantAttachmentStatus::Failed (#4371)

## Problem

Attach failures are not reported in public part of the api (in
`attachment_status` field of TenantInfo).

## Summary of changes

Expose TenantState::Broken as TenantAttachmentStatus::Failed

In the way its written Failed status will be reported even if no
attachment happened. (I e if tenant become broken on startup). This is
in line with other members. I e Active will be resolved to Attached even
if no actual attach took place.

This can be tweaked if needed. At the current stage it would be overengineering without clear motivation

resolves #4344
---
 libs/pageserver_api/src/models.rs          | 22 ++++++++++++--------
 pageserver/src/http/openapi_spec.yml       | 24 ++++++++++++++++++----
 test_runner/fixtures/pageserver/utils.py   |  6 +++---
 test_runner/regress/test_remote_storage.py |  7 ++++++-
 test_runner/regress/test_tenant_detach.py  |  2 +-
 5 files changed, 43 insertions(+), 18 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 162bf6b294..ddce82324c 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -110,12 +110,11 @@ impl TenantState {
             Self::Active => Attached,
             // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
             // However, it also becomes Broken if the regular load fails.
-            // We would need a separate TenantState variant to distinguish these cases.
-            // However, there's no practical difference from Console's perspective.
-            // It will run a Postgres-level health check as soon as it observes Attached.
-            // That will fail on Broken tenants.
-            // Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
-            Self::Broken { .. } => Attached,
+            // From Console's perspective there's no practical difference
+            // because attachment_status is polled by console only during attach operation execution.
+            Self::Broken { reason, .. } => Failed {
+                reason: reason.to_owned(),
+            },
             // Why is Stopping a Maybe case? Because, during pageserver shutdown,
             // we set the Stopping state irrespective of whether the tenant
             // has finished attaching or not.
@@ -312,10 +311,11 @@ impl std::ops::Deref for TenantAttachConfig {
 
 /// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
 #[derive(Serialize, Deserialize, Clone)]
-#[serde(rename_all = "snake_case")]
+#[serde(tag = "slug", content = "data", rename_all = "snake_case")]
 pub enum TenantAttachmentStatus {
     Maybe,
     Attached,
+    Failed { reason: String },
 }
 
 #[serde_as]
@@ -809,7 +809,9 @@ mod tests {
                 "slug": "Active",
             },
             "current_physical_size": 42,
-            "attachment_status": "attached",
+            "attachment_status": {
+                "slug":"attached",
+            }
         });
 
         let original_broken = TenantInfo {
@@ -831,7 +833,9 @@ mod tests {
                 }
             },
             "current_physical_size": 42,
-            "attachment_status": "attached",
+            "attachment_status": {
+                "slug":"attached",
+            }
         });
 
         assert_eq!(
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 0d912c95e0..1f8298ca3e 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -928,12 +928,28 @@ components:
               writing to the tenant's S3 state, so, DO NOT ATTACH the
               tenant to any other pageserver, or we risk split-brain.
             - `attached` means that the attach operation has completed,
-              maybe successfully, maybe not. Perform a health check at
-              the Postgres level to determine healthiness of the tenant.
+              successfully
+            - `failed` means that attach has failed. For reason check corresponding `reason` failed.
+              `failed` is the terminal state, retrying attach call wont resolve the issue.
+              For example this can be caused by s3 being unreachable. The retry may be implemented
+              with call to detach, though it would be better to not automate it and inspec failed state
+              manually before proceeding with a retry.
 
             See the tenant `/attach` endpoint for more information.
-          type: string
-          enum: [ "maybe", "attached" ]
+          type: object
+          required:
+            - slug
+            - data
+          properties:
+            slug:
+              type: string
+              enum: [ "maybe", "attached", "failed" ]
+            data:
+            - type: object
+              properties:
+                reason:
+                  type: string
+
     TenantCreateRequest:
       allOf:
         - $ref: '#/components/schemas/TenantConfig'
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index c558387413..d7ffa633fd 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -1,5 +1,5 @@
 import time
-from typing import Optional
+from typing import Any, Dict, Optional
 
 from fixtures.log_helper import log
 from fixtures.pageserver.http import PageserverHttpClient
@@ -72,7 +72,7 @@ def wait_until_tenant_state(
     expected_state: str,
     iterations: int,
     period: float = 1.0,
-) -> bool:
+) -> Dict[str, Any]:
     """
     Does not use `wait_until` for debugging purposes
     """
@@ -81,7 +81,7 @@ def wait_until_tenant_state(
             tenant = pageserver_http.tenant_status(tenant_id=tenant_id)
             log.debug(f"Tenant {tenant_id} data: {tenant}")
             if tenant["state"]["slug"] == expected_state:
-                return True
+                return tenant
         except Exception as e:
             log.debug(f"Tenant {tenant_id} state retrieval failure: {e}")
 
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index baef8ecacc..742dbfff95 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -147,7 +147,12 @@ def test_remote_storage_backup_and_restore(
     # listing the remote timelines will fail because of the failpoint,
     # and the tenant will be marked as Broken.
     client.tenant_attach(tenant_id)
-    wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15)
+
+    tenant_info = wait_until_tenant_state(pageserver_http, tenant_id, "Broken", 15)
+    assert tenant_info["attachment_status"] == {
+        "slug": "failed",
+        "data": {"reason": "storage-sync-list-remote-timelines"},
+    }
 
     # Ensure that even though the tenant is broken, we can't attach it again.
     with pytest.raises(Exception, match=f"tenant {tenant_id} already exists, state: Broken"):
diff --git a/test_runner/regress/test_tenant_detach.py b/test_runner/regress/test_tenant_detach.py
index 9d0fdcfaf8..2a015d5d17 100644
--- a/test_runner/regress/test_tenant_detach.py
+++ b/test_runner/regress/test_tenant_detach.py
@@ -532,7 +532,7 @@ def test_ignored_tenant_reattach(
 ):
     neon_env_builder.enable_remote_storage(
         remote_storage_kind=remote_storage_kind,
-        test_name="test_remote_storage_backup_and_restore",
+        test_name="test_ignored_tenant_reattach",
     )
     env = neon_env_builder.init_start()
     pageserver_http = env.pageserver.http_client()

From 2e687bca5b4b541bdd483d6b0448367d547059c7 Mon Sep 17 00:00:00 2001
From: Alex Chi Z <iskyzh@gmail.com>
Date: Wed, 7 Jun 2023 11:28:18 -0400
Subject: [PATCH 09/14] refactor: use LayerDesc in layer map (part 1) (#4408)

## Problem

part of https://github.com/neondatabase/neon/issues/4392

## Summary of changes

This PR adds a new HashMap that maps persistent layer desc to the layer
object *inside* LayerMap. Originally I directly went towards adding such
layer cache in Timeline, but the changes are too many and cannot be
reviewed as a reasonably-sized PR. Therefore, we take this intermediate
step to change part of the codebase to use persistent layer desc, and
come up with other PRs to move this hash map of layer desc to the
timeline struct.

Also, file_size is now part of the layer desc.

---------

Signed-off-by: Alex Chi <iskyzh@gmail.com>
Co-authored-by: bojanserafimov <bojan.serafimov7@gmail.com>
---
 pageserver/benches/bench_layer_map.rs         |   4 +-
 pageserver/src/tenant/layer_map.rs            | 163 +++++++++++++-----
 pageserver/src/tenant/storage_layer.rs        |  20 ++-
 .../src/tenant/storage_layer/delta_layer.rs   |  16 +-
 .../src/tenant/storage_layer/image_layer.rs   |  32 ++--
 .../src/tenant/storage_layer/layer_desc.rs    |  69 +++++++-
 .../src/tenant/storage_layer/remote_layer.rs  |   6 +-
 pageserver/src/tenant/timeline.rs             |  27 +--
 8 files changed, 247 insertions(+), 90 deletions(-)

diff --git a/pageserver/benches/bench_layer_map.rs b/pageserver/benches/bench_layer_map.rs
index ee5980212e..45dc9fad4a 100644
--- a/pageserver/benches/bench_layer_map.rs
+++ b/pageserver/benches/bench_layer_map.rs
@@ -33,7 +33,7 @@ fn build_layer_map(filename_dump: PathBuf) -> LayerMap<LayerDescriptor> {
         min_lsn = min(min_lsn, lsn_range.start);
         max_lsn = max(max_lsn, Lsn(lsn_range.end.0 - 1));
 
-        updates.insert_historic(Arc::new(layer));
+        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
     }
 
     println!("min: {min_lsn}, max: {max_lsn}");
@@ -215,7 +215,7 @@ fn bench_sequential(c: &mut Criterion) {
             is_incremental: false,
             short_id: format!("Layer {}", i),
         };
-        updates.insert_historic(Arc::new(layer));
+        updates.insert_historic(layer.get_persistent_layer_desc(), Arc::new(layer));
     }
     updates.flush();
     println!("Finished layer map init in {:?}", now.elapsed());
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 8d06ccd565..ca1a71b623 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,7 +51,9 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use crate::tenant::storage_layer::Layer;
+use anyhow::Context;
 use anyhow::Result;
+use std::collections::HashMap;
 use std::collections::VecDeque;
 use std::ops::Range;
 use std::sync::Arc;
@@ -61,6 +63,8 @@ use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::Replacement;
 
 use super::storage_layer::range_eq;
+use super::storage_layer::PersistentLayerDesc;
+use super::storage_layer::PersistentLayerKey;
 
 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -86,11 +90,16 @@ pub struct LayerMap<L: ?Sized> {
     pub frozen_layers: VecDeque<Arc<InMemoryLayer>>,
 
     /// Index of the historic layers optimized for search
-    historic: BufferedHistoricLayerCoverage<Arc<L>>,
+    historic: BufferedHistoricLayerCoverage<Arc<PersistentLayerDesc>>,
 
     /// L0 layers have key range Key::MIN..Key::MAX, and locating them using R-Tree search is very inefficient.
     /// So L0 layers are held in l0_delta_layers vector, in addition to the R-tree.
-    l0_delta_layers: Vec<Arc<L>>,
+    l0_delta_layers: Vec<Arc<PersistentLayerDesc>>,
+
+    /// Mapping from persistent layer key to the actual layer object. Currently, it stores delta, image, and
+    /// remote layers. In future refactors, this will be eventually moved out of LayerMap into Timeline, and
+    /// RemoteLayer will be removed.
+    mapping: HashMap<PersistentLayerKey, Arc<L>>,
 }
 
 impl<L: ?Sized> Default for LayerMap<L> {
@@ -101,6 +110,7 @@ impl<L: ?Sized> Default for LayerMap<L> {
             frozen_layers: VecDeque::default(),
             l0_delta_layers: Vec::default(),
             historic: BufferedHistoricLayerCoverage::default(),
+            mapping: HashMap::default(),
         }
     }
 }
@@ -125,8 +135,9 @@ where
     ///
     /// Insert an on-disk layer.
     ///
-    pub fn insert_historic(&mut self, layer: Arc<L>) {
-        self.layer_map.insert_historic_noflush(layer)
+    // TODO remove the `layer` argument when `mapping` is refactored out of `LayerMap`
+    pub fn insert_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
+        self.layer_map.insert_historic_noflush(layer_desc, layer)
     }
 
     ///
@@ -134,8 +145,8 @@ where
     ///
     /// This should be called when the corresponding file on disk has been deleted.
     ///
-    pub fn remove_historic(&mut self, layer: Arc<L>) {
-        self.layer_map.remove_historic_noflush(layer)
+    pub fn remove_historic(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
+        self.layer_map.remove_historic_noflush(layer_desc, layer)
     }
 
     /// Replaces existing layer iff it is the `expected`.
@@ -150,12 +161,15 @@ where
     ///      that we can replace values only by updating a hashmap.
     pub fn replace_historic(
         &mut self,
+        expected_desc: PersistentLayerDesc,
         expected: &Arc<L>,
+        new_desc: PersistentLayerDesc,
         new: Arc<L>,
     ) -> anyhow::Result<Replacement<Arc<L>>> {
         fail::fail_point!("layermap-replace-notfound", |_| Ok(Replacement::NotFound));
 
-        self.layer_map.replace_historic_noflush(expected, new)
+        self.layer_map
+            .replace_historic_noflush(expected_desc, expected, new_desc, new)
     }
 
     // We will flush on drop anyway, but this method makes it
@@ -230,6 +244,7 @@ where
             (None, None) => None,
             (None, Some(image)) => {
                 let lsn_floor = image.get_lsn_range().start;
+                let image = self.get_layer_from_mapping(&image.key()).clone();
                 Some(SearchResult {
                     layer: image,
                     lsn_floor,
@@ -237,6 +252,7 @@ where
             }
             (Some(delta), None) => {
                 let lsn_floor = delta.get_lsn_range().start;
+                let delta = self.get_layer_from_mapping(&delta.key()).clone();
                 Some(SearchResult {
                     layer: delta,
                     lsn_floor,
@@ -247,6 +263,7 @@ where
                 let image_is_newer = image.get_lsn_range().end >= delta.get_lsn_range().end;
                 let image_exact_match = img_lsn + 1 == end_lsn;
                 if image_is_newer || image_exact_match {
+                    let image = self.get_layer_from_mapping(&image.key()).clone();
                     Some(SearchResult {
                         layer: image,
                         lsn_floor: img_lsn,
@@ -254,6 +271,7 @@ where
                 } else {
                     let lsn_floor =
                         std::cmp::max(delta.get_lsn_range().start, image.get_lsn_range().start + 1);
+                    let delta = self.get_layer_from_mapping(&delta.key()).clone();
                     Some(SearchResult {
                         layer: delta,
                         lsn_floor,
@@ -273,16 +291,33 @@ where
     ///
     /// Helper function for BatchedUpdates::insert_historic
     ///
-    pub(self) fn insert_historic_noflush(&mut self, layer: Arc<L>) {
+    /// TODO(chi): remove L generic so that we do not need to pass layer object.
+    pub(self) fn insert_historic_noflush(
+        &mut self,
+        layer_desc: PersistentLayerDesc,
+        layer: Arc<L>,
+    ) {
+        self.mapping.insert(layer_desc.key(), layer.clone());
+
         // TODO: See #3869, resulting #4088, attempted fix and repro #4094
-        self.historic.insert(
-            historic_layer_coverage::LayerKey::from(&*layer),
-            Arc::clone(&layer),
-        );
 
         if Self::is_l0(&layer) {
-            self.l0_delta_layers.push(layer);
+            self.l0_delta_layers.push(layer_desc.clone().into());
         }
+
+        self.historic.insert(
+            historic_layer_coverage::LayerKey::from(&*layer),
+            layer_desc.into(),
+        );
+    }
+
+    fn get_layer_from_mapping(&self, key: &PersistentLayerKey) -> &Arc<L> {
+        let layer = self
+            .mapping
+            .get(key)
+            .with_context(|| format!("{key:?}"))
+            .expect("inconsistent layer mapping");
+        layer
     }
 
     ///
@@ -290,14 +325,16 @@ where
     ///
     /// Helper function for BatchedUpdates::remove_historic
     ///
-    pub fn remove_historic_noflush(&mut self, layer: Arc<L>) {
+    pub fn remove_historic_noflush(&mut self, layer_desc: PersistentLayerDesc, layer: Arc<L>) {
         self.historic
             .remove(historic_layer_coverage::LayerKey::from(&*layer));
-
         if Self::is_l0(&layer) {
             let len_before = self.l0_delta_layers.len();
-            self.l0_delta_layers
-                .retain(|other| !Self::compare_arced_layers(other, &layer));
+            let mut l0_delta_layers = std::mem::take(&mut self.l0_delta_layers);
+            l0_delta_layers.retain(|other| {
+                !Self::compare_arced_layers(self.get_layer_from_mapping(&other.key()), &layer)
+            });
+            self.l0_delta_layers = l0_delta_layers;
             // this assertion is related to use of Arc::ptr_eq in Self::compare_arced_layers,
             // there's a chance that the comparison fails at runtime due to it comparing (pointer,
             // vtable) pairs.
@@ -307,11 +344,14 @@ where
                 "failed to locate removed historic layer from l0_delta_layers"
             );
         }
+        self.mapping.remove(&layer_desc.key());
     }
 
     pub(self) fn replace_historic_noflush(
         &mut self,
+        expected_desc: PersistentLayerDesc,
         expected: &Arc<L>,
+        new_desc: PersistentLayerDesc,
         new: Arc<L>,
     ) -> anyhow::Result<Replacement<Arc<L>>> {
         let key = historic_layer_coverage::LayerKey::from(&**expected);
@@ -332,10 +372,9 @@ where
 
         let l0_index = if expected_l0 {
             // find the index in case replace worked, we need to replace that as well
-            let pos = self
-                .l0_delta_layers
-                .iter()
-                .position(|slot| Self::compare_arced_layers(slot, expected));
+            let pos = self.l0_delta_layers.iter().position(|slot| {
+                Self::compare_arced_layers(self.get_layer_from_mapping(&slot.key()), expected)
+            });
 
             if pos.is_none() {
                 return Ok(Replacement::NotFound);
@@ -345,16 +384,28 @@ where
             None
         };
 
-        let replaced = self.historic.replace(&key, new.clone(), |existing| {
-            Self::compare_arced_layers(existing, expected)
+        let new_desc = Arc::new(new_desc);
+        let replaced = self.historic.replace(&key, new_desc.clone(), |existing| {
+            **existing == expected_desc
         });
 
         if let Replacement::Replaced { .. } = &replaced {
+            self.mapping.remove(&expected_desc.key());
+            self.mapping.insert(new_desc.key(), new);
             if let Some(index) = l0_index {
-                self.l0_delta_layers[index] = new;
+                self.l0_delta_layers[index] = new_desc;
             }
         }
 
+        let replaced = match replaced {
+            Replacement::Replaced { in_buffered } => Replacement::Replaced { in_buffered },
+            Replacement::NotFound => Replacement::NotFound,
+            Replacement::RemovalBuffered => Replacement::RemovalBuffered,
+            Replacement::Unexpected(x) => {
+                Replacement::Unexpected(self.get_layer_from_mapping(&x.key()).clone())
+            }
+        };
+
         Ok(replaced)
     }
 
@@ -383,7 +434,7 @@ where
         let start = key.start.to_i128();
         let end = key.end.to_i128();
 
-        let layer_covers = |layer: Option<Arc<L>>| match layer {
+        let layer_covers = |layer: Option<Arc<PersistentLayerDesc>>| match layer {
             Some(layer) => layer.get_lsn_range().start >= lsn.start,
             None => false,
         };
@@ -404,7 +455,9 @@ where
     }
 
     pub fn iter_historic_layers(&self) -> impl '_ + Iterator<Item = Arc<L>> {
-        self.historic.iter()
+        self.historic
+            .iter()
+            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
     }
 
     ///
@@ -436,14 +489,24 @@ where
         // Loop through the change events and push intervals
         for (change_key, change_val) in version.image_coverage.range(start..end) {
             let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
-            coverage.push((kr, current_val.take()));
+            coverage.push((
+                kr,
+                current_val
+                    .take()
+                    .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
+            ));
             current_key = change_key;
             current_val = change_val.clone();
         }
 
         // Add the final interval
         let kr = Key::from_i128(current_key)..Key::from_i128(end);
-        coverage.push((kr, current_val.take()));
+        coverage.push((
+            kr,
+            current_val
+                .take()
+                .map(|l| self.get_layer_from_mapping(&l.key()).clone()),
+        ));
 
         Ok(coverage)
     }
@@ -532,7 +595,9 @@ where
                     let kr = Key::from_i128(current_key)..Key::from_i128(change_key);
                     let lr = lsn.start..val.get_lsn_range().start;
                     if !kr.is_empty() {
-                        let base_count = Self::is_reimage_worthy(&val, key) as usize;
+                        let base_count =
+                            Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
+                                as usize;
                         let new_limit = limit.map(|l| l - base_count);
                         let max_stacked_deltas_underneath =
                             self.count_deltas(&kr, &lr, new_limit)?;
@@ -555,7 +620,9 @@ where
                 let lr = lsn.start..val.get_lsn_range().start;
 
                 if !kr.is_empty() {
-                    let base_count = Self::is_reimage_worthy(&val, key) as usize;
+                    let base_count =
+                        Self::is_reimage_worthy(self.get_layer_from_mapping(&val.key()), key)
+                            as usize;
                     let new_limit = limit.map(|l| l - base_count);
                     let max_stacked_deltas_underneath = self.count_deltas(&kr, &lr, new_limit)?;
                     max_stacked_deltas = std::cmp::max(
@@ -706,7 +773,11 @@ where
 
     /// Return all L0 delta layers
     pub fn get_level0_deltas(&self) -> Result<Vec<Arc<L>>> {
-        Ok(self.l0_delta_layers.clone())
+        Ok(self
+            .l0_delta_layers
+            .iter()
+            .map(|x| self.get_layer_from_mapping(&x.key()).clone())
+            .collect())
     }
 
     /// debugging function to print out the contents of the layer map
@@ -809,12 +880,17 @@ mod tests {
             let layer = LayerDescriptor::from(layer);
 
             // same skeletan construction; see scenario below
-            let not_found: Arc<dyn Layer> = Arc::new(layer.clone());
-            let new_version: Arc<dyn Layer> = Arc::new(layer);
+            let not_found = Arc::new(layer.clone());
+            let new_version = Arc::new(layer);
 
             let mut map = LayerMap::default();
 
-            let res = map.batch_update().replace_historic(&not_found, new_version);
+            let res = map.batch_update().replace_historic(
+                not_found.get_persistent_layer_desc(),
+                &not_found,
+                new_version.get_persistent_layer_desc(),
+                new_version,
+            );
 
             assert!(matches!(res, Ok(Replacement::NotFound)), "{res:?}");
         }
@@ -823,8 +899,8 @@ mod tests {
             let name = LayerFileName::from_str(layer_name).unwrap();
             let skeleton = LayerDescriptor::from(name);
 
-            let remote: Arc<dyn Layer> = Arc::new(skeleton.clone());
-            let downloaded: Arc<dyn Layer> = Arc::new(skeleton);
+            let remote = Arc::new(skeleton.clone());
+            let downloaded = Arc::new(skeleton);
 
             let mut map = LayerMap::default();
 
@@ -834,12 +910,18 @@ mod tests {
 
             let expected_in_counts = (1, usize::from(expected_l0));
 
-            map.batch_update().insert_historic(remote.clone());
+            map.batch_update()
+                .insert_historic(remote.get_persistent_layer_desc(), remote.clone());
             assert_eq!(count_layer_in(&map, &remote), expected_in_counts);
 
             let replaced = map
                 .batch_update()
-                .replace_historic(&remote, downloaded.clone())
+                .replace_historic(
+                    remote.get_persistent_layer_desc(),
+                    &remote,
+                    downloaded.get_persistent_layer_desc(),
+                    downloaded.clone(),
+                )
                 .expect("name derived attributes are the same");
             assert!(
                 matches!(replaced, Replacement::Replaced { .. }),
@@ -847,11 +929,12 @@ mod tests {
             );
             assert_eq!(count_layer_in(&map, &downloaded), expected_in_counts);
 
-            map.batch_update().remove_historic(downloaded.clone());
+            map.batch_update()
+                .remove_historic(downloaded.get_persistent_layer_desc(), downloaded.clone());
             assert_eq!(count_layer_in(&map, &downloaded), (0, 0));
         }
 
-        fn count_layer_in(map: &LayerMap<dyn Layer>, layer: &Arc<dyn Layer>) -> (usize, usize) {
+        fn count_layer_in<L: Layer + ?Sized>(map: &LayerMap<L>, layer: &Arc<L>) -> (usize, usize) {
             let historic = map
                 .iter_historic_layers()
                 .filter(|x| LayerMap::compare_arced_layers(x, layer))
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 7c071463de..6ac4fd9470 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -38,7 +38,7 @@ pub use delta_layer::{DeltaLayer, DeltaLayerWriter};
 pub use filename::{DeltaFileName, ImageFileName, LayerFileName};
 pub use image_layer::{ImageLayer, ImageLayerWriter};
 pub use inmemory_layer::InMemoryLayer;
-pub use layer_desc::PersistentLayerDesc;
+pub use layer_desc::{PersistentLayerDesc, PersistentLayerKey};
 pub use remote_layer::RemoteLayer;
 
 use super::layer_map::BatchedUpdates;
@@ -454,7 +454,9 @@ pub trait PersistentLayer: Layer {
     ///
     /// Should not change over the lifetime of the layer object because
     /// current_physical_size is computed as the som of this value.
-    fn file_size(&self) -> u64;
+    fn file_size(&self) -> u64 {
+        self.layer_desc().file_size
+    }
 
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo;
 
@@ -483,6 +485,20 @@ pub struct LayerDescriptor {
     pub short_id: String,
 }
 
+impl LayerDescriptor {
+    /// `LayerDescriptor` is only used for testing purpose so it does not matter whether it is image / delta,
+    /// and the tenant / timeline id does not matter.
+    pub fn get_persistent_layer_desc(&self) -> PersistentLayerDesc {
+        PersistentLayerDesc::new_delta(
+            TenantId::from_array([0; 16]),
+            TimelineId::from_array([0; 16]),
+            self.key.clone(),
+            self.lsn.clone(),
+            233,
+        )
+    }
+}
+
 impl Layer for LayerDescriptor {
     fn get_key_range(&self) -> Range<Key> {
         self.key.clone()
diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs
index 5f2fb1ebea..624fe8dac4 100644
--- a/pageserver/src/tenant/storage_layer/delta_layer.rs
+++ b/pageserver/src/tenant/storage_layer/delta_layer.rs
@@ -182,8 +182,6 @@ pub struct DeltaLayer {
 
     pub desc: PersistentLayerDesc,
 
-    pub file_size: u64,
-
     access_stats: LayerAccessStats,
 
     inner: RwLock<DeltaLayerInner>,
@@ -196,7 +194,7 @@ impl std::fmt::Debug for DeltaLayer {
         f.debug_struct("DeltaLayer")
             .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
             .field("lsn_range", &self.desc.lsn_range)
-            .field("file_size", &self.file_size)
+            .field("file_size", &self.desc.file_size)
             .field("inner", &self.inner)
             .finish()
     }
@@ -439,10 +437,6 @@ impl PersistentLayer for DeltaLayer {
         Ok(())
     }
 
-    fn file_size(&self) -> u64 {
-        self.file_size
-    }
-
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
         let layer_file_name = self.filename().file_name();
         let lsn_range = self.get_lsn_range();
@@ -451,7 +445,7 @@ impl PersistentLayer for DeltaLayer {
 
         HistoricLayerInfo::Delta {
             layer_file_name,
-            layer_file_size: self.file_size,
+            layer_file_size: self.desc.file_size,
             lsn_start: lsn_range.start,
             lsn_end: lsn_range.end,
             remote: false,
@@ -602,8 +596,8 @@ impl DeltaLayer {
                 timeline_id,
                 filename.key_range.clone(),
                 filename.lsn_range.clone(),
+                file_size,
             ),
-            file_size,
             access_stats,
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
@@ -634,8 +628,8 @@ impl DeltaLayer {
                 summary.timeline_id,
                 summary.key_range,
                 summary.lsn_range,
+                metadata.len(),
             ),
-            file_size: metadata.len(),
             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
@@ -803,8 +797,8 @@ impl DeltaLayerWriterInner {
                 self.timeline_id,
                 self.key_start..key_end,
                 self.lsn_range.clone(),
+                metadata.len(),
             ),
-            file_size: metadata.len(),
             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
             inner: RwLock::new(DeltaLayerInner {
                 loaded: false,
diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs
index b55dd08a6d..07a16a7de2 100644
--- a/pageserver/src/tenant/storage_layer/image_layer.rs
+++ b/pageserver/src/tenant/storage_layer/image_layer.rs
@@ -109,8 +109,6 @@ pub struct ImageLayer {
     // This entry contains an image of all pages as of this LSN, should be the same as desc.lsn
     pub lsn: Lsn,
 
-    pub file_size: u64,
-
     access_stats: LayerAccessStats,
 
     inner: RwLock<ImageLayerInner>,
@@ -122,7 +120,7 @@ impl std::fmt::Debug for ImageLayer {
 
         f.debug_struct("ImageLayer")
             .field("key_range", &RangeDisplayDebug(&self.desc.key_range))
-            .field("file_size", &self.file_size)
+            .field("file_size", &self.desc.file_size)
             .field("lsn", &self.lsn)
             .field("inner", &self.inner)
             .finish()
@@ -258,17 +256,13 @@ impl PersistentLayer for ImageLayer {
         Ok(())
     }
 
-    fn file_size(&self) -> u64 {
-        self.file_size
-    }
-
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
         let layer_file_name = self.filename().file_name();
         let lsn_range = self.get_lsn_range();
 
         HistoricLayerInfo::Image {
             layer_file_name,
-            layer_file_size: self.file_size,
+            layer_file_size: self.desc.file_size,
             lsn_start: lsn_range.start,
             remote: false,
             access_stats: self.access_stats.as_api_model(reset),
@@ -411,9 +405,9 @@ impl ImageLayer {
                 filename.key_range.clone(),
                 filename.lsn,
                 false,
+                file_size,
             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
             lsn: filename.lsn,
-            file_size,
             access_stats,
             inner: RwLock::new(ImageLayerInner {
                 loaded: false,
@@ -443,9 +437,9 @@ impl ImageLayer {
                 summary.key_range,
                 summary.lsn,
                 false,
+                metadata.len(),
             ), // Now we assume image layer ALWAYS covers the full range. This may change in the future.
             lsn: summary.lsn,
-            file_size: metadata.len(),
             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
             inner: RwLock::new(ImageLayerInner {
                 file: None,
@@ -578,14 +572,6 @@ impl ImageLayerWriterInner {
             file.write_all(buf.as_ref())?;
         }
 
-        let desc = PersistentLayerDesc::new_img(
-            self.tenant_id,
-            self.timeline_id,
-            self.key_range.clone(),
-            self.lsn,
-            self.is_incremental, // for now, image layer ALWAYS covers the full range
-        );
-
         // Fill in the summary on blk 0
         let summary = Summary {
             magic: IMAGE_FILE_MAGIC,
@@ -604,6 +590,15 @@ impl ImageLayerWriterInner {
             .metadata()
             .context("get metadata to determine file size")?;
 
+        let desc = PersistentLayerDesc::new_img(
+            self.tenant_id,
+            self.timeline_id,
+            self.key_range.clone(),
+            self.lsn,
+            self.is_incremental, // for now, image layer ALWAYS covers the full range
+            metadata.len(),
+        );
+
         // Note: Because we open the file in write-only mode, we cannot
         // reuse the same VirtualFile for reading later. That's why we don't
         // set inner.file here. The first read will have to re-open it.
@@ -611,7 +606,6 @@ impl ImageLayerWriterInner {
             path_or_conf: PathOrConf::Conf(self.conf),
             desc,
             lsn: self.lsn,
-            file_size: metadata.len(),
             access_stats: LayerAccessStats::empty_will_record_residence_event_later(),
             inner: RwLock::new(ImageLayerInner {
                 loaded: false,
diff --git a/pageserver/src/tenant/storage_layer/layer_desc.rs b/pageserver/src/tenant/storage_layer/layer_desc.rs
index a9859681d3..d1cef70253 100644
--- a/pageserver/src/tenant/storage_layer/layer_desc.rs
+++ b/pageserver/src/tenant/storage_layer/layer_desc.rs
@@ -1,10 +1,11 @@
+use anyhow::Result;
 use std::ops::Range;
 use utils::{
     id::{TenantId, TimelineId},
     lsn::Lsn,
 };
 
-use crate::repository::Key;
+use crate::{context::RequestContext, repository::Key};
 
 use super::{DeltaFileName, ImageFileName, LayerFileName};
 
@@ -24,9 +25,27 @@ pub struct PersistentLayerDesc {
     /// always be equal to `is_delta`. If we land the partial image layer PR someday, image layer could also be
     /// incremental.
     pub is_incremental: bool,
+    /// File size
+    pub file_size: u64,
+}
+
+/// A unique identifier of a persistent layer within the context of one timeline.
+#[derive(Debug, PartialEq, Eq, Clone, Hash)]
+pub struct PersistentLayerKey {
+    pub key_range: Range<Key>,
+    pub lsn_range: Range<Lsn>,
+    pub is_delta: bool,
 }
 
 impl PersistentLayerDesc {
+    pub fn key(&self) -> PersistentLayerKey {
+        PersistentLayerKey {
+            key_range: self.key_range.clone(),
+            lsn_range: self.lsn_range.clone(),
+            is_delta: self.is_delta,
+        }
+    }
+
     pub fn short_id(&self) -> String {
         self.filename().file_name()
     }
@@ -37,6 +56,7 @@ impl PersistentLayerDesc {
         key_range: Range<Key>,
         lsn: Lsn,
         is_incremental: bool,
+        file_size: u64,
     ) -> Self {
         Self {
             tenant_id,
@@ -45,6 +65,7 @@ impl PersistentLayerDesc {
             lsn_range: Self::image_layer_lsn_range(lsn),
             is_delta: false,
             is_incremental,
+            file_size,
         }
     }
 
@@ -53,6 +74,7 @@ impl PersistentLayerDesc {
         timeline_id: TimelineId,
         key_range: Range<Key>,
         lsn_range: Range<Lsn>,
+        file_size: u64,
     ) -> Self {
         Self {
             tenant_id,
@@ -61,6 +83,7 @@ impl PersistentLayerDesc {
             lsn_range,
             is_delta: true,
             is_incremental: true,
+            file_size,
         }
     }
 
@@ -106,4 +129,48 @@ impl PersistentLayerDesc {
             self.image_file_name().into()
         }
     }
+
+    // TODO: remove this in the future once we refactor timeline APIs.
+
+    pub fn get_lsn_range(&self) -> Range<Lsn> {
+        self.lsn_range.clone()
+    }
+
+    pub fn get_key_range(&self) -> Range<Key> {
+        self.key_range.clone()
+    }
+
+    pub fn get_timeline_id(&self) -> TimelineId {
+        self.timeline_id
+    }
+
+    pub fn get_tenant_id(&self) -> TenantId {
+        self.tenant_id
+    }
+
+    pub fn is_incremental(&self) -> bool {
+        self.is_incremental
+    }
+
+    pub fn is_delta(&self) -> bool {
+        self.is_delta
+    }
+
+    pub fn dump(&self, _verbose: bool, _ctx: &RequestContext) -> Result<()> {
+        println!(
+            "----- layer for ten {} tli {} keys {}-{} lsn {}-{} ----",
+            self.tenant_id,
+            self.timeline_id,
+            self.key_range.start,
+            self.key_range.end,
+            self.lsn_range.start,
+            self.lsn_range.end
+        );
+
+        Ok(())
+    }
+
+    pub fn file_size(&self) -> u64 {
+        self.file_size
+    }
 }
diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs
index ff0f44da92..387bae5b1f 100644
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -142,10 +142,6 @@ impl PersistentLayer for RemoteLayer {
         true
     }
 
-    fn file_size(&self) -> u64 {
-        self.layer_metadata.file_size()
-    }
-
     fn info(&self, reset: LayerAccessStatsReset) -> HistoricLayerInfo {
         let layer_file_name = self.filename().file_name();
         let lsn_range = self.get_lsn_range();
@@ -190,6 +186,7 @@ impl RemoteLayer {
                 fname.key_range.clone(),
                 fname.lsn,
                 false,
+                layer_metadata.file_size(),
             ),
             layer_metadata: layer_metadata.clone(),
             ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
@@ -211,6 +208,7 @@ impl RemoteLayer {
                 timelineid,
                 fname.key_range.clone(),
                 fname.lsn_range.clone(),
+                layer_metadata.file_size(),
             ),
             layer_metadata: layer_metadata.clone(),
             ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 507f0de4f3..2a50a26a23 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1211,7 +1211,12 @@ impl Timeline {
             ),
         });
 
-        let replaced = match batch_updates.replace_historic(local_layer, new_remote_layer)? {
+        let replaced = match batch_updates.replace_historic(
+            local_layer.layer_desc().clone(),
+            local_layer,
+            new_remote_layer.layer_desc().clone(),
+            new_remote_layer,
+        )? {
             Replacement::Replaced { .. } => {
                 if let Err(e) = local_layer.delete_resident_layer_file() {
                     error!("failed to remove layer file on evict after replacement: {e:#?}");
@@ -1607,7 +1612,7 @@ impl Timeline {
 
                 trace!("found layer {}", layer.path().display());
                 total_physical_size += file_size;
-                updates.insert_historic(Arc::new(layer));
+                updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer));
                 num_layers += 1;
             } else if let Some(deltafilename) = DeltaFileName::parse_str(&fname) {
                 // Create a DeltaLayer struct for each delta file.
@@ -1639,7 +1644,7 @@ impl Timeline {
 
                 trace!("found layer {}", layer.path().display());
                 total_physical_size += file_size;
-                updates.insert_historic(Arc::new(layer));
+                updates.insert_historic(layer.layer_desc().clone(), Arc::new(layer));
                 num_layers += 1;
             } else if fname == METADATA_FILE_NAME || fname.ends_with(".old") {
                 // ignore these
@@ -1738,7 +1743,7 @@ impl Timeline {
                         anyhow::bail!("could not rename file {local_layer_path:?}: {err:?}");
                     } else {
                         self.metrics.resident_physical_size_gauge.sub(local_size);
-                        updates.remove_historic(local_layer);
+                        updates.remove_historic(local_layer.layer_desc().clone(), local_layer);
                         // fall-through to adding the remote layer
                     }
                 } else {
@@ -1777,7 +1782,7 @@ impl Timeline {
                     );
                     let remote_layer = Arc::new(remote_layer);
 
-                    updates.insert_historic(remote_layer);
+                    updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer);
                 }
                 LayerFileName::Delta(deltafilename) => {
                     // Create a RemoteLayer for the delta file.
@@ -1804,7 +1809,7 @@ impl Timeline {
                         ),
                     );
                     let remote_layer = Arc::new(remote_layer);
-                    updates.insert_historic(remote_layer);
+                    updates.insert_historic(remote_layer.layer_desc().clone(), remote_layer);
                 }
             }
         }
@@ -2252,7 +2257,7 @@ impl Timeline {
         //      won't be needed for page reconstruction for this timeline,
         //      and mark what we can't delete yet as deleted from the layer
         //      map index without actually rebuilding the index.
-        updates.remove_historic(layer);
+        updates.remove_historic(layer.layer_desc().clone(), layer);
 
         Ok(())
     }
@@ -2962,7 +2967,7 @@ impl Timeline {
             LayerResidenceStatus::Resident,
             LayerResidenceEventReason::LayerCreate,
         );
-        batch_updates.insert_historic(l);
+        batch_updates.insert_historic(l.layer_desc().clone(), l);
         batch_updates.flush();
 
         // update the timeline's physical size
@@ -3210,7 +3215,7 @@ impl Timeline {
                 LayerResidenceStatus::Resident,
                 LayerResidenceEventReason::LayerCreate,
             );
-            updates.insert_historic(l);
+            updates.insert_historic(l.layer_desc().clone(), l);
         }
         updates.flush();
         drop(layers);
@@ -3657,7 +3662,7 @@ impl Timeline {
                 LayerResidenceStatus::Resident,
                 LayerResidenceEventReason::LayerCreate,
             );
-            updates.insert_historic(x);
+            updates.insert_historic(x.layer_desc().clone(), x);
         }
 
         // Now that we have reshuffled the data to set of new delta layers, we can
@@ -4192,7 +4197,7 @@ impl Timeline {
                     {
                         use crate::tenant::layer_map::Replacement;
                         let l: Arc<dyn PersistentLayer> = remote_layer.clone();
-                        let failure = match updates.replace_historic(&l, new_layer) {
+                        let failure = match updates.replace_historic(l.layer_desc().clone(), &l, new_layer.layer_desc().clone(), new_layer) {
                             Ok(Replacement::Replaced { .. }) => false,
                             Ok(Replacement::NotFound) => {
                                 // TODO: the downloaded file should probably be removed, otherwise

From 8560a98d6835ee9fd2fa1e2ac0f7c915ff39c333 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Thu, 8 Jun 2023 13:25:30 +0300
Subject: [PATCH 10/14] fix openapi spec to pass swagger editor validation
 (#4445)

There shouldnt be a dash before `type: object`. Also added description.
---
 pageserver/src/http/openapi_spec.yml | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 1f8298ca3e..40e4f035ae 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -386,6 +386,7 @@ paths:
         "202":
           description: Tenant attaching scheduled
         "400":
+          description: Bad Request
           content:
             application/json:
               schema:
@@ -945,7 +946,7 @@ components:
               type: string
               enum: [ "maybe", "attached", "failed" ]
             data:
-            - type: object
+              type: object
               properties:
                 reason:
                   type: string

From d53f9ab3eb246f71a267deaef31eb6d26e6016df Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Thu, 8 Jun 2023 15:01:22 +0300
Subject: [PATCH 11/14] delete timelines from s3 (#4384)

Delete data from s3 when timeline deletion is requested

## Summary of changes

UploadQueue is altered to support scheduling of delete operations in
stopped state. This looks weird, and I'm thinking whether there are
better options/refactorings for upload client to make it look better.

Probably can be part of https://github.com/neondatabase/neon/issues/4378

Deletion is implemented directly in existing endpoint because changes are not
that significant. If we want more safety we can separate those or create
feature flag for new behavior.

resolves [#4193](https://github.com/neondatabase/neon/issues/4193)

---------

Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 .github/workflows/build_and_test.yml          |   2 +-
 libs/pageserver_api/src/models.rs             |   4 +-
 libs/remote_storage/src/local_fs.rs           |  67 ++--
 .../{pagination_tests.rs => test_real_s3.rs}  | 129 +++++--
 libs/utils/src/fs_ext.rs                      |  33 ++
 libs/utils/src/http/error.rs                  |   2 +-
 pageserver/src/http/openapi_spec.yml          |   2 +-
 pageserver/src/http/routes.rs                 |  16 +-
 pageserver/src/task_mgr.rs                    |   3 +
 pageserver/src/tenant.rs                      | 364 +++++++++++-------
 pageserver/src/tenant/mgr.rs                  |   4 +-
 .../src/tenant/remote_timeline_client.rs      | 290 ++++++++++----
 .../tenant/remote_timeline_client/index.rs    |  17 +
 pageserver/src/tenant/timeline.rs             |  45 ++-
 .../walreceiver/connection_manager.rs         |   2 +-
 pageserver/src/tenant/upload_queue.rs         |  37 +-
 test_runner/fixtures/neon_fixtures.py         |   2 +
 test_runner/fixtures/pageserver/utils.py      |  55 ++-
 test_runner/regress/test_import.py            |   6 +
 test_runner/regress/test_remote_storage.py    |  18 +-
 test_runner/regress/test_timeline_delete.py   | 213 +++++++++-
 21 files changed, 984 insertions(+), 327 deletions(-)
 rename libs/remote_storage/tests/{pagination_tests.rs => test_real_s3.rs} (79%)

diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index b732095f8f..897e1a7aad 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -264,7 +264,7 @@ jobs:
           export REMOTE_STORAGE_S3_BUCKET=neon-github-public-dev
           export REMOTE_STORAGE_S3_REGION=eu-central-1
           # Avoid `$CARGO_FEATURES` since there's no `testing` feature in the e2e tests now
-          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test pagination_tests -- s3_pagination_should_work --exact
+          ${cov_prefix} cargo test $CARGO_FLAGS --package remote_storage --test test_real_s3
 
       - name: Install rust binaries
         run: |
diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index ddce82324c..df5f5896a1 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -152,7 +152,7 @@ pub enum ActivatingFrom {
 }
 
 /// A state of a timeline in pageserver's memory.
-#[derive(Debug, Clone, Copy, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
+#[derive(Debug, Clone, PartialEq, Eq, serde::Serialize, serde::Deserialize)]
 pub enum TimelineState {
     /// The timeline is recognized by the pageserver but is not yet operational.
     /// In particular, the walreceiver connection loop is not running for this timeline.
@@ -165,7 +165,7 @@ pub enum TimelineState {
     /// It cannot transition back into any other state.
     Stopping,
     /// The timeline is broken and not operational (previous states: Loading or Active).
-    Broken,
+    Broken { reason: String, backtrace: String },
 }
 
 #[serde_as]
diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index c081a6d361..c73e647845 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -17,7 +17,7 @@ use tokio::{
     io::{self, AsyncReadExt, AsyncSeekExt, AsyncWriteExt},
 };
 use tracing::*;
-use utils::crashsafe::path_with_suffix_extension;
+use utils::{crashsafe::path_with_suffix_extension, fs_ext::is_directory_empty};
 
 use crate::{Download, DownloadError, RemotePath};
 
@@ -101,19 +101,35 @@ impl RemoteStorage for LocalFs {
             Some(prefix) => Cow::Owned(prefix.with_base(&self.storage_root)),
             None => Cow::Borrowed(&self.storage_root),
         };
-        Ok(get_all_files(path.as_ref(), false)
+
+        let prefixes_to_filter = get_all_files(path.as_ref(), false)
             .await
-            .map_err(DownloadError::Other)?
-            .into_iter()
-            .map(|path| {
-                path.strip_prefix(&self.storage_root)
-                    .context("Failed to strip preifix")
+            .map_err(DownloadError::Other)?;
+
+        let mut prefixes = Vec::with_capacity(prefixes_to_filter.len());
+
+        // filter out empty directories to mirror s3 behavior.
+        for prefix in prefixes_to_filter {
+            if prefix.is_dir()
+                && is_directory_empty(&prefix)
+                    .await
+                    .map_err(DownloadError::Other)?
+            {
+                continue;
+            }
+
+            prefixes.push(
+                prefix
+                    .strip_prefix(&self.storage_root)
+                    .context("Failed to strip prefix")
                     .and_then(RemotePath::new)
                     .expect(
                         "We list files for storage root, hence should be able to remote the prefix",
-                    )
-            })
-            .collect())
+                    ),
+            )
+        }
+
+        Ok(prefixes)
     }
 
     async fn upload(
@@ -291,11 +307,18 @@ impl RemoteStorage for LocalFs {
 
     async fn delete(&self, path: &RemotePath) -> anyhow::Result<()> {
         let file_path = path.with_base(&self.storage_root);
-        if file_path.exists() && file_path.is_file() {
-            Ok(fs::remove_file(file_path).await?)
-        } else {
-            bail!("File {file_path:?} either does not exist or is not a file")
+        if !file_path.exists() {
+            // See https://docs.aws.amazon.com/AmazonS3/latest/API/API_DeleteObject.html
+            // > If there isn't a null version, Amazon S3 does not remove any objects but will still respond that the command was successful.
+            return Ok(());
         }
+
+        if !file_path.is_file() {
+            anyhow::bail!("{file_path:?} is not a file");
+        }
+        Ok(fs::remove_file(file_path)
+            .await
+            .map_err(|e| anyhow::anyhow!(e))?)
     }
 }
 
@@ -320,7 +343,7 @@ where
                     let file_type = dir_entry.file_type().await?;
                     let entry_path = dir_entry.path();
                     if file_type.is_symlink() {
-                        debug!("{entry_path:?} us a symlink, skipping")
+                        debug!("{entry_path:?} is a symlink, skipping")
                     } else if file_type.is_dir() {
                         if recursive {
                             paths.extend(get_all_files(&entry_path, true).await?.into_iter())
@@ -595,15 +618,11 @@ mod fs_tests {
         storage.delete(&upload_target).await?;
         assert!(storage.list().await?.is_empty());
 
-        match storage.delete(&upload_target).await {
-            Ok(()) => panic!("Should not allow deleting non-existing storage files"),
-            Err(e) => {
-                let error_string = e.to_string();
-                assert!(error_string.contains("does not exist"));
-                let expected_path = upload_target.with_base(&storage.storage_root);
-                assert!(error_string.contains(expected_path.to_str().unwrap()));
-            }
-        }
+        storage
+            .delete(&upload_target)
+            .await
+            .expect("Should allow deleting non-existing storage files");
+
         Ok(())
     }
 
diff --git a/libs/remote_storage/tests/pagination_tests.rs b/libs/remote_storage/tests/test_real_s3.rs
similarity index 79%
rename from libs/remote_storage/tests/pagination_tests.rs
rename to libs/remote_storage/tests/test_real_s3.rs
index 86a6888f98..48ed8f686c 100644
--- a/libs/remote_storage/tests/pagination_tests.rs
+++ b/libs/remote_storage/tests/test_real_s3.rs
@@ -7,6 +7,7 @@ use std::sync::Arc;
 use std::time::UNIX_EPOCH;
 
 use anyhow::Context;
+use once_cell::sync::OnceCell;
 use remote_storage::{
     GenericRemoteStorage, RemotePath, RemoteStorageConfig, RemoteStorageKind, S3Config,
 };
@@ -14,8 +15,12 @@ use test_context::{test_context, AsyncTestContext};
 use tokio::task::JoinSet;
 use tracing::{debug, error, info};
 
+static LOGGING_DONE: OnceCell<()> = OnceCell::new();
+
 const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE";
 
+const BASE_PREFIX: &str = "test/";
+
 /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries.
 /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified.
 /// See the client creation in [`create_s3_client`] for details on the required env vars.
@@ -38,20 +43,20 @@ const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_
 ///
 /// Lastly, the test attempts to clean up and remove all uploaded S3 files.
 /// If any errors appear during the clean up, they get logged, but the test is not failed or stopped until clean up is finished.
-#[test_context(MaybeEnabledS3)]
+#[test_context(MaybeEnabledS3WithTestBlobs)]
 #[tokio::test]
-async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3WithTestBlobs) -> anyhow::Result<()> {
     let ctx = match ctx {
-        MaybeEnabledS3::Enabled(ctx) => ctx,
-        MaybeEnabledS3::Disabled => return Ok(()),
-        MaybeEnabledS3::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
+        MaybeEnabledS3WithTestBlobs::Enabled(ctx) => ctx,
+        MaybeEnabledS3WithTestBlobs::Disabled => return Ok(()),
+        MaybeEnabledS3WithTestBlobs::UploadsFailed(e, _) => anyhow::bail!("S3 init failed: {e:?}"),
     };
 
-    let test_client = Arc::clone(&ctx.client_with_excessive_pagination);
+    let test_client = Arc::clone(&ctx.enabled.client);
     let expected_remote_prefixes = ctx.remote_prefixes.clone();
 
-    let base_prefix =
-        RemotePath::new(Path::new(ctx.base_prefix_str)).context("common_prefix construction")?;
+    let base_prefix = RemotePath::new(Path::new(ctx.enabled.base_prefix))
+        .context("common_prefix construction")?;
     let root_remote_prefixes = test_client
         .list_prefixes(None)
         .await
@@ -83,27 +88,91 @@ async fn s3_pagination_should_work(ctx: &mut MaybeEnabledS3) -> anyhow::Result<(
     Ok(())
 }
 
+#[test_context(MaybeEnabledS3)]
+#[tokio::test]
+async fn s3_delete_non_exising_works(ctx: &mut MaybeEnabledS3) -> anyhow::Result<()> {
+    let ctx = match ctx {
+        MaybeEnabledS3::Enabled(ctx) => ctx,
+        MaybeEnabledS3::Disabled => return Ok(()),
+    };
+
+    let path = RemotePath::new(&PathBuf::from(format!(
+        "{}/for_sure_there_is_nothing_there_really",
+        ctx.base_prefix,
+    )))
+    .with_context(|| "RemotePath conversion")?;
+
+    ctx.client.delete(&path).await.expect("should succeed");
+
+    Ok(())
+}
+
+fn ensure_logging_ready() {
+    LOGGING_DONE.get_or_init(|| {
+        utils::logging::init(
+            utils::logging::LogFormat::Test,
+            utils::logging::TracingErrorLayerEnablement::Disabled,
+        )
+        .expect("logging init failed");
+    });
+}
+
+struct EnabledS3 {
+    client: Arc<GenericRemoteStorage>,
+    base_prefix: &'static str,
+}
+
+impl EnabledS3 {
+    async fn setup(max_keys_in_list_response: Option<i32>) -> Self {
+        let client = create_s3_client(max_keys_in_list_response)
+            .context("S3 client creation")
+            .expect("S3 client creation failed");
+
+        EnabledS3 {
+            client,
+            base_prefix: BASE_PREFIX,
+        }
+    }
+}
+
 enum MaybeEnabledS3 {
+    Enabled(EnabledS3),
+    Disabled,
+}
+
+#[async_trait::async_trait]
+impl AsyncTestContext for MaybeEnabledS3 {
+    async fn setup() -> Self {
+        ensure_logging_ready();
+
+        if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
+            info!(
+                "`{}` env variable is not set, skipping the test",
+                ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME
+            );
+            return Self::Disabled;
+        }
+
+        Self::Enabled(EnabledS3::setup(None).await)
+    }
+}
+
+enum MaybeEnabledS3WithTestBlobs {
     Enabled(S3WithTestBlobs),
     Disabled,
     UploadsFailed(anyhow::Error, S3WithTestBlobs),
 }
 
 struct S3WithTestBlobs {
-    client_with_excessive_pagination: Arc<GenericRemoteStorage>,
-    base_prefix_str: &'static str,
+    enabled: EnabledS3,
     remote_prefixes: HashSet<RemotePath>,
     remote_blobs: HashSet<RemotePath>,
 }
 
 #[async_trait::async_trait]
-impl AsyncTestContext for MaybeEnabledS3 {
+impl AsyncTestContext for MaybeEnabledS3WithTestBlobs {
     async fn setup() -> Self {
-        utils::logging::init(
-            utils::logging::LogFormat::Test,
-            utils::logging::TracingErrorLayerEnablement::Disabled,
-        )
-        .expect("logging init failed");
+        ensure_logging_ready();
         if env::var(ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME).is_err() {
             info!(
                 "`{}` env variable is not set, skipping the test",
@@ -115,23 +184,14 @@ impl AsyncTestContext for MaybeEnabledS3 {
         let max_keys_in_list_response = 10;
         let upload_tasks_count = 1 + (2 * usize::try_from(max_keys_in_list_response).unwrap());
 
-        let client_with_excessive_pagination = create_s3_client(max_keys_in_list_response)
-            .context("S3 client creation")
-            .expect("S3 client creation failed");
+        let enabled = EnabledS3::setup(Some(max_keys_in_list_response)).await;
 
-        let base_prefix_str = "test/";
-        match upload_s3_data(
-            &client_with_excessive_pagination,
-            base_prefix_str,
-            upload_tasks_count,
-        )
-        .await
-        {
+        match upload_s3_data(&enabled.client, enabled.base_prefix, upload_tasks_count).await {
             ControlFlow::Continue(uploads) => {
                 info!("Remote objects created successfully");
+
                 Self::Enabled(S3WithTestBlobs {
-                    client_with_excessive_pagination,
-                    base_prefix_str,
+                    enabled,
                     remote_prefixes: uploads.prefixes,
                     remote_blobs: uploads.blobs,
                 })
@@ -139,8 +199,7 @@ impl AsyncTestContext for MaybeEnabledS3 {
             ControlFlow::Break(uploads) => Self::UploadsFailed(
                 anyhow::anyhow!("One or multiple blobs failed to upload to S3"),
                 S3WithTestBlobs {
-                    client_with_excessive_pagination,
-                    base_prefix_str,
+                    enabled,
                     remote_prefixes: uploads.prefixes,
                     remote_blobs: uploads.blobs,
                 },
@@ -152,13 +211,15 @@ impl AsyncTestContext for MaybeEnabledS3 {
         match self {
             Self::Disabled => {}
             Self::Enabled(ctx) | Self::UploadsFailed(_, ctx) => {
-                cleanup(&ctx.client_with_excessive_pagination, ctx.remote_blobs).await;
+                cleanup(&ctx.enabled.client, ctx.remote_blobs).await;
             }
         }
     }
 }
 
-fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<GenericRemoteStorage>> {
+fn create_s3_client(
+    max_keys_per_list_response: Option<i32>,
+) -> anyhow::Result<Arc<GenericRemoteStorage>> {
     let remote_storage_s3_bucket = env::var("REMOTE_STORAGE_S3_BUCKET")
         .context("`REMOTE_STORAGE_S3_BUCKET` env var is not set, but real S3 tests are enabled")?;
     let remote_storage_s3_region = env::var("REMOTE_STORAGE_S3_REGION")
@@ -176,7 +237,7 @@ fn create_s3_client(max_keys_per_list_response: i32) -> anyhow::Result<Arc<Gener
             prefix_in_bucket: Some(format!("pagination_should_work_test_{random_prefix_part}/")),
             endpoint: None,
             concurrency_limit: NonZeroUsize::new(100).unwrap(),
-            max_keys_per_list_response: Some(max_keys_per_list_response),
+            max_keys_per_list_response,
         }),
     };
     Ok(Arc::new(
diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs
index d2cb7be816..0ef0464267 100644
--- a/libs/utils/src/fs_ext.rs
+++ b/libs/utils/src/fs_ext.rs
@@ -1,6 +1,8 @@
 /// Extensions to `std::fs` types.
 use std::{fs, io, path::Path};
 
+use anyhow::Context;
+
 pub trait PathExt {
     /// Returns an error if `self` is not a directory.
     fn is_empty_dir(&self) -> io::Result<bool>;
@@ -15,10 +17,19 @@ where
     }
 }
 
+pub async fn is_directory_empty(path: impl AsRef<Path>) -> anyhow::Result<bool> {
+    let mut dir = tokio::fs::read_dir(&path)
+        .await
+        .context(format!("read_dir({})", path.as_ref().display()))?;
+    Ok(dir.next_entry().await?.is_none())
+}
+
 #[cfg(test)]
 mod test {
     use std::path::PathBuf;
 
+    use crate::fs_ext::is_directory_empty;
+
     #[test]
     fn is_empty_dir() {
         use super::PathExt;
@@ -42,4 +53,26 @@ mod test {
         std::fs::remove_file(&file_path).unwrap();
         assert!(file_path.is_empty_dir().is_err());
     }
+
+    #[tokio::test]
+    async fn is_empty_dir_async() {
+        let dir = tempfile::tempdir().unwrap();
+        let dir_path = dir.path();
+
+        // test positive case
+        assert!(
+            is_directory_empty(dir_path).await.expect("test failure"),
+            "new tempdir should be empty"
+        );
+
+        // invoke on a file to ensure it returns an error
+        let file_path: PathBuf = dir_path.join("testfile");
+        let f = std::fs::File::create(&file_path).unwrap();
+        drop(f);
+        assert!(is_directory_empty(&file_path).await.is_err());
+
+        // do it again on a path, we know to be nonexistent
+        std::fs::remove_file(&file_path).unwrap();
+        assert!(is_directory_empty(file_path).await.is_err());
+    }
 }
diff --git a/libs/utils/src/http/error.rs b/libs/utils/src/http/error.rs
index 4eff16b6a3..f9c06453df 100644
--- a/libs/utils/src/http/error.rs
+++ b/libs/utils/src/http/error.rs
@@ -21,7 +21,7 @@ pub enum ApiError {
     Conflict(String),
 
     #[error("Precondition failed: {0}")]
-    PreconditionFailed(&'static str),
+    PreconditionFailed(Box<str>),
 
     #[error(transparent)]
     InternalServerError(anyhow::Error),
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 40e4f035ae..50614653be 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -215,7 +215,7 @@ paths:
               schema:
                 $ref: "#/components/schemas/NotFoundError"
         "412":
-          description: Tenant is missing
+          description: Tenant is missing, or timeline has children
           content:
             application/json:
               schema:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 22dedbe5b2..ac230e5f4a 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -183,9 +183,10 @@ impl From<crate::tenant::DeleteTimelineError> for ApiError {
         use crate::tenant::DeleteTimelineError::*;
         match value {
             NotFound => ApiError::NotFound(anyhow::anyhow!("timeline not found")),
-            HasChildren => ApiError::BadRequest(anyhow::anyhow!(
-                "Cannot delete timeline which has child timelines"
-            )),
+            HasChildren(children) => ApiError::PreconditionFailed(
+                format!("Cannot delete timeline which has child timelines: {children:?}")
+                    .into_boxed_str(),
+            ),
             Other(e) => ApiError::InternalServerError(e),
         }
     }
@@ -197,9 +198,9 @@ impl From<crate::tenant::mgr::DeleteTimelineError> for ApiError {
         match value {
             // Report Precondition failed so client can distinguish between
             // "tenant is missing" case from "timeline is missing"
-            Tenant(GetTenantError::NotFound(..)) => {
-                ApiError::PreconditionFailed("Requested tenant is missing")
-            }
+            Tenant(GetTenantError::NotFound(..)) => ApiError::PreconditionFailed(
+                "Requested tenant is missing".to_owned().into_boxed_str(),
+            ),
             Tenant(t) => ApiError::from(t),
             Timeline(t) => ApiError::from(t),
         }
@@ -494,7 +495,8 @@ async fn timeline_delete_handler(
         .instrument(info_span!("timeline_delete", tenant = %tenant_id, timeline = %timeline_id))
         .await?;
 
-    json_response(StatusCode::OK, ())
+    // FIXME: needs to be an error for console to retry it. Ideally Accepted should be used and retried until 404.
+    json_response(StatusCode::ACCEPTED, ())
 }
 
 async fn tenant_detach_handler(
diff --git a/pageserver/src/task_mgr.rs b/pageserver/src/task_mgr.rs
index 4df0e4e6f2..d8db12a113 100644
--- a/pageserver/src/task_mgr.rs
+++ b/pageserver/src/task_mgr.rs
@@ -257,6 +257,9 @@ pub enum TaskKind {
     // task that handles attaching a tenant
     Attach,
 
+    // Used mostly for background deletion from s3
+    TimelineDeletionWorker,
+
     // task that handhes metrics collection
     MetricsCollection,
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 29086cae86..d23f1cb96f 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -18,6 +18,7 @@ use remote_storage::DownloadError;
 use remote_storage::GenericRemoteStorage;
 use storage_broker::BrokerClientChannel;
 use tokio::sync::watch;
+use tokio::sync::OwnedMutexGuard;
 use tokio::task::JoinSet;
 use tracing::*;
 use utils::completion;
@@ -444,7 +445,7 @@ pub enum DeleteTimelineError {
     #[error("NotFound")]
     NotFound,
     #[error("HasChildren")]
-    HasChildren,
+    HasChildren(Vec<TimelineId>),
     #[error(transparent)]
     Other(#[from] anyhow::Error),
 }
@@ -568,7 +569,7 @@ impl Tenant {
                         .with_context(|| {
                             format!("creating broken timeline data for {tenant_id}/{timeline_id}")
                         })?;
-                    broken_timeline.set_state(TimelineState::Broken);
+                    broken_timeline.set_broken(e.to_string());
                     timelines_accessor.insert(timeline_id, broken_timeline);
                     return Err(e);
                 }
@@ -763,7 +764,7 @@ impl Tenant {
                     );
                     remote_index_and_client.insert(timeline_id, (index_part, client));
                 }
-                MaybeDeletedIndexPart::Deleted => {
+                MaybeDeletedIndexPart::Deleted(_) => {
                     info!("timeline {} is deleted, skipping", timeline_id);
                     continue;
                 }
@@ -1113,9 +1114,9 @@ impl Tenant {
     /// Subroutine of `load_tenant`, to load an individual timeline
     ///
     /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip_all, fields(timeline_id))]
+    #[instrument(skip(self, local_metadata, init_order, ctx))]
     async fn load_local_timeline(
-        &self,
+        self: &Arc<Self>,
         timeline_id: TimelineId,
         local_metadata: TimelineMetadata,
         init_order: Option<&InitializationOrder>,
@@ -1132,12 +1133,20 @@ impl Tenant {
             )
         });
 
-        let remote_startup_data = match &remote_client {
+        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
+            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
+                .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
+            Some(ancestor_timeline)
+        } else {
+            None
+        };
+
+        let (remote_startup_data, remote_client) = match remote_client {
             Some(remote_client) => match remote_client.download_index_file().await {
                 Ok(index_part) => {
                     let index_part = match index_part {
                         MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
-                        MaybeDeletedIndexPart::Deleted => {
+                        MaybeDeletedIndexPart::Deleted(index_part) => {
                             // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation.
                             // Example:
                             //  start deletion operation
@@ -1148,37 +1157,59 @@ impl Tenant {
                             //
                             // We don't really anticipate remote storage to be de-configured, so, for now, this is fine.
                             // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099.
-                            info!("is_deleted is set on remote, resuming removal of local data originally done by timeline deletion handler");
-                            std::fs::remove_dir_all(
-                                self.conf.timeline_path(&timeline_id, &self.tenant_id),
-                            )
-                            .context("remove_dir_all")?;
+                            info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler");
+
+                            remote_client
+                                .init_upload_queue_stopped_to_continue_deletion(&index_part)?;
+
+                            let timeline = self
+                                .create_timeline_data(
+                                    timeline_id,
+                                    &local_metadata,
+                                    ancestor,
+                                    Some(remote_client),
+                                    init_order,
+                                )
+                                .context("create_timeline_data")?;
+
+                            let guard = Arc::clone(&timeline.delete_lock).lock_owned().await;
+
+                            // Note: here we even skip populating layer map. Timeline is essentially uninitialized.
+                            // RemoteTimelineClient is the only functioning part.
+                            timeline.set_state(TimelineState::Stopping);
+                            // We meed to do this because when console retries delete request we shouldnt answer with 404
+                            // because 404 means successful deletion.
+                            // FIXME consider TimelineState::Deleting.
+                            let mut locked = self.timelines.lock().unwrap();
+                            locked.insert(timeline_id, Arc::clone(&timeline));
+
+                            Tenant::schedule_delete_timeline(
+                                Arc::clone(self),
+                                timeline_id,
+                                timeline,
+                                guard,
+                            );
 
                             return Ok(());
                         }
                     };
 
                     let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
-                    Some(RemoteStartupData {
-                        index_part,
-                        remote_metadata,
-                    })
+                    (
+                        Some(RemoteStartupData {
+                            index_part,
+                            remote_metadata,
+                        }),
+                        Some(remote_client),
+                    )
                 }
                 Err(DownloadError::NotFound) => {
                     info!("no index file was found on the remote");
-                    None
+                    (None, Some(remote_client))
                 }
                 Err(e) => return Err(anyhow::anyhow!(e)),
             },
-            None => None,
-        };
-
-        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
-            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
-            .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
-            Some(ancestor_timeline)
-        } else {
-            None
+            None => (None, remote_client),
         };
 
         self.timeline_init_and_sync(
@@ -1511,13 +1542,118 @@ impl Tenant {
     }
 
     /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its
-    /// data from disk.
-    ///
-    /// This doesn't currently delete all data from S3, but sets a flag in its
-    /// index_part.json file to mark it as deleted.
-    pub async fn delete_timeline(
+    /// data from both disk and s3.
+    async fn delete_timeline(
         &self,
         timeline_id: TimelineId,
+        timeline: Arc<Timeline>,
+    ) -> anyhow::Result<()> {
+        {
+            // Grab the layer_removal_cs lock, and actually perform the deletion.
+            //
+            // This lock prevents prevents GC or compaction from running at the same time.
+            // The GC task doesn't register itself with the timeline it's operating on,
+            // so it might still be running even though we called `shutdown_tasks`.
+            //
+            // Note that there are still other race conditions between
+            // GC, compaction and timeline deletion. See
+            // https://github.com/neondatabase/neon/issues/2671
+            //
+            // No timeout here, GC & Compaction should be responsive to the
+            // `TimelineState::Stopping` change.
+            info!("waiting for layer_removal_cs.lock()");
+            let layer_removal_guard = timeline.layer_removal_cs.lock().await;
+            info!("got layer_removal_cs.lock(), deleting layer files");
+
+            // NB: storage_sync upload tasks that reference these layers have been cancelled
+            //     by the caller.
+
+            let local_timeline_directory = self
+                .conf
+                .timeline_path(&timeline.timeline_id, &self.tenant_id);
+
+            fail::fail_point!("timeline-delete-before-rm", |_| {
+                Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
+            });
+
+            // NB: This need not be atomic because the deleted flag in the IndexPart
+            // will be observed during tenant/timeline load. The deletion will be resumed there.
+            //
+            // For configurations without remote storage, we tolerate that we're not crash-safe here.
+            // The timeline may come up Active but with missing layer files, in such setups.
+            // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
+            match std::fs::remove_dir_all(&local_timeline_directory) {
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                    // This can happen if we're called a second time, e.g.,
+                    // because of a previous failure/cancellation at/after
+                    // failpoint timeline-delete-after-rm.
+                    //
+                    // It can also happen if we race with tenant detach, because,
+                    // it doesn't grab the layer_removal_cs lock.
+                    //
+                    // For now, log and continue.
+                    // warn! level is technically not appropriate for the
+                    // first case because we should expect retries to happen.
+                    // But the error is so rare, it seems better to get attention if it happens.
+                    let tenant_state = self.current_state();
+                    warn!(
+                        timeline_dir=?local_timeline_directory,
+                        ?tenant_state,
+                        "timeline directory not found, proceeding anyway"
+                    );
+                    // continue with the rest of the deletion
+                }
+                res => res.with_context(|| {
+                    format!(
+                        "Failed to remove local timeline directory '{}'",
+                        local_timeline_directory.display()
+                    )
+                })?,
+            }
+
+            info!("finished deleting layer files, releasing layer_removal_cs.lock()");
+            drop(layer_removal_guard);
+        }
+
+        fail::fail_point!("timeline-delete-after-rm", |_| {
+            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
+        });
+
+        {
+            // Remove the timeline from the map.
+            let mut timelines = self.timelines.lock().unwrap();
+            let children_exist = timelines
+                .iter()
+                .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+            // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
+            // We already deleted the layer files, so it's probably best to panic.
+            // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
+            if children_exist {
+                panic!("Timeline grew children while we removed layer files");
+            }
+
+            timelines.remove(&timeline_id).expect(
+                "timeline that we were deleting was concurrently removed from 'timelines' map",
+            );
+
+            drop(timelines);
+        }
+
+        let remote_client = match &timeline.remote_client {
+            Some(remote_client) => remote_client,
+            None => return Ok(()),
+        };
+
+        remote_client.delete_all().await?;
+
+        Ok(())
+    }
+
+    /// Removes timeline-related in-memory data and schedules removal from remote storage.
+    #[instrument(skip(self, _ctx))]
+    pub async fn prepare_and_schedule_delete_timeline(
+        self: Arc<Self>,
+        timeline_id: TimelineId,
         _ctx: &RequestContext,
     ) -> Result<(), DeleteTimelineError> {
         timeline::debug_assert_current_span_has_tenant_and_timeline_id();
@@ -1527,18 +1663,25 @@ impl Tenant {
         //
         // Also grab the Timeline's delete_lock to prevent another deletion from starting.
         let timeline;
-        let mut delete_lock_guard;
+        let delete_lock_guard;
         {
             let mut timelines = self.timelines.lock().unwrap();
 
             // Ensure that there are no child timelines **attached to that pageserver**,
             // because detach removes files, which will break child branches
-            let children_exist = timelines
+            let children: Vec<TimelineId> = timelines
                 .iter()
-                .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
+                .filter_map(|(id, entry)| {
+                    if entry.get_ancestor_timeline_id() == Some(timeline_id) {
+                        Some(*id)
+                    } else {
+                        None
+                    }
+                })
+                .collect();
 
-            if children_exist {
-                return Err(DeleteTimelineError::HasChildren);
+            if !children.is_empty() {
+                return Err(DeleteTimelineError::HasChildren(children));
             }
 
             let timeline_entry = match timelines.entry(timeline_id) {
@@ -1553,11 +1696,15 @@ impl Tenant {
             // XXX: We should perhaps return an HTTP "202 Accepted" to signal that the caller
             // needs to poll until the operation has finished. But for now, we return an
             // error, because the control plane knows to retry errors.
-            delete_lock_guard = timeline.delete_lock.try_lock().map_err(|_| {
-                DeleteTimelineError::Other(anyhow::anyhow!(
-                    "timeline deletion is already in progress"
-                ))
-            })?;
+
+            delete_lock_guard =
+                Arc::clone(&timeline.delete_lock)
+                    .try_lock_owned()
+                    .map_err(|_| {
+                        DeleteTimelineError::Other(anyhow::anyhow!(
+                            "timeline deletion is already in progress"
+                        ))
+                    })?;
 
             // If another task finished the deletion just before we acquired the lock,
             // return success.
@@ -1626,102 +1773,43 @@ impl Tenant {
                 }
             }
         }
-
-        {
-            // Grab the layer_removal_cs lock, and actually perform the deletion.
-            //
-            // This lock prevents prevents GC or compaction from running at the same time.
-            // The GC task doesn't register itself with the timeline it's operating on,
-            // so it might still be running even though we called `shutdown_tasks`.
-            //
-            // Note that there are still other race conditions between
-            // GC, compaction and timeline deletion. See
-            // https://github.com/neondatabase/neon/issues/2671
-            //
-            // No timeout here, GC & Compaction should be responsive to the
-            // `TimelineState::Stopping` change.
-            info!("waiting for layer_removal_cs.lock()");
-            let layer_removal_guard = timeline.layer_removal_cs.lock().await;
-            info!("got layer_removal_cs.lock(), deleting layer files");
-
-            // NB: storage_sync upload tasks that reference these layers have been cancelled
-            //     by the caller.
-
-            let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-
-            fail::fail_point!("timeline-delete-before-rm", |_| {
-                Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
-            });
-
-            // NB: This need not be atomic because the deleted flag in the IndexPart
-            // will be observed during tenant/timeline load. The deletion will be resumed there.
-            //
-            // For configurations without remote storage, we tolerate that we're not crash-safe here.
-            // The timeline may come up Active but with missing layer files, in such setups.
-            // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
-            match std::fs::remove_dir_all(&local_timeline_directory) {
-                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
-                    // This can happen if we're called a second time, e.g.,
-                    // because of a previous failure/cancellation at/after
-                    // failpoint timeline-delete-after-rm.
-                    //
-                    // It can also happen if we race with tenant detach, because,
-                    // it doesn't grab the layer_removal_cs lock.
-                    //
-                    // For now, log and continue.
-                    // warn! level is technically not appropriate for the
-                    // first case because we should expect retries to happen.
-                    // But the error is so rare, it seems better to get attention if it happens.
-                    let tenant_state = self.current_state();
-                    warn!(
-                        timeline_dir=?local_timeline_directory,
-                        ?tenant_state,
-                        "timeline directory not found, proceeding anyway"
-                    );
-                    // continue with the rest of the deletion
-                }
-                res => res.with_context(|| {
-                    format!(
-                        "Failed to remove local timeline directory '{}'",
-                        local_timeline_directory.display()
-                    )
-                })?,
-            }
-
-            info!("finished deleting layer files, releasing layer_removal_cs.lock()");
-            drop(layer_removal_guard);
-        }
-
-        fail::fail_point!("timeline-delete-after-rm", |_| {
-            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
-        });
-
-        // Remove the timeline from the map.
-        {
-            let mut timelines = self.timelines.lock().unwrap();
-
-            let children_exist = timelines
-                .iter()
-                .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id));
-            // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`.
-            // We already deleted the layer files, so it's probably best to panic.
-            // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart)
-            if children_exist {
-                panic!("Timeline grew children while we removed layer files");
-            }
-
-            timelines.remove(&timeline_id).expect(
-                "timeline that we were deleting was concurrently removed from 'timelines' map",
-            );
-        }
-
-        // All done! Mark the deletion as completed and release the delete_lock
-        *delete_lock_guard = true;
-        drop(delete_lock_guard);
+        self.schedule_delete_timeline(timeline_id, timeline, delete_lock_guard);
 
         Ok(())
     }
 
+    fn schedule_delete_timeline(
+        self: Arc<Self>,
+        timeline_id: TimelineId,
+        timeline: Arc<Timeline>,
+        _guard: OwnedMutexGuard<bool>,
+    ) {
+        let tenant_id = self.tenant_id;
+        let timeline_clone = Arc::clone(&timeline);
+
+        task_mgr::spawn(
+            task_mgr::BACKGROUND_RUNTIME.handle(),
+            TaskKind::TimelineDeletionWorker,
+            Some(self.tenant_id),
+            Some(timeline_id),
+            "timeline_delete",
+            false,
+            async move {
+                if let Err(err) = self.delete_timeline(timeline_id, timeline).await {
+                    error!("Error: {err:#}");
+                    timeline_clone.set_broken(err.to_string())
+                };
+                Ok(())
+            }
+            .instrument({
+                let span =
+                    tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id);
+                span.follows_from(Span::current());
+                span
+            }),
+        );
+    }
+
     pub fn current_state(&self) -> TenantState {
         self.state.borrow().clone()
     }
@@ -1764,9 +1852,9 @@ impl Tenant {
 
         if activating {
             let timelines_accessor = self.timelines.lock().unwrap();
-            let not_broken_timelines = timelines_accessor
+            let timelines_to_activate = timelines_accessor
                 .values()
-                .filter(|timeline| timeline.current_state() != TimelineState::Broken);
+                .filter(|timeline| !(timeline.is_broken() || timeline.is_stopping()));
 
             // Spawn gc and compaction loops. The loops will shut themselves
             // down when they notice that the tenant is inactive.
@@ -1774,7 +1862,7 @@ impl Tenant {
 
             let mut activated_timelines = 0;
 
-            for timeline in not_broken_timelines {
+            for timeline in timelines_to_activate {
                 timeline.activate(broker_client.clone(), background_jobs_can_start, ctx);
                 activated_timelines += 1;
             }
@@ -1925,7 +2013,7 @@ impl Tenant {
         let timelines_accessor = self.timelines.lock().unwrap();
         let not_broken_timelines = timelines_accessor
             .values()
-            .filter(|timeline| timeline.current_state() != TimelineState::Broken);
+            .filter(|timeline| !timeline.is_broken());
         for timeline in not_broken_timelines {
             timeline.set_state(TimelineState::Stopping);
         }
@@ -3758,7 +3846,7 @@ mod tests {
 
         make_some_layers(newtline.as_ref(), Lsn(0x60)).await?;
 
-        tline.set_state(TimelineState::Broken);
+        tline.set_broken("test".to_owned());
 
         tenant
             .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index a1638e4a95..7e123c3fbd 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -396,7 +396,9 @@ pub async fn delete_timeline(
     ctx: &RequestContext,
 ) -> Result<(), DeleteTimelineError> {
     let tenant = get_tenant(tenant_id, true).await?;
-    tenant.delete_timeline(timeline_id, ctx).await?;
+    tenant
+        .prepare_and_schedule_delete_timeline(timeline_id, ctx)
+        .await?;
     Ok(())
 }
 
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index c4640307d0..2936e7a4af 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -210,13 +210,15 @@ use chrono::{NaiveDateTime, Utc};
 pub use download::{is_temp_download_file, list_remote_timelines};
 use scopeguard::ScopeGuard;
 
+use std::collections::{HashMap, VecDeque};
+use std::path::Path;
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
 
-use remote_storage::{DownloadError, GenericRemoteStorage};
+use remote_storage::{DownloadError, GenericRemoteStorage, RemotePath};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, error, info, warn};
+use tracing::{debug, error, info, instrument, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
 
@@ -225,7 +227,9 @@ use crate::metrics::{
     RemoteTimelineClientMetricsCallTrackSize, REMOTE_ONDEMAND_DOWNLOADED_BYTES,
     REMOTE_ONDEMAND_DOWNLOADED_LAYERS,
 };
+use crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id;
 use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
+use crate::tenant::upload_queue::Delete;
 use crate::{
     config::PageServerConf,
     task_mgr,
@@ -259,7 +263,7 @@ const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 
 pub enum MaybeDeletedIndexPart {
     IndexPart(IndexPart),
-    Deleted,
+    Deleted(IndexPart),
 }
 
 /// Errors that can arise when calling [`RemoteTimelineClient::stop`].
@@ -361,11 +365,42 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Initialize the queue in stopped state. Used in startup path
+    /// to continue deletion operation interrupted by pageserver crash or restart.
+    pub fn init_upload_queue_stopped_to_continue_deletion(
+        &self,
+        index_part: &IndexPart,
+    ) -> anyhow::Result<()> {
+        // FIXME: consider newtype for DeletedIndexPart.
+        let deleted_at = index_part.deleted_at.ok_or(anyhow::anyhow!(
+            "bug: it is responsibility of the caller to provide index part from MaybeDeletedIndexPart::Deleted"
+        ))?;
+
+        {
+            let mut upload_queue = self.upload_queue.lock().unwrap();
+            upload_queue.initialize_with_current_remote_index_part(index_part)?;
+            self.update_remote_physical_size_gauge(Some(index_part));
+        }
+        // also locks upload queue, without dropping the guard above it will be a deadlock
+        self.stop().expect("initialized line above");
+
+        let mut upload_queue = self.upload_queue.lock().unwrap();
+
+        upload_queue
+            .stopped_mut()
+            .expect("stopped above")
+            .deleted_at = SetDeletedFlagProgress::Successful(deleted_at);
+
+        Ok(())
+    }
+
     pub fn last_uploaded_consistent_lsn(&self) -> Option<Lsn> {
         match &*self.upload_queue.lock().unwrap() {
             UploadQueue::Uninitialized => None,
             UploadQueue::Initialized(q) => Some(q.last_uploaded_consistent_lsn),
-            UploadQueue::Stopped(q) => Some(q.last_uploaded_consistent_lsn),
+            UploadQueue::Stopped(q) => {
+                Some(q.upload_queue_for_deletion.last_uploaded_consistent_lsn)
+            }
         }
     }
 
@@ -420,7 +455,7 @@ impl RemoteTimelineClient {
         .await?;
 
         if index_part.deleted_at.is_some() {
-            Ok(MaybeDeletedIndexPart::Deleted)
+            Ok(MaybeDeletedIndexPart::Deleted(index_part))
         } else {
             Ok(MaybeDeletedIndexPart::IndexPart(index_part))
         }
@@ -622,7 +657,11 @@ impl RemoteTimelineClient {
 
             // schedule the actual deletions
             for name in names {
-                let op = UploadOp::Delete(RemoteOpFileKind::Layer, name.clone());
+                let op = UploadOp::Delete(Delete {
+                    file_kind: RemoteOpFileKind::Layer,
+                    layer_file_name: name.clone(),
+                    scheduled_from_timeline_delete: false,
+                });
                 self.calls_unfinished_metric_begin(&op);
                 upload_queue.queued_operations.push_back(op);
                 info!("scheduled layer file deletion {}", name.file_name());
@@ -639,18 +678,11 @@ impl RemoteTimelineClient {
     /// Wait for all previously scheduled uploads/deletions to complete
     ///
     pub async fn wait_completion(self: &Arc<Self>) -> anyhow::Result<()> {
-        let (sender, mut receiver) = tokio::sync::watch::channel(());
-        let barrier_op = UploadOp::Barrier(sender);
-
-        {
+        let mut receiver = {
             let mut guard = self.upload_queue.lock().unwrap();
             let upload_queue = guard.initialized_mut()?;
-            upload_queue.queued_operations.push_back(barrier_op);
-            // Don't count this kind of operation!
-
-            // Launch the task immediately, if possible
-            self.launch_queued_tasks(upload_queue);
-        }
+            self.schedule_barrier(upload_queue)
+        };
 
         if receiver.changed().await.is_err() {
             anyhow::bail!("wait_completion aborted because upload queue was stopped");
@@ -658,6 +690,22 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    fn schedule_barrier(
+        self: &Arc<Self>,
+        upload_queue: &mut UploadQueueInitialized,
+    ) -> tokio::sync::watch::Receiver<()> {
+        let (sender, receiver) = tokio::sync::watch::channel(());
+        let barrier_op = UploadOp::Barrier(sender);
+
+        upload_queue.queued_operations.push_back(barrier_op);
+        // Don't count this kind of operation!
+
+        // Launch the task immediately, if possible
+        self.launch_queued_tasks(upload_queue);
+
+        receiver
+    }
+
     /// Set the deleted_at field in the remote index file.
     ///
     /// This fails if the upload queue has not been `stop()`ed.
@@ -665,6 +713,7 @@ impl RemoteTimelineClient {
     /// The caller is responsible for calling `stop()` AND for waiting
     /// for any ongoing upload tasks to finish after `stop()` has succeeded.
     /// Check method [`RemoteTimelineClient::stop`] for details.
+    #[instrument(skip_all)]
     pub(crate) async fn persist_index_part_with_deleted_flag(
         self: &Arc<Self>,
     ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
@@ -674,15 +723,7 @@ impl RemoteTimelineClient {
             // We must be in stopped state because otherwise
             // we can have inprogress index part upload that can overwrite the file
             // with missing is_deleted flag that we going to set below
-            let stopped = match &mut *locked {
-                UploadQueue::Uninitialized => {
-                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
-                }
-                UploadQueue::Initialized(_) => {
-                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
-                }
-                UploadQueue::Stopped(stopped) => stopped,
-            };
+            let stopped = locked.stopped_mut()?;
 
             match stopped.deleted_at {
                 SetDeletedFlagProgress::NotRunning => (), // proceed
@@ -696,27 +737,17 @@ impl RemoteTimelineClient {
             let deleted_at = Utc::now().naive_utc();
             stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
 
-            let mut index_part = IndexPart::new(
-                stopped.latest_files.clone(),
-                stopped.last_uploaded_consistent_lsn,
-                stopped
-                    .latest_metadata
-                    .to_bytes()
-                    .context("serialize metadata")?,
-            );
+            let mut index_part = IndexPart::try_from(&stopped.upload_queue_for_deletion)
+                .context("IndexPart serialize")?;
             index_part.deleted_at = Some(deleted_at);
             index_part
         };
 
         let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
             let mut locked = self_clone.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
-                    locked.as_str(),
-                ),
-                UploadQueue::Stopped(stopped) => stopped,
-            };
+            let stopped = locked
+                .stopped_mut()
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
             stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
         });
 
@@ -751,13 +782,10 @@ impl RemoteTimelineClient {
         ScopeGuard::into_inner(undo_deleted_at);
         {
             let mut locked = self.upload_queue.lock().unwrap();
-            let stopped = match &mut *locked {
-                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
-                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
-                    locked.as_str(),
-                ),
-                UploadQueue::Stopped(stopped) => stopped,
-            };
+
+            let stopped = locked
+                .stopped_mut()
+                .expect("there's no way out of Stopping, and we checked it's Stopping above");
             stopped.deleted_at = SetDeletedFlagProgress::Successful(
                 index_part_with_deleted_at
                     .deleted_at
@@ -768,6 +796,92 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Prerequisites: UploadQueue should be in stopped state and deleted_at should be successfuly set.
+    /// The function deletes layer files one by one, then lists the prefix to see if we leaked something
+    /// deletes leaked files if any and proceeds with deletion of index file at the end.
+    pub(crate) async fn delete_all(self: &Arc<Self>) -> anyhow::Result<()> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
+        let (mut receiver, deletions_queued) = {
+            let mut deletions_queued = 0;
+
+            let mut locked = self.upload_queue.lock().unwrap();
+            let stopped = locked.stopped_mut()?;
+
+            if !matches!(stopped.deleted_at, SetDeletedFlagProgress::Successful(_)) {
+                anyhow::bail!("deleted_at is not set")
+            }
+
+            debug_assert!(stopped.upload_queue_for_deletion.no_pending_work());
+
+            stopped
+                .upload_queue_for_deletion
+                .queued_operations
+                .reserve(stopped.upload_queue_for_deletion.latest_files.len());
+
+            // schedule the actual deletions
+            for name in stopped.upload_queue_for_deletion.latest_files.keys() {
+                let op = UploadOp::Delete(Delete {
+                    file_kind: RemoteOpFileKind::Layer,
+                    layer_file_name: name.clone(),
+                    scheduled_from_timeline_delete: true,
+                });
+                self.calls_unfinished_metric_begin(&op);
+                stopped
+                    .upload_queue_for_deletion
+                    .queued_operations
+                    .push_back(op);
+
+                info!("scheduled layer file deletion {}", name.file_name());
+                deletions_queued += 1;
+            }
+
+            self.launch_queued_tasks(&mut stopped.upload_queue_for_deletion);
+
+            (
+                self.schedule_barrier(&mut stopped.upload_queue_for_deletion),
+                deletions_queued,
+            )
+        };
+
+        receiver.changed().await?;
+
+        // Do not delete index part yet, it is needed for possible retry. If we remove it first
+        // and retry will arrive to different pageserver there wont be any traces of it on remote storage
+        let timeline_path = self.conf.timeline_path(&self.timeline_id, &self.tenant_id);
+        let timeline_storage_path = self.conf.remote_path(&timeline_path)?;
+
+        let remaining = self
+            .storage_impl
+            .list_prefixes(Some(&timeline_storage_path))
+            .await?;
+
+        let remaining: Vec<RemotePath> = remaining
+            .into_iter()
+            .filter(|p| p.object_name() != Some(IndexPart::FILE_NAME))
+            .collect();
+
+        if !remaining.is_empty() {
+            warn!(
+                "Found {} files not bound to index_file.json, proceeding with their deletion",
+                remaining.len()
+            );
+            for file in remaining {
+                warn!("Removing {}", file.object_name().unwrap_or_default());
+                self.storage_impl.delete(&file).await?;
+            }
+        }
+
+        let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME));
+
+        debug!("deleting index part");
+        self.storage_impl.delete(&index_file_path).await?;
+
+        info!(deletions_queued, "done deleting, including index_part.json");
+
+        Ok(())
+    }
+
     ///
     /// Pick next tasks from the queue, and start as many of them as possible without violating
     /// the ordering constraints.
@@ -786,7 +900,7 @@ impl RemoteTimelineClient {
                     // have finished.
                     upload_queue.inprogress_tasks.is_empty()
                 }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                     // Wait for preceding uploads to finish. Concurrent deletions are OK, though.
                     upload_queue.num_inprogress_deletions == upload_queue.inprogress_tasks.len()
                 }
@@ -817,7 +931,7 @@ impl RemoteTimelineClient {
                 UploadOp::UploadMetadata(_, _) => {
                     upload_queue.num_inprogress_metadata_uploads += 1;
                 }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                     upload_queue.num_inprogress_deletions += 1;
                 }
                 UploadOp::Barrier(sender) => {
@@ -891,7 +1005,6 @@ impl RemoteTimelineClient {
                         unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
                     }
                 }
-                self.calls_unfinished_metric_end(&task.op);
                 return;
             }
 
@@ -937,16 +1050,16 @@ impl RemoteTimelineClient {
                     }
                     res
                 }
-                UploadOp::Delete(metric_file_kind, ref layer_file_name) => {
+                UploadOp::Delete(delete) => {
                     let path = &self
                         .conf
                         .timeline_path(&self.timeline_id, &self.tenant_id)
-                        .join(layer_file_name.file_name());
+                        .join(delete.layer_file_name.file_name());
                     delete::delete_layer(self.conf, &self.storage_impl, path)
                         .measure_remote_op(
                             self.tenant_id,
                             self.timeline_id,
-                            *metric_file_kind,
+                            delete.file_kind,
                             RemoteOpKind::Delete,
                             Arc::clone(&self.metrics),
                         )
@@ -1012,11 +1125,24 @@ impl RemoteTimelineClient {
             let mut upload_queue_guard = self.upload_queue.lock().unwrap();
             let upload_queue = match upload_queue_guard.deref_mut() {
                 UploadQueue::Uninitialized => panic!("callers are responsible for ensuring this is only called on an initialized queue"),
-                UploadQueue::Stopped(_) => {
+                UploadQueue::Stopped(stopped) => {
+                    // Special care is needed for deletions, if it was an earlier deletion (not scheduled from deletion)
+                    // then stop() took care of it so we just return.
+                    // For deletions that come from delete_all we still want to maintain metrics, launch following tasks, etc.
+                    match &task.op {
+                        UploadOp::Delete(delete) if delete.scheduled_from_timeline_delete => Some(&mut stopped.upload_queue_for_deletion),
+                        _ => None
+                    }
+                },
+                UploadQueue::Initialized(qi) => { Some(qi) }
+            };
+
+            let upload_queue = match upload_queue {
+                Some(upload_queue) => upload_queue,
+                None => {
                     info!("another concurrent task already stopped the queue");
                     return;
-                }, // nothing to do
-                UploadQueue::Initialized(qi) => { qi }
+                }
             };
 
             upload_queue.inprogress_tasks.remove(&task.task_id);
@@ -1029,7 +1155,7 @@ impl RemoteTimelineClient {
                     upload_queue.num_inprogress_metadata_uploads -= 1;
                     upload_queue.last_uploaded_consistent_lsn = lsn; // XXX monotonicity check?
                 }
-                UploadOp::Delete(_, _) => {
+                UploadOp::Delete(_) => {
                     upload_queue.num_inprogress_deletions -= 1;
                 }
                 UploadOp::Barrier(_) => unreachable!(),
@@ -1063,8 +1189,8 @@ impl RemoteTimelineClient {
                     reason: "metadata uploads are tiny",
                 },
             ),
-            UploadOp::Delete(file_kind, _) => (
-                *file_kind,
+            UploadOp::Delete(delete) => (
+                delete.file_kind,
                 RemoteOpKind::Delete,
                 DontTrackSize {
                     reason: "should we track deletes? positive or negative sign?",
@@ -1111,32 +1237,36 @@ impl RemoteTimelineClient {
                 info!("another concurrent task already shut down the queue");
                 Ok(())
             }
-            UploadQueue::Initialized(UploadQueueInitialized {
-                latest_files,
-                latest_metadata,
-                last_uploaded_consistent_lsn,
-                ..
-            }) => {
+            UploadQueue::Initialized(initialized) => {
                 info!("shutting down upload queue");
 
                 // Replace the queue with the Stopped state, taking ownership of the old
                 // Initialized queue. We will do some checks on it, and then drop it.
                 let qi = {
-                    // take or clone what we need
-                    let latest_files = std::mem::take(latest_files);
-                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
-                    // this could be Copy
-                    let latest_metadata = latest_metadata.clone();
-
-                    let stopped = UploadQueueStopped {
-                        latest_files,
-                        last_uploaded_consistent_lsn,
-                        latest_metadata,
-                        deleted_at: SetDeletedFlagProgress::NotRunning,
+                    // Here we preserve working version of the upload queue for possible use during deletions.
+                    // In-place replace of Initialized to Stopped can be done with the help of https://github.com/Sgeo/take_mut
+                    // but for this use case it doesnt really makes sense to bring unsafe code only for this usage point.
+                    // Deletion is not really perf sensitive so there shouldnt be any problems with cloning a fraction of it.
+                    let upload_queue_for_deletion = UploadQueueInitialized {
+                        task_counter: 0,
+                        latest_files: initialized.latest_files.clone(),
+                        latest_files_changes_since_metadata_upload_scheduled: 0,
+                        latest_metadata: initialized.latest_metadata.clone(),
+                        last_uploaded_consistent_lsn: initialized.last_uploaded_consistent_lsn,
+                        num_inprogress_layer_uploads: 0,
+                        num_inprogress_metadata_uploads: 0,
+                        num_inprogress_deletions: 0,
+                        inprogress_tasks: HashMap::default(),
+                        queued_operations: VecDeque::default(),
                     };
 
-                    let upload_queue =
-                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
+                    let upload_queue = std::mem::replace(
+                        &mut *guard,
+                        UploadQueue::Stopped(UploadQueueStopped {
+                            upload_queue_for_deletion,
+                            deleted_at: SetDeletedFlagProgress::NotRunning,
+                        }),
+                    );
                     if let UploadQueue::Initialized(qi) = upload_queue {
                         qi
                     } else {
@@ -1144,8 +1274,6 @@ impl RemoteTimelineClient {
                     }
                 };
 
-                assert!(qi.latest_files.is_empty(), "do not use this anymore");
-
                 // consistency check
                 assert_eq!(
                     qi.num_inprogress_layer_uploads
@@ -1408,7 +1536,7 @@ mod tests {
         // Download back the index.json, and check that the list of files is correct
         let index_part = match runtime.block_on(client.download_index_file())? {
             MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
-            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
+            MaybeDeletedIndexPart::Deleted(_) => panic!("unexpectedly got deleted index part"),
         };
 
         assert_file_list(
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 7a06e57a6b..c3f6dcadec 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -7,9 +7,11 @@ use std::collections::{HashMap, HashSet};
 use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
+use utils::bin_ser::SerializeError;
 
 use crate::tenant::metadata::TimelineMetadata;
 use crate::tenant::storage_layer::LayerFileName;
+use crate::tenant::upload_queue::UploadQueueInitialized;
 
 use utils::lsn::Lsn;
 
@@ -115,6 +117,21 @@ impl IndexPart {
     }
 }
 
+impl TryFrom<&UploadQueueInitialized> for IndexPart {
+    type Error = SerializeError;
+
+    fn try_from(upload_queue: &UploadQueueInitialized) -> Result<Self, Self::Error> {
+        let disk_consistent_lsn = upload_queue.latest_metadata.disk_consistent_lsn();
+        let metadata_bytes = upload_queue.latest_metadata.to_bytes()?;
+
+        Ok(Self::new(
+            upload_queue.latest_files.clone(),
+            disk_consistent_lsn,
+            metadata_bytes,
+        ))
+    }
+}
+
 /// Serialized form of [`LayerFileMetadata`].
 #[derive(Debug, PartialEq, Eq, Clone, Serialize, Deserialize, Default)]
 pub struct IndexLayerMetadata {
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 2a50a26a23..71f83bf127 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -239,7 +239,7 @@ pub struct Timeline {
 
     /// Prevent two tasks from deleting the timeline at the same time. If held, the
     /// timeline is being deleted. If 'true', the timeline has already been deleted.
-    pub delete_lock: tokio::sync::Mutex<bool>,
+    pub delete_lock: Arc<tokio::sync::Mutex<bool>>,
 
     eviction_task_timeline_state: tokio::sync::Mutex<EvictionTaskTimelineState>,
 
@@ -815,8 +815,7 @@ impl Timeline {
         // above. Rewrite it.
         let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
         // Is the timeline being deleted?
-        let state = *self.state.borrow();
-        if state == TimelineState::Stopping {
+        if self.is_stopping() {
             return Err(anyhow::anyhow!("timeline is Stopping").into());
         }
 
@@ -955,14 +954,17 @@ impl Timeline {
             (st, TimelineState::Loading) => {
                 error!("ignoring transition from {st:?} into Loading state");
             }
-            (TimelineState::Broken, _) => {
-                error!("Ignoring state update {new_state:?} for broken tenant");
+            (TimelineState::Broken { .. }, new_state) => {
+                error!("Ignoring state update {new_state:?} for broken timeline");
             }
             (TimelineState::Stopping, TimelineState::Active) => {
                 error!("Not activating a Stopping timeline");
             }
             (_, new_state) => {
-                if matches!(new_state, TimelineState::Stopping | TimelineState::Broken) {
+                if matches!(
+                    new_state,
+                    TimelineState::Stopping | TimelineState::Broken { .. }
+                ) {
                     // drop the copmletion guard, if any; it might be holding off the completion
                     // forever needlessly
                     self.initial_logical_size_attempt
@@ -975,14 +977,31 @@ impl Timeline {
         }
     }
 
+    pub fn set_broken(&self, reason: String) {
+        let backtrace_str: String = format!("{}", std::backtrace::Backtrace::force_capture());
+        let broken_state = TimelineState::Broken {
+            reason,
+            backtrace: backtrace_str,
+        };
+        self.set_state(broken_state)
+    }
+
     pub fn current_state(&self) -> TimelineState {
-        *self.state.borrow()
+        self.state.borrow().clone()
+    }
+
+    pub fn is_broken(&self) -> bool {
+        matches!(&*self.state.borrow(), TimelineState::Broken { .. })
     }
 
     pub fn is_active(&self) -> bool {
         self.current_state() == TimelineState::Active
     }
 
+    pub fn is_stopping(&self) -> bool {
+        self.current_state() == TimelineState::Stopping
+    }
+
     pub fn subscribe_for_state_updates(&self) -> watch::Receiver<TimelineState> {
         self.state.subscribe()
     }
@@ -993,7 +1012,7 @@ impl Timeline {
     ) -> Result<(), TimelineState> {
         let mut receiver = self.state.subscribe();
         loop {
-            let current_state = *receiver.borrow_and_update();
+            let current_state = receiver.borrow().clone();
             match current_state {
                 TimelineState::Loading => {
                     receiver
@@ -1460,7 +1479,7 @@ impl Timeline {
                 eviction_task_timeline_state: tokio::sync::Mutex::new(
                     EvictionTaskTimelineState::default(),
                 ),
-                delete_lock: tokio::sync::Mutex::new(false),
+                delete_lock: Arc::new(tokio::sync::Mutex::new(false)),
 
                 initial_logical_size_can_start,
                 initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt),
@@ -2101,11 +2120,11 @@ impl Timeline {
             loop {
                 match timeline_state_updates.changed().await {
                     Ok(()) => {
-                        let new_state = *timeline_state_updates.borrow();
+                        let new_state = timeline_state_updates.borrow().clone();
                         match new_state {
                             // we're running this job for active timelines only
                             TimelineState::Active => continue,
-                            TimelineState::Broken
+                            TimelineState::Broken { .. }
                             | TimelineState::Stopping
                             | TimelineState::Loading => {
                                 break format!("aborted because timeline became inactive (new state: {new_state:?})")
@@ -3792,9 +3811,7 @@ impl Timeline {
 
         let layer_removal_cs = Arc::new(self.layer_removal_cs.clone().lock_owned().await);
         // Is the timeline being deleted?
-        let state = *self.state.borrow();
-        if state == TimelineState::Stopping {
-            // there's a global allowed_error for this
+        if self.is_stopping() {
             anyhow::bail!("timeline is Stopping");
         }
 
diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
index e235fab425..a5d0af32fe 100644
--- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
+++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs
@@ -153,7 +153,7 @@ pub(super) async fn connection_manager_loop_step(
                             match new_state {
                                 // we're already active as walreceiver, no need to reactivate
                                 TimelineState::Active => continue,
-                                TimelineState::Broken | TimelineState::Stopping => {
+                                TimelineState::Broken { .. } | TimelineState::Stopping => {
                                     debug!("timeline entered terminal state {new_state:?}, stopping wal connection manager loop");
                                     return ControlFlow::Break(());
                                 }
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 8f5faff627..a62cc99adf 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -76,6 +76,12 @@ pub(crate) struct UploadQueueInitialized {
     pub(crate) queued_operations: VecDeque<UploadOp>,
 }
 
+impl UploadQueueInitialized {
+    pub(super) fn no_pending_work(&self) -> bool {
+        self.inprogress_tasks.is_empty() && self.queued_operations.is_empty()
+    }
+}
+
 #[derive(Clone, Copy)]
 pub(super) enum SetDeletedFlagProgress {
     NotRunning,
@@ -84,9 +90,7 @@ pub(super) enum SetDeletedFlagProgress {
 }
 
 pub(super) struct UploadQueueStopped {
-    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
-    pub(super) last_uploaded_consistent_lsn: Lsn,
-    pub(super) latest_metadata: TimelineMetadata,
+    pub(super) upload_queue_for_deletion: UploadQueueInitialized,
     pub(super) deleted_at: SetDeletedFlagProgress,
 }
 
@@ -187,6 +191,15 @@ impl UploadQueue {
             UploadQueue::Initialized(x) => Ok(x),
         }
     }
+
+    pub(crate) fn stopped_mut(&mut self) -> anyhow::Result<&mut UploadQueueStopped> {
+        match self {
+            UploadQueue::Initialized(_) | UploadQueue::Uninitialized => {
+                anyhow::bail!("queue is in state {}", self.as_str())
+            }
+            UploadQueue::Stopped(stopped) => Ok(stopped),
+        }
+    }
 }
 
 /// An in-progress upload or delete task.
@@ -199,6 +212,13 @@ pub(crate) struct UploadTask {
     pub(crate) op: UploadOp,
 }
 
+#[derive(Debug)]
+pub(crate) struct Delete {
+    pub(crate) file_kind: RemoteOpFileKind,
+    pub(crate) layer_file_name: LayerFileName,
+    pub(crate) scheduled_from_timeline_delete: bool,
+}
+
 #[derive(Debug)]
 pub(crate) enum UploadOp {
     /// Upload a layer file
@@ -207,8 +227,8 @@ pub(crate) enum UploadOp {
     /// Upload the metadata file
     UploadMetadata(IndexPart, Lsn),
 
-    /// Delete a file.
-    Delete(RemoteOpFileKind, LayerFileName),
+    /// Delete a layer file
+    Delete(Delete),
 
     /// Barrier. When the barrier operation is reached,
     Barrier(tokio::sync::watch::Sender<()>),
@@ -226,7 +246,12 @@ impl std::fmt::Display for UploadOp {
                 )
             }
             UploadOp::UploadMetadata(_, lsn) => write!(f, "UploadMetadata(lsn: {})", lsn),
-            UploadOp::Delete(_, path) => write!(f, "Delete({})", path.file_name()),
+            UploadOp::Delete(delete) => write!(
+                f,
+                "Delete(path: {}, scheduled_from_timeline_delete: {})",
+                delete.layer_file_name.file_name(),
+                delete.scheduled_from_timeline_delete
+            ),
             UploadOp::Barrier(_) => write!(f, "Barrier"),
         }
     }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 551faa116e..a8610e24df 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -663,6 +663,8 @@ class NeonEnvBuilder:
         else:
             raise RuntimeError(f"Unknown storage type: {remote_storage_kind}")
 
+        self.remote_storage_kind = remote_storage_kind
+
     def enable_local_fs_remote_storage(self, force_enable: bool = True):
         """
         Sets up the pageserver to use the local fs at the `test_dir/local_fs_remote_storage` path.
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index d7ffa633fd..83880abc77 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -2,7 +2,7 @@ import time
 from typing import Any, Dict, Optional
 
 from fixtures.log_helper import log
-from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.types import Lsn, TenantId, TimelineId
 
 
@@ -92,6 +92,41 @@ def wait_until_tenant_state(
     )
 
 
+def wait_until_timeline_state(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+    expected_state: str,
+    iterations: int,
+    period: float = 1.0,
+) -> Dict[str, Any]:
+    """
+    Does not use `wait_until` for debugging purposes
+    """
+    for i in range(iterations):
+        try:
+            timeline = pageserver_http.timeline_detail(tenant_id=tenant_id, timeline_id=timeline_id)
+            log.debug(f"Timeline {tenant_id}/{timeline_id} data: {timeline}")
+            if isinstance(timeline["state"], str):
+                if timeline["state"] == expected_state:
+                    return timeline
+            elif isinstance(timeline, Dict):
+                if timeline["state"].get(expected_state):
+                    return timeline
+
+        except Exception as e:
+            log.debug(f"Timeline {tenant_id}/{timeline_id} state retrieval failure: {e}")
+
+        if i == iterations - 1:
+            # do not sleep last time, we already know that we failed
+            break
+        time.sleep(period)
+
+    raise Exception(
+        f"Timeline {tenant_id}/{timeline_id} did not become {expected_state} within {iterations * period} seconds"
+    )
+
+
 def wait_until_tenant_active(
     pageserver_http: PageserverHttpClient,
     tenant_id: TenantId,
@@ -156,3 +191,21 @@ def wait_for_upload_queue_empty(
         if all(m.value == 0 for m in tl):
             return
         time.sleep(0.2)
+
+
+def assert_timeline_detail_404(
+    pageserver_http: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+):
+    """Asserts that timeline_detail returns 404, or dumps the detail."""
+    try:
+        data = pageserver_http.timeline_detail(tenant_id, timeline_id)
+        log.error(f"detail {data}")
+    except PageserverApiException as e:
+        log.error(e)
+        if e.status_code == 404:
+            return
+        else:
+            raise
+    raise Exception("detail succeeded (it should return 404)")
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 77030288f0..5c3948b027 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -79,6 +79,7 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     # Set up pageserver for import
     neon_env_builder.enable_local_fs_remote_storage()
     env = neon_env_builder.init_start()
+
     client = env.pageserver.http_client()
     client.tenant_create(tenant)
 
@@ -145,6 +146,11 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     )
 
     # NOTE: delete can easily come before upload operations are completed
+    # https://github.com/neondatabase/neon/issues/4326
+    env.pageserver.allowed_errors.append(
+        ".*files not bound to index_file.json, proceeding with their deletion.*"
+    )
+
     client.timeline_delete(tenant, timeline)
 
     # Importing correct backup works
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 742dbfff95..11ac9e2555 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -20,6 +20,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
 from fixtures.pageserver.utils import (
+    assert_timeline_detail_404,
     wait_for_last_record_lsn,
     wait_for_upload,
     wait_until_tenant_active,
@@ -182,7 +183,7 @@ def test_remote_storage_backup_and_restore(
     wait_until_tenant_active(
         pageserver_http=client,
         tenant_id=tenant_id,
-        iterations=5,
+        iterations=10,  # make it longer for real_s3 tests when unreliable wrapper is involved
     )
 
     detail = client.timeline_detail(tenant_id, timeline_id)
@@ -598,8 +599,23 @@ def test_timeline_deletion_with_files_stuck_in_upload_queue(
     )
     client.timeline_delete(tenant_id, timeline_id)
 
+    env.pageserver.allowed_errors.append(f".*Timeline {tenant_id}/{timeline_id} was not found.*")
+    env.pageserver.allowed_errors.append(
+        ".*files not bound to index_file.json, proceeding with their deletion.*"
+    )
+
+    wait_until(2, 0.5, lambda: assert_timeline_detail_404(client, tenant_id, timeline_id))
+
     assert not timeline_path.exists()
 
+    # to please mypy
+    assert isinstance(env.remote_storage, LocalFsStorage)
+    remote_timeline_path = (
+        env.remote_storage.root / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
+    )
+
+    assert not list(remote_timeline_path.iterdir())
+
     # timeline deletion should kill ongoing uploads, so, the metric will be gone
     assert get_queued_count(file_kind="index", op_kind="upload") is None
 
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index be79538843..28b15d03ca 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -3,6 +3,7 @@ import queue
 import shutil
 import threading
 from pathlib import Path
+from typing import Optional
 
 import pytest
 import requests
@@ -11,13 +12,16 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     RemoteStorageKind,
+    S3Storage,
     available_remote_storages,
 )
 from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
+    assert_timeline_detail_404,
     wait_for_last_record_lsn,
     wait_for_upload,
     wait_until_tenant_active,
+    wait_until_timeline_state,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import query_scalar, wait_until
@@ -68,7 +72,7 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
 
         ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)
 
-    assert exc.value.status_code == 400
+    assert exc.value.status_code == 412
 
     timeline_path = (
         env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
@@ -130,13 +134,25 @@ def test_delete_timeline_post_rm_failure(
     env = neon_env_builder.init_start()
     assert env.initial_timeline
 
+    env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm")
+    env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline")
+
     ps_http = env.pageserver.http_client()
 
     failpoint_name = "timeline-delete-after-rm"
     ps_http.configure_failpoints((failpoint_name, "return"))
 
-    with pytest.raises(PageserverApiException, match=f"failpoint: {failpoint_name}"):
-        ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
+    ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
+
+    timeline_info = wait_until_timeline_state(
+        pageserver_http=ps_http,
+        tenant_id=env.initial_tenant,
+        timeline_id=env.initial_timeline,
+        expected_state="Broken",
+        iterations=2,  # effectively try immediately and retry once in one second
+    )
+
+    timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"
 
     at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
     env.pageserver.allowed_errors.append(at_failpoint_log_message)
@@ -148,11 +164,14 @@ def test_delete_timeline_post_rm_failure(
     ps_http.configure_failpoints((failpoint_name, "off"))
 
     # this should succeed
+    # this also checks that delete can be retried even when timeline is in Broken state
     ps_http.timeline_delete(env.initial_tenant, env.initial_timeline, timeout=2)
-    # the second call will try to transition the timeline into Stopping state, but it's already in that state
-    env.pageserver.allowed_errors.append(
-        f".*{env.initial_timeline}.*Ignoring new state, equal to the existing one: Stopping"
-    )
+    with pytest.raises(PageserverApiException) as e:
+        ps_http.timeline_detail(env.initial_tenant, env.initial_timeline)
+
+    assert e.value.status_code == 404
+
+    env.pageserver.allowed_errors.append(f".*NotFound: Timeline.*{env.initial_timeline}.*")
     env.pageserver.allowed_errors.append(
         f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
     )
@@ -230,6 +249,12 @@ def test_timeline_resurrection_on_attach(
     # delete new timeline
     ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=branch_timeline_id)
 
+    env.pageserver.allowed_errors.append(
+        f".*Timeline {tenant_id}/{branch_timeline_id} was not found.*"
+    )
+
+    wait_until(2, 0.5, lambda: assert_timeline_detail_404(ps_http, tenant_id, branch_timeline_id))
+
     ##### Stop the pageserver instance, erase all its data
     env.endpoints.stop_all()
     env.pageserver.stop()
@@ -252,12 +277,31 @@ def test_timeline_resurrection_on_attach(
     assert all([tl["state"] == "Active" for tl in timelines])
 
 
+def assert_prefix_empty(neon_env_builder: NeonEnvBuilder, prefix: Optional[str] = None):
+    # For local_fs we need to properly handle empty directories, which we currently dont, so for simplicity stick to s3 api.
+    assert neon_env_builder.remote_storage_kind in (
+        RemoteStorageKind.MOCK_S3,
+        RemoteStorageKind.REAL_S3,
+    )
+    # For mypy
+    assert isinstance(neon_env_builder.remote_storage, S3Storage)
+
+    # Note that this doesnt use pagination, so list is not guaranteed to be exhaustive.
+    response = neon_env_builder.remote_storage_client.list_objects_v2(
+        Bucket=neon_env_builder.remote_storage.bucket_name,
+        Prefix=prefix or neon_env_builder.remote_storage.prefix_in_bucket or "",
+    )
+    objects = response.get("Contents")
+    assert (
+        response["KeyCount"] == 0
+    ), f"remote dir with prefix {prefix} is not empty after deletion: {objects}"
+
+
 def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder):
     """
     When deleting a timeline, if we succeed in setting the deleted flag remotely
     but fail to delete the local state, restarting the pageserver should resume
     the deletion of the local state.
-    (Deletion of the state in S3 is not implemented yet.)
     """
 
     neon_env_builder.enable_remote_storage(
@@ -293,11 +337,17 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
         env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
     )
 
-    with pytest.raises(
-        PageserverApiException,
-        match="failpoint: timeline-delete-before-rm",
-    ):
-        ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
+    ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
+
+    timeline_info = wait_until_timeline_state(
+        pageserver_http=ps_http,
+        tenant_id=env.initial_tenant,
+        timeline_id=leaf_timeline_id,
+        expected_state="Broken",
+        iterations=2,  # effectively try immediately and retry once in one second
+    )
+
+    timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm"
 
     assert leaf_timeline_path.exists(), "the failpoint didn't work"
 
@@ -305,7 +355,14 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
     env.pageserver.start()
 
     # Wait for tenant to finish loading.
-    wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=0.5)
+    wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=1)
+
+    env.pageserver.allowed_errors.append(
+        f".*Timeline {env.initial_tenant}/{leaf_timeline_id} was not found.*"
+    )
+    wait_until(
+        2, 0.5, lambda: assert_timeline_detail_404(ps_http, env.initial_tenant, leaf_timeline_id)
+    )
 
     assert (
         not leaf_timeline_path.exists()
@@ -317,6 +374,50 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild
     }, "other timelines should not have been affected"
     assert all([tl["state"] == "Active" for tl in timelines])
 
+    assert_prefix_empty(
+        neon_env_builder,
+        prefix="/".join(
+            (
+                "tenants",
+                str(env.initial_tenant),
+                "timelines",
+                str(leaf_timeline_id),
+            )
+        ),
+    )
+
+    assert env.initial_timeline is not None
+
+    for timeline_id in (intermediate_timeline_id, env.initial_timeline):
+        ps_http.timeline_delete(env.initial_tenant, timeline_id)
+
+        env.pageserver.allowed_errors.append(
+            f".*Timeline {env.initial_tenant}/{timeline_id} was not found.*"
+        )
+        wait_until(
+            2, 0.5, lambda: assert_timeline_detail_404(ps_http, env.initial_tenant, timeline_id)
+        )
+
+        assert_prefix_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )
+
+    # for some reason the check above doesnt immediately take effect for the below.
+    # Assume it is mock server incosistency and check twice.
+    wait_until(
+        2,
+        0.5,
+        lambda: assert_prefix_empty(neon_env_builder),
+    )
+
 
 def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
     neon_env_builder: NeonEnvBuilder,
@@ -457,3 +558,87 @@ def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
         ps_http.timeline_detail(env.initial_tenant, child_timeline_id)
 
     assert exc.value.status_code == 404
+
+
+@pytest.mark.parametrize(
+    "remote_storage_kind",
+    list(
+        filter(
+            lambda s: s in (RemoteStorageKind.MOCK_S3, RemoteStorageKind.REAL_S3),
+            available_remote_storages(),
+        )
+    ),
+)
+def test_timeline_delete_works_for_remote_smoke(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_timeline_delete_works_for_remote_smoke",
+    )
+
+    env = neon_env_builder.init_start()
+
+    ps_http = env.pageserver.http_client()
+    pg = env.endpoints.create_start("main")
+
+    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+    assert tenant_id == env.initial_tenant
+    assert main_timeline_id == env.initial_timeline
+
+    timeline_ids = [env.initial_timeline]
+    for i in range(2):
+        branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main")
+        pg = env.endpoints.create_start(f"new{i}")
+
+        with pg.cursor() as cur:
+            cur.execute("CREATE TABLE f (i integer);")
+            cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+            current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+            # wait until pageserver receives that data
+            wait_for_last_record_lsn(ps_http, tenant_id, branch_timeline_id, current_lsn)
+
+            # run checkpoint manually to be sure that data landed in remote storage
+            ps_http.timeline_checkpoint(tenant_id, branch_timeline_id)
+
+            # wait until pageserver successfully uploaded a checkpoint to remote storage
+            log.info("waiting for checkpoint upload")
+            wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn)
+            log.info("upload of checkpoint is done")
+            timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+        timeline_ids.append(timeline_id)
+
+    for timeline_id in reversed(timeline_ids):
+        # note that we need to finish previous deletion before scheduling next one
+        # otherwise we can get an "HasChildren" error if deletion is not fast enough (real_s3)
+        ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id)
+
+        env.pageserver.allowed_errors.append(
+            f".*Timeline {env.initial_tenant}/{timeline_id} was not found.*"
+        )
+        wait_until(2, 0.5, lambda: assert_timeline_detail_404(ps_http, tenant_id, timeline_id))
+
+        assert_prefix_empty(
+            neon_env_builder,
+            prefix="/".join(
+                (
+                    "tenants",
+                    str(env.initial_tenant),
+                    "timelines",
+                    str(timeline_id),
+                )
+            ),
+        )
+
+    # for some reason the check above doesnt immediately take effect for the below.
+    # Assume it is mock server incosistency and check twice.
+    wait_until(
+        2,
+        0.5,
+        lambda: assert_prefix_empty(neon_env_builder),
+    )

From d73639646efe894362bdf2d187f4a6b28e903434 Mon Sep 17 00:00:00 2001
From: Stas Kelvich <stas.kelvich@gmail.com>
Date: Mon, 5 Jun 2023 15:32:05 +0300
Subject: [PATCH 12/14] Add more output options to proxy json endpoint

With this commit client can pass following optional headers:

`Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans, nulls and arrays will be returned as text. That can be useful in cases when client code wants to implement it's own parsing or reuse parsing libraries from e.g. node-postgres.

`Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is more compact representation and also helps in some edge
cases where it is hard to use rows represented as objects (e.g. when several fields have the same name).
---
 proxy/README.md                   |  9 +++++
 proxy/src/http/sql_over_http.rs   | 55 +++++++++++++++++++++++--------
 test_runner/regress/test_proxy.py | 34 +++++++++++++++++++
 3 files changed, 84 insertions(+), 14 deletions(-)

diff --git a/proxy/README.md b/proxy/README.md
index cd76a2443f..d1f2e3f27b 100644
--- a/proxy/README.md
+++ b/proxy/README.md
@@ -93,6 +93,15 @@ With the current approach we made the following design decisions:
    and column oids. Command tag capturing was added to the rust-postgres
    functionality as part of this change.
 
+### Output options
+
+User can pass several optional headers that will affect resulting json.
+
+1. `Neon-Raw-Text-Output: true`. Return postgres values as text, without parsing them. So numbers, objects, booleans, nulls and arrays will be returned as text. That can be useful in cases when client code wants to implement it's own parsing or reuse parsing libraries from e.g. node-postgres.
+2. `Neon-Array-Mode: true`. Return postgres rows as arrays instead of objects. That is more compact representation and also helps in some edge
+cases where it is hard to use rows represented as objects (e.g. when several fields have the same name).
+
+
 ## Using SNI-based routing on localhost
 
 Now proxy determines project name from the subdomain, request to the `round-rice-566201.somedomain.tld` will be routed to the project named `round-rice-566201`. Unfortunately, `/etc/hosts` does not support domain wildcards, so I usually use `*.localtest.me` which resolves to `127.0.0.1`. Now we can create self-signed certificate and play with proxy:
diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs
index 0438a82c12..417af1e531 100644
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -1,6 +1,8 @@
 use futures::pin_mut;
 use futures::StreamExt;
 use hyper::body::HttpBody;
+use hyper::http::HeaderName;
+use hyper::http::HeaderValue;
 use hyper::{Body, HeaderMap, Request};
 use pq_proto::StartupMessageParams;
 use serde_json::json;
@@ -23,6 +25,10 @@ const APP_NAME: &str = "sql_over_http";
 const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB
 const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB
 
+static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output");
+static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode");
+static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
+
 //
 // Convert json non-string types to strings, so that they can be passed to Postgres
 // as parameters.
@@ -158,6 +164,11 @@ pub async fn handle(
         ("application_name", APP_NAME),
     ]);
 
+    // Determine the output options. Default behaviour is 'false'. Anything that is not
+    // strictly 'true' assumed to be false.
+    let raw_output = headers.get(&RAW_TEXT_OUTPUT) == Some(&HEADER_VALUE_TRUE);
+    let array_mode = headers.get(&ARRAY_MODE) == Some(&HEADER_VALUE_TRUE);
+
     //
     // Wake up the destination if needed. Code here is a bit involved because
     // we reuse the code from the usual proxy and we need to prepare few structures
@@ -272,7 +283,7 @@ pub async fn handle(
     // convert rows to JSON
     let rows = rows
         .iter()
-        .map(pg_text_row_to_json)
+        .map(|row| pg_text_row_to_json(row, raw_output, array_mode))
         .collect::<Result<Vec<_>, _>>()?;
 
     // resulting JSON format is based on the format of node-postgres result
@@ -281,26 +292,42 @@ pub async fn handle(
         "rowCount": command_tag_count,
         "rows": rows,
         "fields": fields,
+        "rowAsArray": array_mode,
     }))
 }
 
 //
 // Convert postgres row with text-encoded values to JSON object
 //
-pub fn pg_text_row_to_json(row: &Row) -> Result<Value, anyhow::Error> {
-    let res = row
-        .columns()
-        .iter()
-        .enumerate()
-        .map(|(i, column)| {
-            let name = column.name();
-            let pg_value = row.as_text(i)?;
-            let json_value = pg_text_to_json(pg_value, column.type_())?;
-            Ok((name.to_string(), json_value))
-        })
-        .collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+pub fn pg_text_row_to_json(
+    row: &Row,
+    raw_output: bool,
+    array_mode: bool,
+) -> Result<Value, anyhow::Error> {
+    let iter = row.columns().iter().enumerate().map(|(i, column)| {
+        let name = column.name();
+        let pg_value = row.as_text(i)?;
+        let json_value = if raw_output {
+            match pg_value {
+                Some(v) => Value::String(v.to_string()),
+                None => Value::Null,
+            }
+        } else {
+            pg_text_to_json(pg_value, column.type_())?
+        };
+        Ok((name.to_string(), json_value))
+    });
 
-    Ok(Value::Object(res))
+    if array_mode {
+        // drop keys and aggregate into array
+        let arr = iter
+            .map(|r| r.map(|(_key, val)| val))
+            .collect::<Result<Vec<Value>, anyhow::Error>>()?;
+        Ok(Value::Array(arr))
+    } else {
+        let obj = iter.collect::<Result<Map<String, Value>, anyhow::Error>>()?;
+        Ok(Value::Object(obj))
+    }
 }
 
 //
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index 6be3995714..ca19dc3fd0 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -225,3 +225,37 @@ def test_sql_over_http(static_proxy: NeonProxy):
     res = q("drop table t")
     assert res["command"] == "DROP"
     assert res["rowCount"] is None
+
+
+def test_sql_over_http_output_options(static_proxy: NeonProxy):
+    static_proxy.safe_psql("create role http2 with login password 'http2' superuser")
+
+    def q(sql: str, raw_text: bool, array_mode: bool, params: List[Any] = []) -> Any:
+        connstr = (
+            f"postgresql://http2:http2@{static_proxy.domain}:{static_proxy.proxy_port}/postgres"
+        )
+        response = requests.post(
+            f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql",
+            data=json.dumps({"query": sql, "params": params}),
+            headers={
+                "Content-Type": "application/sql",
+                "Neon-Connection-String": connstr,
+                "Neon-Raw-Text-Output": "true" if raw_text else "false",
+                "Neon-Array-Mode": "true" if array_mode else "false",
+            },
+            verify=str(static_proxy.test_output_dir / "proxy.crt"),
+        )
+        assert response.status_code == 200
+        return response.json()
+
+    rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", False, False)["rows"]
+    assert rows == [{"arr": [1, 2, 3], "n": 1, "s": "a"}]
+
+    rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", False, True)["rows"]
+    assert rows == [[1, "a", [1, 2, 3]]]
+
+    rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, False)["rows"]
+    assert rows == [{"arr": "{1,2,3}", "n": "1", "s": "a"}]
+
+    rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, True)["rows"]
+    assert rows == [["1", "a", "{1,2,3}"]]

From c82d19d8d67ff9afed09d8f0b36f286db13dcb4b Mon Sep 17 00:00:00 2001
From: Stas Kelvich <stas.kelvich@gmail.com>
Date: Thu, 8 Jun 2023 13:05:04 +0300
Subject: [PATCH 13/14] Fix NULLs handling in proxy json endpoint

There were few problems with null handling:

* query_raw_txt() accepted vector of string so it always (erroneously)
treated "null" as a string instead of null. Change rust pg client
to accept the vector of Option<String> instead of just Strings. Adopt
coding here to pass nulls as None.

* pg_text_to_json() had a check that always interpreted "NULL" string
as null. That is wrong and nulls were already handled by match None.
This bug appeared as a bad attempt to parse arrays containing NULL
elements. Fix coding by checking presence of quotes while parsing an
array (no quotes -> null, quoted -> "null" string).

Array parser fix also slightly changes behavior by always cleaning
current entry when pushing to the resulting vector. This seems to be
an omission by previous coding, however looks like it was harmless
as entry was not cleared only at the end of the nested or to-level
array.
---
 Cargo.lock                      | 10 ++--
 Cargo.toml                      | 12 ++--
 proxy/src/http/sql_over_http.rs | 99 +++++++++++++++++++++------------
 3 files changed, 74 insertions(+), 47 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index c078510129..6856b9e3ac 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2770,7 +2770,7 @@ dependencies = [
 [[package]]
 name = "postgres"
 version = "0.19.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -2783,7 +2783,7 @@ dependencies = [
 [[package]]
 name = "postgres-native-tls"
 version = "0.5.0"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
  "native-tls",
  "tokio",
@@ -2794,7 +2794,7 @@ dependencies = [
 [[package]]
 name = "postgres-protocol"
 version = "0.6.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
  "base64 0.20.0",
  "byteorder",
@@ -2812,7 +2812,7 @@ dependencies = [
 [[package]]
 name = "postgres-types"
 version = "0.2.4"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
  "bytes",
  "fallible-iterator",
@@ -4272,7 +4272,7 @@ dependencies = [
 [[package]]
 name = "tokio-postgres"
 version = "0.7.7"
-source = "git+https://github.com/neondatabase/rust-postgres.git?rev=2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9#2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9"
+source = "git+https://github.com/neondatabase/rust-postgres.git?rev=f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c#f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c"
 dependencies = [
  "async-trait",
  "byteorder",
diff --git a/Cargo.toml b/Cargo.toml
index d7bffe67e1..dc34705f8d 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -140,11 +140,11 @@ env_logger = "0.10"
 log = "0.4"
 
 ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed
-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
-postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
-postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
-postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
+postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
+postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
+postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
 tokio-tar = { git = "https://github.com/neondatabase/tokio-tar.git", rev="404df61437de0feef49ba2ccdbdd94eb8ad6e142" }
 
 ## Other git libraries
@@ -180,7 +180,7 @@ tonic-build = "0.9"
 
 # This is only needed for proxy's tests.
 # TODO: we should probably fork `tokio-postgres-rustls` instead.
-tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="2e9b5f1ddc481d1a98fa79f6b9378ac4f170b7c9" }
+tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="f6ec31df3bcce89cb34f300f17c8a8c031c5ee8c" }
 
 # Changes the MAX_THREADS limit from 4096 to 32768.
 # This is a temporary workaround for using tracing from many threads in safekeepers code,
diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs
index 417af1e531..050f00dd7d 100644
--- a/proxy/src/http/sql_over_http.rs
+++ b/proxy/src/http/sql_over_http.rs
@@ -33,17 +33,20 @@ static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true");
 // Convert json non-string types to strings, so that they can be passed to Postgres
 // as parameters.
 //
-fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
+fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<Option<String>>, serde_json::Error> {
     json.iter()
         .map(|value| {
             match value {
-                Value::Null => serde_json::to_string(value),
-                Value::Bool(_) => serde_json::to_string(value),
-                Value::Number(_) => serde_json::to_string(value),
-                Value::Object(_) => serde_json::to_string(value),
+                // special care for nulls
+                Value::Null => Ok(None),
 
-                // no need to escape
-                Value::String(s) => Ok(s.to_string()),
+                // convert to text with escaping
+                Value::Bool(_) => serde_json::to_string(value).map(Some),
+                Value::Number(_) => serde_json::to_string(value).map(Some),
+                Value::Object(_) => serde_json::to_string(value).map(Some),
+
+                // avoid escaping here, as we pass this as a parameter
+                Value::String(s) => Ok(Some(s.to_string())),
 
                 // special care for arrays
                 Value::Array(_) => json_array_to_pg_array(value),
@@ -60,25 +63,29 @@ fn json_to_pg_text(json: Vec<Value>) -> Result<Vec<String>, serde_json::Error> {
 //
 // Example of the same escaping in node-postgres: packages/pg/lib/utils.js
 //
-fn json_array_to_pg_array(value: &Value) -> Result<String, serde_json::Error> {
+fn json_array_to_pg_array(value: &Value) -> Result<Option<String>, serde_json::Error> {
     match value {
-        // same
-        Value::Null => serde_json::to_string(value),
-        Value::Bool(_) => serde_json::to_string(value),
-        Value::Number(_) => serde_json::to_string(value),
-        Value::Object(_) => serde_json::to_string(value),
+        // special care for nulls
+        Value::Null => Ok(None),
 
-        // now needs to be escaped, as it is part of the array
-        Value::String(_) => serde_json::to_string(value),
+        // convert to text with escaping
+        Value::Bool(_) => serde_json::to_string(value).map(Some),
+        Value::Number(_) => serde_json::to_string(value).map(Some),
+        Value::Object(_) => serde_json::to_string(value).map(Some),
+
+        // here string needs to be escaped, as it is part of the array
+        Value::String(_) => serde_json::to_string(value).map(Some),
 
         // recurse into array
         Value::Array(arr) => {
             let vals = arr
                 .iter()
                 .map(json_array_to_pg_array)
+                .map(|r| r.map(|v| v.unwrap_or_else(|| "NULL".to_string())))
                 .collect::<Result<Vec<_>, _>>()?
                 .join(",");
-            Ok(format!("{{{}}}", vals))
+
+            Ok(Some(format!("{{{}}}", vals)))
         }
     }
 }
@@ -335,10 +342,6 @@ pub fn pg_text_row_to_json(
 //
 pub fn pg_text_to_json(pg_value: Option<&str>, pg_type: &Type) -> Result<Value, anyhow::Error> {
     if let Some(val) = pg_value {
-        if val == "NULL" {
-            return Ok(Value::Null);
-        }
-
         if let Kind::Array(elem_type) = pg_type.kind() {
             return pg_array_parse(val, elem_type);
         }
@@ -400,6 +403,27 @@ fn _pg_array_parse(
         }
     }
 
+    fn push_checked(
+        entry: &mut String,
+        entries: &mut Vec<Value>,
+        elem_type: &Type,
+    ) -> Result<(), anyhow::Error> {
+        if !entry.is_empty() {
+            // While in usual postgres response we get nulls as None and everything else
+            // as Some(&str), in arrays we get NULL as unquoted 'NULL' string (while
+            // string with value 'NULL' will be represented by '"NULL"'). So catch NULLs
+            // here while we have quotation info and convert them to None.
+            if entry == "NULL" {
+                entries.push(pg_text_to_json(None, elem_type)?);
+            } else {
+                entries.push(pg_text_to_json(Some(entry), elem_type)?);
+            }
+            entry.clear();
+        }
+
+        Ok(())
+    }
+
     while let Some((mut i, mut c)) = pg_array_chr.next() {
         let mut escaped = false;
 
@@ -422,9 +446,7 @@ fn _pg_array_parse(
             '}' => {
                 level -= 1;
                 if level == 0 {
-                    if !entry.is_empty() {
-                        entries.push(pg_text_to_json(Some(&entry), elem_type)?);
-                    }
+                    push_checked(&mut entry, &mut entries, elem_type)?;
                     if nested {
                         return Ok((Value::Array(entries), i));
                     }
@@ -432,17 +454,15 @@ fn _pg_array_parse(
             }
             '"' if !escaped => {
                 if quote {
-                    // push even if empty
+                    // end of quoted string, so push it manually without any checks
+                    // for emptiness or nulls
                     entries.push(pg_text_to_json(Some(&entry), elem_type)?);
-                    entry = String::new();
+                    entry.clear();
                 }
                 quote = !quote;
             }
             ',' if !quote => {
-                if !entry.is_empty() {
-                    entries.push(pg_text_to_json(Some(&entry), elem_type)?);
-                    entry = String::new();
-                }
+                push_checked(&mut entry, &mut entries, elem_type)?;
             }
             _ => {
                 entry.push(c);
@@ -466,30 +486,35 @@ mod tests {
     fn test_atomic_types_to_pg_params() {
         let json = vec![Value::Bool(true), Value::Bool(false)];
         let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["true", "false"]);
+        assert_eq!(
+            pg_params,
+            vec![Some("true".to_owned()), Some("false".to_owned())]
+        );
 
         let json = vec![Value::Number(serde_json::Number::from(42))];
         let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["42"]);
+        assert_eq!(pg_params, vec![Some("42".to_owned())]);
 
         let json = vec![Value::String("foo\"".to_string())];
         let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["foo\""]);
+        assert_eq!(pg_params, vec![Some("foo\"".to_owned())]);
 
         let json = vec![Value::Null];
         let pg_params = json_to_pg_text(json).unwrap();
-        assert_eq!(pg_params, vec!["null"]);
+        assert_eq!(pg_params, vec![None]);
     }
 
     #[test]
     fn test_json_array_to_pg_array() {
         // atoms and escaping
-        let json = "[true, false, null, 42, \"foo\", \"bar\\\"-\\\\\"]";
+        let json = "[true, false, null, \"NULL\", 42, \"foo\", \"bar\\\"-\\\\\"]";
         let json: Value = serde_json::from_str(json).unwrap();
         let pg_params = json_to_pg_text(vec![json]).unwrap();
         assert_eq!(
             pg_params,
-            vec!["{true,false,null,42,\"foo\",\"bar\\\"-\\\\\"}"]
+            vec![Some(
+                "{true,false,NULL,\"NULL\",42,\"foo\",\"bar\\\"-\\\\\"}".to_owned()
+            )]
         );
 
         // nested arrays
@@ -498,7 +523,9 @@ mod tests {
         let pg_params = json_to_pg_text(vec![json]).unwrap();
         assert_eq!(
             pg_params,
-            vec!["{{true,false},{null,42},{\"foo\",\"bar\\\"-\\\\\"}}"]
+            vec![Some(
+                "{{true,false},{NULL,42},{\"foo\",\"bar\\\"-\\\\\"}}".to_owned()
+            )]
         );
     }
 

From 6bac7708116ecb82d457b150ef51bc0a330a2237 Mon Sep 17 00:00:00 2001
From: bojanserafimov <bojan.serafimov7@gmail.com>
Date: Thu, 8 Jun 2023 18:11:33 -0400
Subject: [PATCH 14/14] Add cold start test (#4436)

---
 test_runner/performance/test_startup.py | 55 ++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py
index fa2e058491..9c45088d62 100644
--- a/test_runner/performance/test_startup.py
+++ b/test_runner/performance/test_startup.py
@@ -1,10 +1,63 @@
 from contextlib import closing
 
 import pytest
-from fixtures.benchmark_fixture import NeonBenchmarker
+import requests
+from fixtures.benchmark_fixture import MetricReport, NeonBenchmarker
 from fixtures.neon_fixtures import NeonEnvBuilder
 
 
+# Just start and measure duration.
+#
+# This test runs pretty quickly and can be informative when used in combination
+# with emulated network delay. Some useful delay commands:
+#
+# 1. Add 2msec delay to all localhost traffic
+# `sudo tc qdisc add dev lo root handle 1:0 netem delay 2msec`
+#
+# 2. Test that it works (you should see 4ms ping)
+# `ping localhost`
+#
+# 3. Revert back to normal
+# `sudo tc qdisc del dev lo root netem`
+#
+# NOTE this test might not represent the real startup time because the basebackup
+#      for a large database might be larger if there's a lof of transaction metadata,
+#      or safekeepers might need more syncing, or there might be more operations to
+#      apply during config step, like more users, databases, or extensions. By default
+#      we load extensions 'neon,pg_stat_statements,timescaledb,pg_cron', but in this
+#      test we only load neon.
+def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):
+    neon_env_builder.num_safekeepers = 3
+    env = neon_env_builder.init_start()
+
+    env.neon_cli.create_branch("test_startup")
+
+    # We do two iterations so we can see if the second startup is faster. It should
+    # be because the compute node should already be configured with roles, databases,
+    # extensions, etc from the first run.
+    for i in range(2):
+        # Start
+        with zenbenchmark.record_duration(f"{i}_start_and_select"):
+            endpoint = env.endpoints.create_start("test_startup")
+            endpoint.safe_psql("select 1;")
+
+        # Get metrics
+        metrics = requests.get(f"http://localhost:{endpoint.http_port}/metrics.json").json()
+        durations = {
+            "wait_for_spec_ms": f"{i}_wait_for_spec",
+            "sync_safekeepers_ms": f"{i}_sync_safekeepers",
+            "basebackup_ms": f"{i}_basebackup",
+            "config_ms": f"{i}_config",
+            "total_startup_ms": f"{i}_total_startup",
+        }
+        for key, name in durations.items():
+            value = metrics[key]
+            zenbenchmark.record(name, value, "ms", report=MetricReport.LOWER_IS_BETTER)
+
+        # Stop so we can restart
+        endpoint.stop()
+
+
 # This test sometimes runs for longer than the global 5 minute timeout.
 @pytest.mark.timeout(600)
 def test_startup(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenchmarker):