From f2e2b8a7f41090a29ae83fc6971f977953ac7db9 Mon Sep 17 00:00:00 2001 From: cui fliter Date: Tue, 25 Jul 2023 19:39:29 +0800 Subject: [PATCH 01/37] fix some typos (#4662) Typos fix Signed-off-by: cui fliter --- libs/pq_proto/src/lib.rs | 2 +- pageserver/src/disk_usage_eviction_task.rs | 4 ++-- pageserver/src/tenant/remote_timeline_client.rs | 6 +++--- .../tenant/timeline/walreceiver/connection_manager.rs | 10 +++++----- pgxn/neon/libpqwalproposer.c | 4 ++-- pgxn/neon/walproposer.c | 2 +- pgxn/neon/walproposer.h | 4 ++-- test_runner/regress/test_attach_tenant_config.py | 2 +- test_runner/regress/test_compatibility.py | 2 +- 9 files changed, 18 insertions(+), 18 deletions(-) diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs index 5c5e8a9559..809fa5fffd 100644 --- a/libs/pq_proto/src/lib.rs +++ b/libs/pq_proto/src/lib.rs @@ -179,7 +179,7 @@ pub struct FeExecuteMessage { #[derive(Debug)] pub struct FeCloseMessage; -/// An error occured while parsing or serializing raw stream into Postgres +/// An error occurred while parsing or serializing raw stream into Postgres /// messages. #[derive(thiserror::Error, Debug)] pub enum ProtocolError { diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 042d4c6d06..b729b6a643 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -545,12 +545,12 @@ async fn collect_eviction_candidates( // We could be better here, e.g., sum of all L0 layers + most recent L1 layer. // That's what's typically used by the various background loops. // - // The default can be overriden with a fixed value in the tenant conf. + // The default can be overridden with a fixed value in the tenant conf. // A default override can be put in the default tenant conf in the pageserver.toml. let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { debug!( tenant_id=%tenant.tenant_id(), - overriden_size=s, + overridden_size=s, "using overridden min resident size for tenant" ); s diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index 1355356712..fee7f0c28e 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -514,7 +514,7 @@ impl RemoteTimelineClient { /// updated metadata. /// /// The upload will be added to the queue immediately, but it - /// won't be performed until all previosuly scheduled layer file + /// won't be performed until all previously scheduled layer file /// upload operations have completed successfully. This is to /// ensure that when the index file claims that layers X, Y and Z /// exist in remote storage, they really do. To wait for the upload @@ -625,7 +625,7 @@ impl RemoteTimelineClient { /// Note: This schedules an index file upload before the deletions. The /// deletion won't actually be performed, until any previously scheduled /// upload operations, and the index file upload, have completed - /// succesfully. + /// successfully. pub fn schedule_layer_file_deletion( self: &Arc, names: &[LayerFileName], @@ -1105,7 +1105,7 @@ impl RemoteTimelineClient { debug!("remote task {} completed successfully", task.op); } - // The task has completed succesfully. Remove it from the in-progress list. + // The task has completed successfully. Remove it from the in-progress list. 
{ let mut upload_queue_guard = self.upload_queue.lock().unwrap(); let upload_queue = match upload_queue_guard.deref_mut() { diff --git a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs index 57c09a4487..2642746c79 100644 --- a/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs +++ b/pageserver/src/tenant/timeline/walreceiver/connection_manager.rs @@ -1123,7 +1123,7 @@ mod tests { } #[tokio::test] - async fn lsn_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { + async fn lsn_wal_over_threshold_current_candidate() -> anyhow::Result<()> { let harness = TenantHarness::create("lsn_wal_over_threshcurrent_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); @@ -1189,8 +1189,8 @@ mod tests { } #[tokio::test] - async fn timeout_connection_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_connection_threshhold_current_candidate")?; + async fn timeout_connection_threshold_current_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("timeout_connection_threshold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let now = Utc::now().naive_utc(); @@ -1252,8 +1252,8 @@ mod tests { } #[tokio::test] - async fn timeout_wal_over_threshhold_current_candidate() -> anyhow::Result<()> { - let harness = TenantHarness::create("timeout_wal_over_threshhold_current_candidate")?; + async fn timeout_wal_over_threshold_current_candidate() -> anyhow::Result<()> { + let harness = TenantHarness::create("timeout_wal_over_threshold_current_candidate")?; let mut state = dummy_state(&harness).await; let current_lsn = Lsn(100_000).align(); let new_lsn = Lsn(100_100).align(); diff --git a/pgxn/neon/libpqwalproposer.c b/pgxn/neon/libpqwalproposer.c index 9b6175a621..ed3b8a817f 100644 --- a/pgxn/neon/libpqwalproposer.c +++ b/pgxn/neon/libpqwalproposer.c @@ -292,7 +292,7 @@ walprop_async_read(WalProposerConn *conn, char **buf, int *amount) /* * The docs for PQgetCopyData list the return values as: 0 if the copy is * still in progress, but no "complete row" is available -1 if the copy is - * done -2 if an error occured (> 0) if it was successful; that value is + * done -2 if an error occurred (> 0) if it was successful; that value is * the amount transferred. * * The protocol we use between walproposer and safekeeper means that we @@ -353,7 +353,7 @@ walprop_async_write(WalProposerConn *conn, void const *buf, size_t size) /* * The docs for PQputcopyData list the return values as: 1 if the data was * queued, 0 if it was not queued because of full buffers, or -1 if an - * error occured + * error occurred */ result = PQputCopyData(conn->pg_conn, buf, size); diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c index 765966092d..807fd5c91b 100644 --- a/pgxn/neon/walproposer.c +++ b/pgxn/neon/walproposer.c @@ -788,7 +788,7 @@ ReconnectSafekeepers(void) /* * Performs the logic for advancing the state machine of the specified safekeeper, - * given that a certain set of events has occured. + * given that a certain set of events has occurred. 
*/ static void AdvancePollState(Safekeeper *sk, uint32 events) diff --git a/pgxn/neon/walproposer.h b/pgxn/neon/walproposer.h index f016a229eb..615fbf9399 100644 --- a/pgxn/neon/walproposer.h +++ b/pgxn/neon/walproposer.h @@ -23,7 +23,7 @@ * message header */ /* - * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occured, + * In the spirit of WL_SOCKET_READABLE and others, this corresponds to no events having occurred, * because all WL_* events are given flags equal to some (1 << i), starting from i = 0 */ #define WL_NO_EVENTS 0 @@ -317,7 +317,7 @@ typedef struct AppendResponse /* this is a criterion for walproposer --sync mode exit */ XLogRecPtr commitLsn; HotStandbyFeedback hs; - /* Feedback recieved from pageserver includes standby_status_update fields */ + /* Feedback received from pageserver includes standby_status_update fields */ /* and custom neon feedback. */ /* This part of the message is extensible. */ PageserverFeedback rf; diff --git a/test_runner/regress/test_attach_tenant_config.py b/test_runner/regress/test_attach_tenant_config.py index 4df5ae18d6..b6b46f9401 100644 --- a/test_runner/regress/test_attach_tenant_config.py +++ b/test_runner/regress/test_attach_tenant_config.py @@ -123,7 +123,7 @@ def test_config_with_unknown_keys_is_bad_request(negative_env: NegativeTests): @pytest.mark.parametrize("content_type", [None, "application/json"]) def test_empty_body(positive_env: NeonEnv, content_type: Optional[str]): """ - For backwards-compatiblity: if we send an empty body, + For backwards-compatibility: if we send an empty body, the request should be accepted and the config should be the default config. """ env = positive_env diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index a3d02c3f5a..6c592d3249 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -276,7 +276,7 @@ def prepare_snapshot( pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port( pageserver_config["listen_pg_addr"] ) - # since storage_broker these are overriden by neon_local during pageserver + # since storage_broker these are overridden by neon_local during pageserver # start; remove both to prevent unknown options during etcd -> # storage_broker migration. TODO: remove once broker is released pageserver_config.pop("broker_endpoint", None) From 062159ac1719285d3397366585418e12b943d8c5 Mon Sep 17 00:00:00 2001 From: Nick Randall Date: Tue, 25 Jul 2023 06:03:55 -0600 Subject: [PATCH 02/37] support non-interactive transactions in sql-over-http (#4654) This PR adds support for non-interactive transaction query endpoint. It accepts an array of queries and parameters and returns an array of query results. The queries will be run in a single transaction one after another on the proxy side. 
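As a rough illustration (not part of the patch, and requiring the serde_json crate), the untagged `Payload` enum added in `proxy/src/http/sql_over_http.rs` below suggests that a single `{ query, params }` object keeps the existing single-query path, while a bare JSON array of such objects selects the new batch path; the field names come from `QueryData`, and the exact wire format of the deployed endpoint is an assumption here:

```rust
// A rough sketch (not from this patch), assuming the untagged `Payload` enum
// below deserializes a single `{ query, params }` object as `Single` and a
// bare JSON array of such objects as `Batch`.
use serde_json::json;

fn main() {
    // Single query, as supported before this change.
    let single = json!({ "query": "SELECT $1::int AS answer", "params": ["42"] });

    // Batch: executed one after another inside a single transaction on the
    // proxy; an error in any statement rolls the whole transaction back.
    let batch = json!([
        { "query": "INSERT INTO t (v) VALUES ($1)", "params": ["a"] },
        { "query": "SELECT count(*) FROM t", "params": [] }
    ]);

    println!("{single}\n{batch}");
}
```

This mirrors how the handler matches `Payload::Single` vs `Payload::Batch` and wraps the batch case in `client.transaction()`.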
--- Cargo.lock | 10 +++--- Cargo.toml | 12 +++---- proxy/src/http/sql_over_http.rs | 61 +++++++++++++++++++++++++++------ 3 files changed, 61 insertions(+), 22 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 506e5f6c7c..dbeeb539d4 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2782,7 +2782,7 @@ dependencies = [ [[package]] name = "postgres" version = "0.19.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "bytes", "fallible-iterator", @@ -2795,7 +2795,7 @@ dependencies = [ [[package]] name = "postgres-native-tls" version = "0.5.0" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "native-tls", "tokio", @@ -2806,7 +2806,7 @@ dependencies = [ [[package]] name = "postgres-protocol" version = "0.6.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "base64 0.20.0", "byteorder", @@ -2824,7 +2824,7 @@ dependencies = [ [[package]] name = "postgres-types" version = "0.2.4" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "bytes", "fallible-iterator", @@ -4314,7 +4314,7 @@ dependencies = [ [[package]] name = "tokio-postgres" version = "0.7.7" -source = "git+https://github.com/neondatabase/rust-postgres.git?rev=1aaedab101b23f7612042850d8f2036810fa7c7f#1aaedab101b23f7612042850d8f2036810fa7c7f" +source = "git+https://github.com/neondatabase/rust-postgres.git?rev=9011f7110db12b5e15afaf98f8ac834501d50ddc#9011f7110db12b5e15afaf98f8ac834501d50ddc" dependencies = [ "async-trait", "byteorder", diff --git a/Cargo.toml b/Cargo.toml index 44d49d95e8..a0acc061fb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -144,11 +144,11 @@ env_logger = "0.10" log = "0.4" ## Libraries from neondatabase/ git forks, ideally with changes to be upstreamed -postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } -postgres-native-tls = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } -postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } -postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } +postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } +postgres-native-tls = { git = 
"https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } +postgres-protocol = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } +postgres-types = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } ## Other git libraries heapless = { default-features=false, features=[], git = "https://github.com/japaric/heapless.git", rev = "644653bf3b831c6bb4963be2de24804acf5e5001" } # upstream release pending @@ -183,7 +183,7 @@ tonic-build = "0.9" # This is only needed for proxy's tests. # TODO: we should probably fork `tokio-postgres-rustls` instead. -tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="1aaedab101b23f7612042850d8f2036810fa7c7f" } +tokio-postgres = { git = "https://github.com/neondatabase/rust-postgres.git", rev="9011f7110db12b5e15afaf98f8ac834501d50ddc" } ################# Binary contents sections diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index adf7252f72..4ed650a392 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -11,6 +11,7 @@ use serde_json::Map; use serde_json::Value; use tokio_postgres::types::Kind; use tokio_postgres::types::Type; +use tokio_postgres::GenericClient; use tokio_postgres::Row; use url::Url; @@ -23,6 +24,13 @@ struct QueryData { params: Vec, } +#[derive(serde::Deserialize)] +#[serde(untagged)] +enum Payload { + Single(QueryData), + Batch(Vec), +} + pub const MAX_RESPONSE_SIZE: usize = 1024 * 1024; // 1 MB const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB @@ -192,15 +200,53 @@ pub async fn handle( // Read the query and query params from the request body // let body = hyper::body::to_bytes(request.into_body()).await?; - let QueryData { query, params } = serde_json::from_slice(&body)?; - let query_params = json_to_pg_text(params)?; + let payload: Payload = serde_json::from_slice(&body)?; + + let mut client = conn_pool.get(&conn_info, !allow_pool).await?; // // Now execute the query and return the result // - let client = conn_pool.get(&conn_info, !allow_pool).await?; + let result = match payload { + Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode).await, + Payload::Batch(queries) => { + let mut results = Vec::new(); + let transaction = client.transaction().await?; + for query in queries { + let result = query_to_json(&transaction, query, raw_output, array_mode).await; + match result { + Ok(r) => results.push(r), + Err(e) => { + transaction.rollback().await?; + return Err(e); + } + } + } + transaction.commit().await?; + Ok(Value::Array(results)) + } + }; - let row_stream = client.query_raw_txt(query, query_params).await?; + if allow_pool { + // return connection to the pool + tokio::task::spawn(async move { + let _ = conn_pool.put(&conn_info, client).await; + }); + } + + result +} + +async fn query_to_json( + client: &T, + data: QueryData, + raw_output: bool, + array_mode: bool, +) -> anyhow::Result { + let query_params = json_to_pg_text(data.params)?; + let row_stream = client + .query_raw_txt::(data.query, query_params) + .await?; // Manually drain the stream into a vector to leave row_stream hanging // around to get a command tag. 
Also check that the response is not too @@ -256,13 +302,6 @@ pub async fn handle( .map(|row| pg_text_row_to_json(row, raw_output, array_mode)) .collect::, _>>()?; - if allow_pool { - // return connection to the pool - tokio::task::spawn(async move { - let _ = conn_pool.put(&conn_info, client).await; - }); - } - // resulting JSON format is based on the format of node-postgres result Ok(json!({ "command": command_tag_name, From 6d023484ed413d43ad13b37559413396cbfadd81 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Tue, 25 Jul 2023 16:25:27 +0300 Subject: [PATCH 03/37] Use mark file to allow for deletion operations to continue through restarts (#4552) ## Problem Currently we delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued. This can be solved with inversion of these steps. But even if these steps are inverted when index_part.json is deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote" and "this timeline is deleted we should continue with removal of local state". So to solve it we use another mark file. After index part is deleted presence of this mark file indentifies that it was a deletion intention. Alternative approach that was discussed was to delete all except metadata first, and then delete metadata and index part. In this case we still do not support local only configs making them rather unsafe (deletion in them is already unsafe, but this direction solidifies this direction instead of fixing it). Another downside is that if we crash after local metadata gets removed we may leave dangling index part on the remote which in theory shouldnt be a big deal because the file is small. It is not a big change to choose another approach at this point. ## Summary of changes Timeline deletion sequence: 1. Set deleted_at in remote index part. 2. Create local mark file. 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata) 4. Delete remote layers 5. Delete index part 6. Delete meta, timeline directory. 7. Delete mark file. This works for local only configuration without remote storage. Sequence is resumable from any point. 
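The resume logic described above boils down to checking which markers survived the restart: the `___delete` mark file, the local metadata file, and the remote `index_part.json` with its `deleted_at` flag. A minimal sketch of that decision follows (names here are hypothetical and the table is simplified; the real handling is spread across `scan_and_sort_timelines_dir`, `load_local_timeline` and `DeleteTimelineFlow` in the diff below):

```rust
// Editor's sketch of the crash-recovery decision, keyed on which markers
// survived; this is a simplification of the logic in tenant.rs / delete.rs.
#[derive(Debug)]
enum ResumeAction {
    LoadNormally,           // no deletion was in progress
    ResumeDeletion,         // deleted_at set remotely: continue the remaining steps
    CleanupLocalTracesOnly, // remote index already gone: only steps 6..7 remain
    RemoveEmptyDirAndMark,  // only the empty timeline dir and mark file remain
}

/// `remote_deleted`: None = no index_part.json found, Some(b) = found, with deleted_at set iff b.
fn resume_action(delete_mark: bool, local_metadata: bool, remote_deleted: Option<bool>) -> ResumeAction {
    match (delete_mark, local_metadata, remote_deleted) {
        (_, true, Some(true)) => ResumeAction::ResumeDeletion,
        (true, true, None) => ResumeAction::CleanupLocalTracesOnly,
        (true, false, _) => ResumeAction::RemoveEmptyDirAndMark,
        _ => ResumeAction::LoadNormally,
    }
}

fn main() {
    // Example: crashed between step 5 (index part deleted) and step 6.
    println!("{:?}", resume_action(true, true, None));
}
```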
resolves #4453 resolves https://github.com/neondatabase/neon/pull/4552 (the issue was created with async cancellation in mind, but we can still have issues with retries if metadata is deleted among the first by remove_dir_all (which doesnt have any ordering guarantees)) --------- Co-authored-by: Joonas Koivunen Co-authored-by: Christian Schwarz --- libs/utils/src/fs_ext.rs | 34 ++ libs/utils/src/id.rs | 14 + pageserver/src/config.rs | 14 +- pageserver/src/lib.rs | 25 +- pageserver/src/tenant.rs | 557 ++++++----------- pageserver/src/tenant/delete.rs | 575 ++++++++++++++++++ pageserver/src/tenant/mgr.rs | 8 +- .../src/tenant/remote_timeline_client.rs | 14 +- pageserver/src/tenant/timeline.rs | 21 +- pageserver/src/tenant/timeline/uninit.rs | 12 +- test_runner/fixtures/pageserver/utils.py | 13 +- test_runner/regress/test_timeline_delete.py | 206 +++++-- 12 files changed, 1070 insertions(+), 423 deletions(-) create mode 100644 pageserver/src/tenant/delete.rs diff --git a/libs/utils/src/fs_ext.rs b/libs/utils/src/fs_ext.rs index 0ef0464267..090912d276 100644 --- a/libs/utils/src/fs_ext.rs +++ b/libs/utils/src/fs_ext.rs @@ -24,12 +24,29 @@ pub async fn is_directory_empty(path: impl AsRef) -> anyhow::Result Ok(dir.next_entry().await?.is_none()) } +pub fn ignore_not_found(e: io::Error) -> io::Result<()> { + if e.kind() == io::ErrorKind::NotFound { + Ok(()) + } else { + Err(e) + } +} + +pub fn ignore_absent_files(fs_operation: F) -> io::Result<()> +where + F: Fn() -> io::Result<()>, +{ + fs_operation().or_else(ignore_not_found) +} + #[cfg(test)] mod test { use std::path::PathBuf; use crate::fs_ext::is_directory_empty; + use super::ignore_absent_files; + #[test] fn is_empty_dir() { use super::PathExt; @@ -75,4 +92,21 @@ mod test { std::fs::remove_file(&file_path).unwrap(); assert!(is_directory_empty(file_path).await.is_err()); } + + #[test] + fn ignore_absent_files_works() { + let dir = tempfile::tempdir().unwrap(); + let dir_path = dir.path(); + + let file_path: PathBuf = dir_path.join("testfile"); + + ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally"); + + let f = std::fs::File::create(&file_path).unwrap(); + drop(f); + + ignore_absent_files(|| std::fs::remove_file(&file_path)).expect("should execute normally"); + + assert!(!file_path.exists()); + } } diff --git a/libs/utils/src/id.rs b/libs/utils/src/id.rs index 20b601f68d..2ce92ee914 100644 --- a/libs/utils/src/id.rs +++ b/libs/utils/src/id.rs @@ -1,5 +1,7 @@ +use std::ffi::OsStr; use std::{fmt, str::FromStr}; +use anyhow::Context; use hex::FromHex; use rand::Rng; use serde::{Deserialize, Serialize}; @@ -213,6 +215,18 @@ pub struct TimelineId(Id); id_newtype!(TimelineId); +impl TryFrom> for TimelineId { + type Error = anyhow::Error; + + fn try_from(value: Option<&OsStr>) -> Result { + value + .and_then(OsStr::to_str) + .unwrap_or_default() + .parse::() + .with_context(|| format!("Could not parse timeline id from {:?}", value)) + } +} + /// Neon Tenant Id represents identifiar of a particular tenant. /// Is used for distinguishing requests and data belonging to different users. 
/// diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 4c6df469aa..be806c77ec 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -33,7 +33,8 @@ use crate::tenant::config::TenantConf; use crate::tenant::config::TenantConfOpt; use crate::tenant::{TENANT_ATTACHING_MARKER_FILENAME, TIMELINES_SEGMENT_NAME}; use crate::{ - IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_UNINIT_MARK_SUFFIX, + IGNORED_TENANT_FILE_NAME, METADATA_FILE_NAME, TENANT_CONFIG_NAME, TIMELINE_DELETE_MARK_SUFFIX, + TIMELINE_UNINIT_MARK_SUFFIX, }; pub mod defaults { @@ -601,6 +602,17 @@ impl PageServerConf { ) } + pub fn timeline_delete_mark_file_path( + &self, + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> PathBuf { + path_with_suffix_extension( + self.timeline_path(&tenant_id, &timeline_id), + TIMELINE_DELETE_MARK_SUFFIX, + ) + } + pub fn traces_path(&self) -> PathBuf { self.workdir.join("traces") } diff --git a/pageserver/src/lib.rs b/pageserver/src/lib.rs index 5831091098..f43651e931 100644 --- a/pageserver/src/lib.rs +++ b/pageserver/src/lib.rs @@ -109,6 +109,8 @@ pub const TEMP_FILE_SUFFIX: &str = "___temp"; /// Full path: `tenants//timelines/___uninit`. pub const TIMELINE_UNINIT_MARK_SUFFIX: &str = "___uninit"; +pub const TIMELINE_DELETE_MARK_SUFFIX: &str = "___delete"; + /// A marker file to prevent pageserver from loading a certain tenant on restart. /// Different from [`TIMELINE_UNINIT_MARK_SUFFIX`] due to semantics of the corresponding /// `ignore` management API command, that expects the ignored tenant to be properly loaded @@ -123,15 +125,30 @@ pub fn is_temporary(path: &Path) -> bool { } } -pub fn is_uninit_mark(path: &Path) -> bool { +fn ends_with_suffix(path: &Path, suffix: &str) -> bool { match path.file_name() { - Some(name) => name - .to_string_lossy() - .ends_with(TIMELINE_UNINIT_MARK_SUFFIX), + Some(name) => name.to_string_lossy().ends_with(suffix), None => false, } } +pub fn is_uninit_mark(path: &Path) -> bool { + ends_with_suffix(path, TIMELINE_UNINIT_MARK_SUFFIX) +} + +pub fn is_delete_mark(path: &Path) -> bool { + ends_with_suffix(path, TIMELINE_DELETE_MARK_SUFFIX) +} + +fn is_walkdir_io_not_found(e: &walkdir::Error) -> bool { + if let Some(e) = e.io_error() { + if e.kind() == std::io::ErrorKind::NotFound { + return true; + } + } + false +} + /// During pageserver startup, we need to order operations not to exhaust tokio worker threads by /// blocking. 
/// diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 0b1183fc50..67447bc45c 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -18,22 +18,20 @@ use remote_storage::DownloadError; use remote_storage::GenericRemoteStorage; use storage_broker::BrokerClientChannel; use tokio::sync::watch; -use tokio::sync::OwnedMutexGuard; use tokio::task::JoinSet; use tokio_util::sync::CancellationToken; use tracing::*; use utils::completion; use utils::crashsafe::path_with_suffix_extension; +use utils::fs_ext; use std::cmp::min; use std::collections::hash_map::Entry; use std::collections::BTreeSet; use std::collections::HashMap; -use std::ffi::OsStr; use std::fs; use std::fs::File; use std::fs::OpenOptions; -use std::io; use std::io::Write; use std::ops::Bound::Included; use std::path::Path; @@ -48,6 +46,7 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::config::TenantConf; +use self::delete::DeleteTimelineFlow; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineUninitMark; @@ -65,7 +64,6 @@ use crate::tenant::config::TenantConfOpt; use crate::tenant::metadata::load_metadata; use crate::tenant::remote_timeline_client::index::IndexPart; use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart; -use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError; use crate::tenant::storage_layer::DeltaLayer; use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; @@ -118,6 +116,7 @@ mod remote_timeline_client; pub mod storage_layer; pub mod config; +pub mod delete; pub mod mgr; pub mod tasks; pub mod upload_queue; @@ -266,6 +265,14 @@ pub enum GetTimelineError { }, } +#[derive(Debug, thiserror::Error)] +pub enum LoadLocalTimelineError { + #[error("FailedToLoad")] + Load(#[source] anyhow::Error), + #[error("FailedToResumeDeletion")] + ResumeDeletion(#[source] anyhow::Error), +} + #[derive(Debug, thiserror::Error)] pub enum DeleteTimelineError { #[error("NotFound")] @@ -319,14 +326,6 @@ impl std::fmt::Display for WaitToBecomeActiveError { } } -struct DeletionGuard(OwnedMutexGuard); - -impl DeletionGuard { - fn is_deleted(&self) -> bool { - *self.0 - } -} - #[derive(thiserror::Error, Debug)] pub enum CreateTimelineError { #[error("a timeline with the given ID already exists")] @@ -337,6 +336,16 @@ pub enum CreateTimelineError { Other(#[from] anyhow::Error), } +struct TenantDirectoryScan { + sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, + timelines_to_resume_deletion: Vec<(TimelineId, TimelineMetadata)>, +} + +enum CreateTimelineCause { + Load, + Delete, +} + impl Tenant { /// Yet another helper for timeline initialization. /// Contains the common part of `load_local_timeline` and `load_remote_timeline`. @@ -375,6 +384,7 @@ impl Tenant { ancestor.clone(), remote_client, init_order, + CreateTimelineCause::Load, )?; let new_disk_consistent_lsn = timeline.get_disk_consistent_lsn(); anyhow::ensure!( @@ -803,11 +813,13 @@ impl Tenant { tenant } - pub fn scan_and_sort_timelines_dir( - self: Arc, - ) -> anyhow::Result> { - let timelines_dir = self.conf.timelines_path(&self.tenant_id); + fn scan_and_sort_timelines_dir(self: Arc) -> anyhow::Result { let mut timelines_to_load: HashMap = HashMap::new(); + // Note timelines_to_resume_deletion needs to be separate because it can be not sortable + // from the point of `tree_sort_timelines`. 
I e some parents can be missing because deletion + // completed in non topological order (for example because parent has smaller number of layer files in it) + let mut timelines_to_resume_deletion: Vec<(TimelineId, TimelineMetadata)> = vec![]; + let timelines_dir = self.conf.timelines_path(&self.tenant_id); for entry in std::fs::read_dir(&timelines_dir).context("list timelines directory for tenant")? @@ -835,16 +847,13 @@ impl Tenant { ); continue; } + let timeline_uninit_mark_file = &timeline_dir; info!( "Found an uninit mark file {}, removing the timeline and its uninit mark", timeline_uninit_mark_file.display() ); - let timeline_id = timeline_uninit_mark_file - .file_stem() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() + let timeline_id = TimelineId::try_from(timeline_uninit_mark_file.file_stem()) .with_context(|| { format!( "Could not parse timeline id out of the timeline uninit mark name {}", @@ -857,6 +866,44 @@ impl Tenant { { error!("Failed to clean up uninit marked timeline: {e:?}"); } + } else if crate::is_delete_mark(&timeline_dir) { + // If metadata exists, load as usual, continue deletion + // If metadata doesnt exist remove timeline dir and delete mark + let timeline_id = + TimelineId::try_from(timeline_dir.file_stem()).with_context(|| { + format!( + "Could not parse timeline id out of the timeline uninit mark name {}", + timeline_dir.display() + ) + })?; + + let metadata_path = self.conf.metadata_path(&self.tenant_id, &timeline_id); + if metadata_path.exists() { + // Remote deletion did not finish. Need to resume. + timelines_to_resume_deletion.push(( + timeline_id, + load_metadata(self.conf, &self.tenant_id, &timeline_id)?, + )); + continue; + } + + // Missing metadata means that timeline directory should be empty at this point. + // Remove delete mark afterwards. + // Note that failure during the process wont prevent tenant from successfully loading. + // TODO: this is very much similar to DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces + // but here we're inside spawn_blocking. + if let Err(e) = fs_ext::ignore_absent_files(|| { + fs::remove_dir(self.conf.timeline_path(&self.tenant_id, &timeline_id)) + }) + .context("remove deleted timeline dir") + .and_then(|_| fs::remove_file(&timeline_dir).context("remove delete mark")) + { + warn!( + "cannot clean up deleted timeline dir at: {} error: {:#}", + timeline_dir.display(), + e + ); + }; } else { if !timeline_dir.exists() { warn!( @@ -865,12 +912,8 @@ impl Tenant { ); continue; } - let timeline_id = timeline_dir - .file_name() - .and_then(OsStr::to_str) - .unwrap_or_default() - .parse::() - .with_context(|| { + let timeline_id = + TimelineId::try_from(timeline_dir.file_name()).with_context(|| { format!( "Could not parse timeline id out of the timeline dir name {}", timeline_dir.display() @@ -892,6 +935,14 @@ impl Tenant { continue; } + let timeline_delete_mark_file = self + .conf + .timeline_delete_mark_file_path(self.tenant_id, timeline_id); + if timeline_delete_mark_file.exists() { + // Cleanup should be done in `is_delete_mark` branch above + continue; + } + let file_name = entry.file_name(); if let Ok(timeline_id) = file_name.to_str().unwrap_or_default().parse::() @@ -911,7 +962,10 @@ impl Tenant { // Sort the array of timeline IDs into tree-order, so that parent comes before // all its children. 
- tree_sort_timelines(timelines_to_load) + tree_sort_timelines(timelines_to_load).map(|sorted_timelines| TenantDirectoryScan { + sorted_timelines_to_load: sorted_timelines, + timelines_to_resume_deletion, + }) } /// @@ -937,7 +991,7 @@ impl Tenant { let span = info_span!("blocking"); let cloned = Arc::clone(self); - let sorted_timelines: Vec<(_, _)> = tokio::task::spawn_blocking(move || { + let scan = tokio::task::spawn_blocking(move || { let _g = span.entered(); cloned.scan_and_sort_timelines_dir() }) @@ -948,10 +1002,44 @@ impl Tenant { // FIXME original collect_timeline_files contained one more check: // 1. "Timeline has no ancestor and no layer files" - for (timeline_id, local_metadata) in sorted_timelines { - self.load_local_timeline(timeline_id, local_metadata, init_order, ctx) + // Process loadable timelines first + for (timeline_id, local_metadata) in scan.sorted_timelines_to_load { + if let Err(e) = self + .load_local_timeline(timeline_id, local_metadata, init_order, ctx, false) .await - .with_context(|| format!("load local timeline {timeline_id}"))?; + { + match e { + LoadLocalTimelineError::Load(source) => { + return Err(anyhow::anyhow!(source) + .context("Failed to load local timeline: {timeline_id}")) + } + LoadLocalTimelineError::ResumeDeletion(source) => { + // Make sure resumed deletion wont fail loading for entire tenant. + error!("Failed to resume timeline deletion: {source:#}") + } + } + } + } + + // Resume deletion ones with deleted_mark + for (timeline_id, local_metadata) in scan.timelines_to_resume_deletion { + if let Err(e) = self + .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true) + .await + { + match e { + LoadLocalTimelineError::Load(source) => { + // We tried to load deleted timeline, this is a bug. + return Err(anyhow::anyhow!(source).context( + "This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}" + )); + } + LoadLocalTimelineError::ResumeDeletion(source) => { + // Make sure resumed deletion wont fail loading for entire tenant. 
+ error!("Failed to resume timeline deletion: {source:#}") + } + } + } } trace!("Done"); @@ -969,7 +1057,8 @@ impl Tenant { local_metadata: TimelineMetadata, init_order: Option<&InitializationOrder>, ctx: &RequestContext, - ) -> anyhow::Result<()> { + found_delete_mark: bool, + ) -> Result<(), LoadLocalTimelineError> { span::debug_assert_current_span_has_tenant_id(); let remote_client = self.remote_storage.as_ref().map(|remote_storage| { @@ -981,14 +1070,6 @@ impl Tenant { ) }); - let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { - let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) - .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?; - Some(ancestor_timeline) - } else { - None - }; - let (remote_startup_data, remote_client) = match remote_client { Some(remote_client) => match remote_client.download_index_file().await { Ok(index_part) => { @@ -1008,45 +1089,29 @@ impl Tenant { info!("is_deleted is set on remote, resuming removal of timeline data originally done by timeline deletion handler"); remote_client - .init_upload_queue_stopped_to_continue_deletion(&index_part)?; + .init_upload_queue_stopped_to_continue_deletion(&index_part) + .context("init queue stopped") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; - let timeline = self - .create_timeline_struct( - timeline_id, - &local_metadata, - ancestor, - Some(remote_client), - init_order, - ) - .context("create_timeline_struct")?; - - let guard = DeletionGuard( - Arc::clone(&timeline.delete_lock) - .try_lock_owned() - .expect("cannot happen because we're the only owner"), - ); - - // Note: here we even skip populating layer map. Timeline is essentially uninitialized. - // RemoteTimelineClient is the only functioning part. - timeline.set_state(TimelineState::Stopping); - // We meed to do this because when console retries delete request we shouldnt answer with 404 - // because 404 means successful deletion. - // FIXME consider TimelineState::Deleting. - let mut locked = self.timelines.lock().unwrap(); - locked.insert(timeline_id, Arc::clone(&timeline)); - - Tenant::schedule_delete_timeline( + DeleteTimelineFlow::resume_deletion( Arc::clone(self), timeline_id, - timeline, - guard, - ); + &local_metadata, + Some(remote_client), + init_order, + ) + .await + .context("resume deletion") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; return Ok(()); } }; - let remote_metadata = index_part.parse_metadata().context("parse_metadata")?; + let remote_metadata = index_part + .parse_metadata() + .context("parse_metadata") + .map_err(LoadLocalTimelineError::Load)?; ( Some(RemoteStartupData { index_part, @@ -1056,12 +1121,54 @@ impl Tenant { ) } Err(DownloadError::NotFound) => { - info!("no index file was found on the remote"); + info!("no index file was found on the remote, found_delete_mark: {found_delete_mark}"); + + if found_delete_mark { + // We could've resumed at a point where remote index was deleted, but metadata file wasnt. + // Cleanup: + return DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces( + self, + timeline_id, + ) + .await + .context("cleanup_remaining_timeline_fs_traces") + .map_err(LoadLocalTimelineError::ResumeDeletion); + } + + // We're loading fresh timeline that didnt yet make it into remote. 
(None, Some(remote_client)) } - Err(e) => return Err(anyhow::anyhow!(e)), + Err(e) => return Err(LoadLocalTimelineError::Load(anyhow::Error::new(e))), }, - None => (None, remote_client), + None => { + // No remote client + if found_delete_mark { + // There is no remote client, we found local metadata. + // Continue cleaning up local disk. + DeleteTimelineFlow::resume_deletion( + Arc::clone(self), + timeline_id, + &local_metadata, + None, + init_order, + ) + .await + .context("resume deletion") + .map_err(LoadLocalTimelineError::ResumeDeletion)?; + return Ok(()); + } + + (None, remote_client) + } + }; + + let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() { + let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false) + .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}")) + .map_err(LoadLocalTimelineError::Load)?; + Some(ancestor_timeline) + } else { + None }; self.timeline_init_and_sync( @@ -1075,6 +1182,7 @@ impl Tenant { ctx, ) .await + .map_err(LoadLocalTimelineError::Load) } pub fn tenant_id(&self) -> TenantId { @@ -1435,269 +1543,6 @@ impl Tenant { } } - /// Shuts down a timeline's tasks, removes its in-memory structures, and deletes its - /// data from both disk and s3. - async fn delete_timeline( - &self, - timeline_id: TimelineId, - timeline: Arc, - guard: DeletionGuard, - ) -> anyhow::Result<()> { - { - // Grab the layer_removal_cs lock, and actually perform the deletion. - // - // This lock prevents prevents GC or compaction from running at the same time. - // The GC task doesn't register itself with the timeline it's operating on, - // so it might still be running even though we called `shutdown_tasks`. - // - // Note that there are still other race conditions between - // GC, compaction and timeline deletion. See - // https://github.com/neondatabase/neon/issues/2671 - // - // No timeout here, GC & Compaction should be responsive to the - // `TimelineState::Stopping` change. - info!("waiting for layer_removal_cs.lock()"); - let layer_removal_guard = timeline.layer_removal_cs.lock().await; - info!("got layer_removal_cs.lock(), deleting layer files"); - - // NB: remote_timeline_client upload tasks that reference these layers have been cancelled - // by the caller. - - let local_timeline_directory = self - .conf - .timeline_path(&self.tenant_id, &timeline.timeline_id); - - fail::fail_point!("timeline-delete-before-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? - }); - - // NB: This need not be atomic because the deleted flag in the IndexPart - // will be observed during tenant/timeline load. The deletion will be resumed there. - // - // For configurations without remote storage, we tolerate that we're not crash-safe here. - // The timeline may come up Active but with missing layer files, in such setups. - // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720 - match std::fs::remove_dir_all(&local_timeline_directory) { - Err(e) if e.kind() == std::io::ErrorKind::NotFound => { - // This can happen if we're called a second time, e.g., - // because of a previous failure/cancellation at/after - // failpoint timeline-delete-after-rm. - // - // It can also happen if we race with tenant detach, because, - // it doesn't grab the layer_removal_cs lock. - // - // For now, log and continue. - // warn! level is technically not appropriate for the - // first case because we should expect retries to happen. 
- // But the error is so rare, it seems better to get attention if it happens. - let tenant_state = self.current_state(); - warn!( - timeline_dir=?local_timeline_directory, - ?tenant_state, - "timeline directory not found, proceeding anyway" - ); - // continue with the rest of the deletion - } - res => res.with_context(|| { - format!( - "Failed to remove local timeline directory '{}'", - local_timeline_directory.display() - ) - })?, - } - - info!("finished deleting layer files, releasing layer_removal_cs.lock()"); - drop(layer_removal_guard); - } - - fail::fail_point!("timeline-delete-after-rm", |_| { - Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? - }); - - if let Some(remote_client) = &timeline.remote_client { - remote_client.delete_all().await.context("delete_all")? - }; - - pausable_failpoint!("in_progress_delete"); - - { - // Remove the timeline from the map. - let mut timelines = self.timelines.lock().unwrap(); - let children_exist = timelines - .iter() - .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); - // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. - // We already deleted the layer files, so it's probably best to panic. - // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) - if children_exist { - panic!("Timeline grew children while we removed layer files"); - } - - timelines.remove(&timeline_id).expect( - "timeline that we were deleting was concurrently removed from 'timelines' map", - ); - - drop(timelines); - } - - drop(guard); - - Ok(()) - } - - /// Removes timeline-related in-memory data and schedules removal from remote storage. - #[instrument(skip(self, _ctx))] - pub async fn prepare_and_schedule_delete_timeline( - self: Arc, - timeline_id: TimelineId, - _ctx: &RequestContext, - ) -> Result<(), DeleteTimelineError> { - debug_assert_current_span_has_tenant_and_timeline_id(); - - // Transition the timeline into TimelineState::Stopping. - // This should prevent new operations from starting. - // - // Also grab the Timeline's delete_lock to prevent another deletion from starting. - let timeline; - let delete_lock_guard; - { - let mut timelines = self.timelines.lock().unwrap(); - - // Ensure that there are no child timelines **attached to that pageserver**, - // because detach removes files, which will break child branches - let children: Vec = timelines - .iter() - .filter_map(|(id, entry)| { - if entry.get_ancestor_timeline_id() == Some(timeline_id) { - Some(*id) - } else { - None - } - }) - .collect(); - - if !children.is_empty() { - return Err(DeleteTimelineError::HasChildren(children)); - } - - let timeline_entry = match timelines.entry(timeline_id) { - Entry::Occupied(e) => e, - Entry::Vacant(_) => return Err(DeleteTimelineError::NotFound), - }; - - timeline = Arc::clone(timeline_entry.get()); - - // Prevent two tasks from trying to delete the timeline at the same time. - delete_lock_guard = DeletionGuard( - Arc::clone(&timeline.delete_lock) - .try_lock_owned() - .map_err(|_| DeleteTimelineError::AlreadyInProgress)?, - ); - - // If another task finished the deletion just before we acquired the lock, - // return success. - if delete_lock_guard.is_deleted() { - return Ok(()); - } - - timeline.set_state(TimelineState::Stopping); - - drop(timelines); - } - - // Now that the Timeline is in Stopping state, request all the related tasks to - // shut down. 
- // - // NB: If this fails half-way through, and is retried, the retry will go through - // all the same steps again. Make sure the code here is idempotent, and don't - // error out if some of the shutdown tasks have already been completed! - - // Stop the walreceiver first. - debug!("waiting for wal receiver to shutdown"); - let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; - if let Some(walreceiver) = maybe_started_walreceiver { - walreceiver.stop().await; - } - debug!("wal receiver shutdown confirmed"); - - // Prevent new uploads from starting. - if let Some(remote_client) = timeline.remote_client.as_ref() { - let res = remote_client.stop(); - match res { - Ok(()) => {} - Err(e) => match e { - remote_timeline_client::StopError::QueueUninitialized => { - // This case shouldn't happen currently because the - // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. - // That is, before we declare the Tenant as Active. - // But we only allow calls to delete_timeline on Active tenants. - return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); - } - }, - } - } - - // Stop & wait for the remaining timeline tasks, including upload tasks. - // NB: This and other delete_timeline calls do not run as a task_mgr task, - // so, they are not affected by this shutdown_tasks() call. - info!("waiting for timeline tasks to shutdown"); - task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await; - - // Mark timeline as deleted in S3 so we won't pick it up next time - // during attach or pageserver restart. - // See comment in persist_index_part_with_deleted_flag. - if let Some(remote_client) = timeline.remote_client.as_ref() { - match remote_client.persist_index_part_with_deleted_flag().await { - // If we (now, or already) marked it successfully as deleted, we can proceed - Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), - // Bail out otherwise - // - // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents - // two tasks from performing the deletion at the same time. The first task - // that starts deletion should run it to completion. 
- Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) - | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { - return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); - } - } - } - self.schedule_delete_timeline(timeline_id, timeline, delete_lock_guard); - - Ok(()) - } - - fn schedule_delete_timeline( - self: Arc, - timeline_id: TimelineId, - timeline: Arc, - guard: DeletionGuard, - ) { - let tenant_id = self.tenant_id; - let timeline_clone = Arc::clone(&timeline); - - task_mgr::spawn( - task_mgr::BACKGROUND_RUNTIME.handle(), - TaskKind::TimelineDeletionWorker, - Some(self.tenant_id), - Some(timeline_id), - "timeline_delete", - false, - async move { - if let Err(err) = self.delete_timeline(timeline_id, timeline, guard).await { - error!("Error: {err:#}"); - timeline_clone.set_broken(err.to_string()) - }; - Ok(()) - } - .instrument({ - let span = - tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id); - span.follows_from(Span::current()); - span - }), - ); - } - pub fn current_state(&self) -> TenantState { self.state.borrow().clone() } @@ -2154,6 +1999,10 @@ impl Tenant { /// The returned Timeline is in Loading state. The caller is responsible for /// initializing any on-disk state, and for inserting the Timeline to the 'timelines' /// map. + /// + /// `validate_ancestor == false` is used when a timeline is created for deletion + /// and we might not have the ancestor present anymore which is fine for to be + /// deleted timelines. fn create_timeline_struct( &self, new_timeline_id: TimelineId, @@ -2161,12 +2010,14 @@ impl Tenant { ancestor: Option>, remote_client: Option, init_order: Option<&InitializationOrder>, + cause: CreateTimelineCause, ) -> anyhow::Result> { - if let Some(ancestor_timeline_id) = new_metadata.ancestor_timeline() { + if matches!(cause, CreateTimelineCause::Load) { + let ancestor_id = new_metadata.ancestor_timeline(); anyhow::ensure!( - ancestor.is_some(), - "Timeline's {new_timeline_id} ancestor {ancestor_timeline_id} was not found" - ) + ancestor_id == ancestor.as_ref().map(|t| t.timeline_id), + "Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found" + ); } let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start); @@ -2852,7 +2703,14 @@ impl Tenant { }; let timeline_struct = self - .create_timeline_struct(new_timeline_id, new_metadata, ancestor, remote_client, None) + .create_timeline_struct( + new_timeline_id, + new_metadata, + ancestor, + remote_client, + None, + CreateTimelineCause::Load, + ) .context("Failed to create timeline data structure")?; timeline_struct.init_empty_layer_map(start_lsn); @@ -3270,19 +3128,6 @@ pub async fn dump_layerfile_from_path( Ok(()) } -fn ignore_absent_files(fs_operation: F) -> io::Result<()> -where - F: Fn() -> io::Result<()>, -{ - fs_operation().or_else(|e| { - if e.kind() == io::ErrorKind::NotFound { - Ok(()) - } else { - Err(e) - } - }) -} - #[cfg(test)] pub mod harness { use bytes::{Bytes, BytesMut}; diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs new file mode 100644 index 0000000000..02d1c997a3 --- /dev/null +++ b/pageserver/src/tenant/delete.rs @@ -0,0 +1,575 @@ +use std::{ + ops::{Deref, DerefMut}, + sync::Arc, +}; + +use anyhow::Context; +use pageserver_api::models::TimelineState; +use tokio::sync::OwnedMutexGuard; +use tracing::{debug, error, info, instrument, warn, Instrument, Span}; +use utils::{ + crashsafe, fs_ext, + id::{TenantId, TimelineId}, +}; + +use crate::{ + 
config::PageServerConf, + task_mgr::{self, TaskKind}, + tenant::{remote_timeline_client, DeleteTimelineError}, + InitializationOrder, +}; + +use super::{ + metadata::TimelineMetadata, + remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, + CreateTimelineCause, Tenant, Timeline, +}; + +/// Now that the Timeline is in Stopping state, request all the related tasks to shut down. +async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { + // Stop the walreceiver first. + debug!("waiting for wal receiver to shutdown"); + let maybe_started_walreceiver = { timeline.walreceiver.lock().unwrap().take() }; + if let Some(walreceiver) = maybe_started_walreceiver { + walreceiver.stop().await; + } + debug!("wal receiver shutdown confirmed"); + + // Prevent new uploads from starting. + if let Some(remote_client) = timeline.remote_client.as_ref() { + let res = remote_client.stop(); + match res { + Ok(()) => {} + Err(e) => match e { + remote_timeline_client::StopError::QueueUninitialized => { + // This case shouldn't happen currently because the + // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart. + // That is, before we declare the Tenant as Active. + // But we only allow calls to delete_timeline on Active tenants. + return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs"))); + } + }, + } + } + + // Stop & wait for the remaining timeline tasks, including upload tasks. + // NB: This and other delete_timeline calls do not run as a task_mgr task, + // so, they are not affected by this shutdown_tasks() call. + info!("waiting for timeline tasks to shutdown"); + task_mgr::shutdown_tasks(None, Some(timeline.tenant_id), Some(timeline.timeline_id)).await; + + fail::fail_point!("timeline-delete-before-index-deleted-at", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-deleted-at" + ))? + }); + Ok(()) +} + +/// Mark timeline as deleted in S3 so we won't pick it up next time +/// during attach or pageserver restart. +/// See comment in persist_index_part_with_deleted_flag. +async fn set_deleted_in_remote_index(timeline: &Timeline) -> Result<(), DeleteTimelineError> { + if let Some(remote_client) = timeline.remote_client.as_ref() { + match remote_client.persist_index_part_with_deleted_flag().await { + // If we (now, or already) marked it successfully as deleted, we can proceed + Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (), + // Bail out otherwise + // + // AlreadyInProgress shouldn't happen, because the 'delete_lock' prevents + // two tasks from performing the deletion at the same time. The first task + // that starts deletion should run it to completion. + Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_)) + | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => { + return Err(DeleteTimelineError::Other(anyhow::anyhow!(e))); + } + } + } + Ok(()) +} + +// We delete local files first, so if pageserver restarts after local files deletion then remote deletion is not continued. +// This can be solved with inversion of these steps. But even if these steps are inverted then, when index_part.json +// gets deleted there is no way to distinguish between "this timeline is good, we just didnt upload it to remote" +// and "this timeline is deleted we should continue with removal of local state". 
So to avoid the ambiguity we use a mark file. +// After index part is deleted presence of this mark file indentifies that it was a deletion intention. +// So we can just remove the mark file. +async fn create_delete_mark( + conf: &PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, +) -> Result<(), DeleteTimelineError> { + fail::fail_point!("timeline-delete-before-delete-mark", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-delete-mark" + ))? + }); + let marker_path = conf.timeline_delete_mark_file_path(tenant_id, timeline_id); + + // Note: we're ok to replace existing file. + let _ = std::fs::OpenOptions::new() + .write(true) + .create(true) + .open(&marker_path) + .with_context(|| format!("could not create delete marker file {marker_path:?}"))?; + + crashsafe::fsync_file_and_parent(&marker_path).context("sync_mark")?; + Ok(()) +} + +/// Grab the layer_removal_cs lock, and actually perform the deletion. +/// +/// This lock prevents prevents GC or compaction from running at the same time. +/// The GC task doesn't register itself with the timeline it's operating on, +/// so it might still be running even though we called `shutdown_tasks`. +/// +/// Note that there are still other race conditions between +/// GC, compaction and timeline deletion. See +/// +/// +/// No timeout here, GC & Compaction should be responsive to the +/// `TimelineState::Stopping` change. +async fn delete_local_layer_files( + conf: &PageServerConf, + tenant_id: TenantId, + timeline: &Timeline, +) -> anyhow::Result<()> { + info!("waiting for layer_removal_cs.lock()"); + let layer_removal_guard = timeline.layer_removal_cs.lock().await; + info!("got layer_removal_cs.lock(), deleting layer files"); + + // NB: storage_sync upload tasks that reference these layers have been cancelled + // by the caller. + + let local_timeline_directory = conf.timeline_path(&tenant_id, &timeline.timeline_id); + + fail::fail_point!("timeline-delete-before-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))? + }); + + // NB: This need not be atomic because the deleted flag in the IndexPart + // will be observed during tenant/timeline load. The deletion will be resumed there. + // + // For configurations without remote storage, we guarantee crash-safety by persising delete mark file. + // + // Note that here we do not bail out on std::io::ErrorKind::NotFound. + // This can happen if we're called a second time, e.g., + // because of a previous failure/cancellation at/after + // failpoint timeline-delete-after-rm. + // + // It can also happen if we race with tenant detach, because, + // it doesn't grab the layer_removal_cs lock. + // + // For now, log and continue. + // warn! level is technically not appropriate for the + // first case because we should expect retries to happen. + // But the error is so rare, it seems better to get attention if it happens. + // + // Note that metadata removal is skipped, this is not technically needed, + // but allows to reuse timeline loading code during resumed deletion. + // (we always expect that metadata is in place when timeline is being loaded) + + #[cfg(feature = "testing")] + let mut counter = 0; + + // Timeline directory may not exist if we failed to delete mark file and request was retried. 
+ if !local_timeline_directory.exists() { + return Ok(()); + } + + let metadata_path = conf.metadata_path(&tenant_id, &timeline.timeline_id); + + for entry in walkdir::WalkDir::new(&local_timeline_directory).contents_first(true) { + #[cfg(feature = "testing")] + { + counter += 1; + if counter == 2 { + fail::fail_point!("timeline-delete-during-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-during-rm"))? + }); + } + } + + let entry = entry?; + if entry.path() == metadata_path { + debug!("found metadata, skipping"); + continue; + } + + if entry.path() == local_timeline_directory { + // Keeping directory because metedata file is still there + debug!("found timeline dir itself, skipping"); + continue; + } + + let metadata = match entry.metadata() { + Ok(metadata) => metadata, + Err(e) => { + if crate::is_walkdir_io_not_found(&e) { + warn!( + timeline_dir=?local_timeline_directory, + path=?entry.path().display(), + "got not found err while removing timeline dir, proceeding anyway" + ); + continue; + } + anyhow::bail!(e); + } + }; + + let r = if metadata.is_dir() { + // There shouldnt be any directories inside timeline dir as of current layout. + tokio::fs::remove_dir(entry.path()).await + } else { + tokio::fs::remove_file(entry.path()).await + }; + + if let Err(e) = r { + if e.kind() == std::io::ErrorKind::NotFound { + warn!( + timeline_dir=?local_timeline_directory, + path=?entry.path().display(), + "got not found err while removing timeline dir, proceeding anyway" + ); + continue; + } + anyhow::bail!(anyhow::anyhow!( + "Failed to remove: {}. Error: {e}", + entry.path().display() + )); + } + } + + info!("finished deleting layer files, releasing layer_removal_cs.lock()"); + drop(layer_removal_guard); + + fail::fail_point!("timeline-delete-after-rm", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))? + }); + + Ok(()) +} + +/// Removes remote layers and an index file after them. +async fn delete_remote_layers_and_index(timeline: &Timeline) -> anyhow::Result<()> { + if let Some(remote_client) = &timeline.remote_client { + remote_client.delete_all().await.context("delete_all")? + }; + + Ok(()) +} + +// This function removs remaining traces of a timeline on disk. +// Namely: metadata file, timeline directory, delete mark. +// Note: io::ErrorKind::NotFound are ignored for metadata and timeline dir. +// delete mark should be present because it is the last step during deletion. +// (nothing can fail after its deletion) +async fn cleanup_remaining_timeline_fs_traces( + conf: &PageServerConf, + tenant_id: TenantId, + timeline_id: TimelineId, +) -> anyhow::Result<()> { + // Remove local metadata + tokio::fs::remove_file(conf.metadata_path(&tenant_id, &timeline_id)) + .await + .or_else(fs_ext::ignore_not_found) + .context("remove metadata")?; + + fail::fail_point!("timeline-delete-after-rm-metadata", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-after-rm-metadata" + ))? + }); + + // Remove timeline dir + tokio::fs::remove_dir(conf.timeline_path(&tenant_id, &timeline_id)) + .await + .or_else(fs_ext::ignore_not_found) + .context("timeline dir")?; + + fail::fail_point!("timeline-delete-after-rm-dir", |_| { + Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm-dir"))? + }); + + // Remove delete mark + tokio::fs::remove_file(conf.timeline_delete_mark_file_path(tenant_id, timeline_id)) + .await + .context("remove delete mark") +} + +/// It is important that this gets called when DeletionGuard is being held. 
+/// For more context see comments in [`DeleteTimelineFlow::prepare`] +async fn remove_timeline_from_tenant( + tenant: &Tenant, + timeline_id: TimelineId, + _: &DeletionGuard, // using it as a witness +) -> anyhow::Result<()> { + // Remove the timeline from the map. + let mut timelines = tenant.timelines.lock().unwrap(); + let children_exist = timelines + .iter() + .any(|(_, entry)| entry.get_ancestor_timeline_id() == Some(timeline_id)); + // XXX this can happen because `branch_timeline` doesn't check `TimelineState::Stopping`. + // We already deleted the layer files, so it's probably best to panic. + // (Ideally, above remove_dir_all is atomic so we don't see this timeline after a restart) + if children_exist { + panic!("Timeline grew children while we removed layer files"); + } + + timelines + .remove(&timeline_id) + .expect("timeline that we were deleting was concurrently removed from 'timelines' map"); + + drop(timelines); + + Ok(()) +} + +/// Orchestrates timeline shut down of all timeline tasks, removes its in-memory structures, +/// and deletes its data from both disk and s3. +/// The sequence of steps: +/// 1. Set deleted_at in remote index part. +/// 2. Create local mark file. +/// 3. Delete local files except metadata (it is simpler this way, to be able to reuse timeline initialization code that expects metadata) +/// 4. Delete remote layers +/// 5. Delete index part +/// 6. Delete meta, timeline directory +/// 7. Delete mark file +/// It is resumable from any step in case a crash/restart occurs. +/// There are three entrypoints to the process: +/// 1. [`DeleteTimelineFlow::run`] this is the main one called by a management api handler. +/// 2. [`DeleteTimelineFlow::resume_deletion`] is called during restarts when local metadata is still present +/// and we possibly neeed to continue deletion of remote files. +/// 3. [`DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`] is used when we deleted remote +/// index but still have local metadata, timeline directory and delete mark. +/// Note the only other place that messes around timeline delete mark is the logic that scans directory with timelines during tenant load. +#[derive(Default)] +pub enum DeleteTimelineFlow { + #[default] + NotStarted, + InProgress, + Finished, +} + +impl DeleteTimelineFlow { + // These steps are run in the context of management api request handler. + // Long running steps are continued to run in the background. + // NB: If this fails half-way through, and is retried, the retry will go through + // all the same steps again. Make sure the code here is idempotent, and don't + // error out if some of the shutdown tasks have already been completed! + #[instrument(skip_all, fields(tenant_id=%tenant.tenant_id, %timeline_id))] + pub async fn run( + tenant: &Arc, + timeline_id: TimelineId, + ) -> Result<(), DeleteTimelineError> { + let (timeline, mut guard) = Self::prepare(tenant, timeline_id)?; + + guard.mark_in_progress()?; + + stop_tasks(&timeline).await?; + + set_deleted_in_remote_index(&timeline).await?; + + create_delete_mark(tenant.conf, timeline.tenant_id, timeline.timeline_id).await?; + + fail::fail_point!("timeline-delete-before-schedule", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-schedule" + ))? + }); + + Self::schedule_background(guard, tenant.conf, Arc::clone(tenant), timeline); + + Ok(()) + } + + fn mark_in_progress(&mut self) -> anyhow::Result<()> { + match self { + Self::Finished => anyhow::bail!("Bug. Is in finished state"), + Self::InProgress { .. 
} => { /* We're in a retry */ } + Self::NotStarted => { /* Fresh start */ } + } + + *self = Self::InProgress; + + Ok(()) + } + + /// Shortcut to create Timeline in stopping state and spawn deletion task. + pub async fn resume_deletion( + tenant: Arc, + timeline_id: TimelineId, + local_metadata: &TimelineMetadata, + remote_client: Option, + init_order: Option<&InitializationOrder>, + ) -> anyhow::Result<()> { + let timeline = tenant + .create_timeline_struct( + timeline_id, + local_metadata, + None, // Ancestor is not needed for deletion. + remote_client, + init_order, + // Important. We dont pass ancestor above because it can be missing. + // Thus we need to skip the validation here. + CreateTimelineCause::Delete, + ) + .context("create_timeline_struct")?; + + let mut guard = DeletionGuard( + Arc::clone(&timeline.delete_progress) + .try_lock_owned() + .expect("cannot happen because we're the only owner"), + ); + + // Note: here we even skip populating layer map. Timeline is essentially uninitialized. + // RemoteTimelineClient is the only functioning part. + timeline.set_state(TimelineState::Stopping); + // We meed to do this because when console retries delete request we shouldnt answer with 404 + // because 404 means successful deletion. + { + let mut locked = tenant.timelines.lock().unwrap(); + locked.insert(timeline_id, Arc::clone(&timeline)); + } + + guard.mark_in_progress()?; + + // Note that delete mark can be missing on resume + // because we create delete mark after we set deleted_at in the index part. + create_delete_mark(tenant.conf, tenant.tenant_id, timeline_id).await?; + + Self::schedule_background(guard, tenant.conf, tenant, timeline); + + Ok(()) + } + + pub async fn cleanup_remaining_timeline_fs_traces( + tenant: &Tenant, + timeline_id: TimelineId, + ) -> anyhow::Result<()> { + cleanup_remaining_timeline_fs_traces(tenant.conf, tenant.tenant_id, timeline_id).await + } + + fn prepare( + tenant: &Tenant, + timeline_id: TimelineId, + ) -> Result<(Arc, DeletionGuard), DeleteTimelineError> { + // Note the interaction between this guard and deletion guard. + // Here we attempt to lock deletion guard when we're holding a lock on timelines. + // This is important because when you take into account `remove_timeline_from_tenant` + // we remove timeline from memory when we still hold the deletion guard. + // So here when timeline deletion is finished timeline wont be present in timelines map at all + // which makes the following sequence impossible: + // T1: get preempted right before the try_lock on `Timeline::delete_progress` + // T2: do a full deletion, acquire and drop `Timeline::delete_progress` + // T1: acquire deletion lock, do another `DeleteTimelineFlow::run` + // For more context see this discussion: `https://github.com/neondatabase/neon/pull/4552#discussion_r1253437346` + let timelines = tenant.timelines.lock().unwrap(); + + let timeline = match timelines.get(&timeline_id) { + Some(t) => t, + None => return Err(DeleteTimelineError::NotFound), + }; + + // Ensure that there are no child timelines **attached to that pageserver**, + // because detach removes files, which will break child branches + let children: Vec = timelines + .iter() + .filter_map(|(id, entry)| { + if entry.get_ancestor_timeline_id() == Some(timeline_id) { + Some(*id) + } else { + None + } + }) + .collect(); + + if !children.is_empty() { + return Err(DeleteTimelineError::HasChildren(children)); + } + + // Note that using try_lock here is important to avoid a deadlock. 
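+        // (`try_lock` fails fast, mapped to `AlreadyInProgress` below, instead of waiting on the
+        // deletion mutex while the `timelines` lock is held.)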
+ // Here we take lock on timelines and then the deletion guard. + // At the end of the operation we're holding the guard and need to lock timelines map + // to remove the timeline from it. + // Always if you have two locks that are taken in different order this can result in a deadlock. + let delete_lock_guard = DeletionGuard( + Arc::clone(&timeline.delete_progress) + .try_lock_owned() + .map_err(|_| DeleteTimelineError::AlreadyInProgress)?, + ); + + timeline.set_state(TimelineState::Stopping); + + Ok((Arc::clone(timeline), delete_lock_guard)) + } + + fn schedule_background( + guard: DeletionGuard, + conf: &'static PageServerConf, + tenant: Arc, + timeline: Arc, + ) { + let tenant_id = timeline.tenant_id; + let timeline_id = timeline.timeline_id; + + task_mgr::spawn( + task_mgr::BACKGROUND_RUNTIME.handle(), + TaskKind::TimelineDeletionWorker, + Some(tenant_id), + Some(timeline_id), + "timeline_delete", + false, + async move { + if let Err(err) = Self::background(guard, conf, &tenant, &timeline).await { + error!("Error: {err:#}"); + timeline.set_broken(format!("{err:#}")) + }; + Ok(()) + } + .instrument({ + let span = + tracing::info_span!(parent: None, "delete_timeline", tenant_id=%tenant_id, timeline_id=%timeline_id); + span.follows_from(Span::current()); + span + }), + ); + } + + async fn background( + mut guard: DeletionGuard, + conf: &PageServerConf, + tenant: &Tenant, + timeline: &Timeline, + ) -> Result<(), DeleteTimelineError> { + delete_local_layer_files(conf, tenant.tenant_id, timeline).await?; + + delete_remote_layers_and_index(timeline).await?; + + pausable_failpoint!("in_progress_delete"); + + cleanup_remaining_timeline_fs_traces(conf, tenant.tenant_id, timeline.timeline_id).await?; + + remove_timeline_from_tenant(tenant, timeline.timeline_id, &guard).await?; + + *guard.0 = Self::Finished; + + Ok(()) + } +} + +struct DeletionGuard(OwnedMutexGuard); + +impl Deref for DeletionGuard { + type Target = DeleteTimelineFlow; + + fn deref(&self) -> &Self::Target { + &self.0 + } +} + +impl DerefMut for DeletionGuard { + fn deref_mut(&mut self) -> &mut Self::Target { + &mut self.0 + } +} diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index eeb84caf13..aeecc88602 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -26,6 +26,8 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; +use super::delete::DeleteTimelineFlow; + /// The tenants known to the pageserver. /// The enum variants are used to distinguish the different states that the pageserver can be in. enum TenantsMap { @@ -421,12 +423,10 @@ pub enum DeleteTimelineError { pub async fn delete_timeline( tenant_id: TenantId, timeline_id: TimelineId, - ctx: &RequestContext, + _ctx: &RequestContext, ) -> Result<(), DeleteTimelineError> { let tenant = get_tenant(tenant_id, true).await?; - tenant - .prepare_and_schedule_delete_timeline(timeline_id, ctx) - .await?; + DeleteTimelineFlow::run(&tenant, timeline_id).await?; Ok(()) } diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs index fee7f0c28e..8d002a8570 100644 --- a/pageserver/src/tenant/remote_timeline_client.rs +++ b/pageserver/src/tenant/remote_timeline_client.rs @@ -827,7 +827,7 @@ impl RemoteTimelineClient { ) }; - receiver.changed().await?; + receiver.changed().await.context("upload queue shut down")?; // Do not delete index part yet, it is needed for possible retry. 
If we remove it first // and retry will arrive to different pageserver there wont be any traces of it on remote storage @@ -855,11 +855,23 @@ impl RemoteTimelineClient { self.storage_impl.delete_objects(&remaining).await?; } + fail::fail_point!("timeline-delete-before-index-delete", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-before-index-delete" + ))? + }); + let index_file_path = timeline_storage_path.join(Path::new(IndexPart::FILE_NAME)); debug!("deleting index part"); self.storage_impl.delete(&index_file_path).await?; + fail::fail_point!("timeline-delete-after-index-delete", |_| { + Err(anyhow::anyhow!( + "failpoint: timeline-delete-after-index-delete" + ))? + }); + info!(prefix=%timeline_storage_path, referenced=deletions_queued, not_referenced=%remaining.len(), "done deleting in timeline prefix, including index_part.json"); Ok(()) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c663a4f9ad..af9edbf95e 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -86,6 +86,7 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::config::TenantConf; +use super::delete::DeleteTimelineFlow; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; use super::storage_layer::{ @@ -237,11 +238,10 @@ pub struct Timeline { /// Layer removal lock. /// A lock to ensure that no layer of the timeline is removed concurrently by other tasks. - /// This lock is acquired in [`Timeline::gc`], [`Timeline::compact`], - /// and [`Tenant::delete_timeline`]. This is an `Arc` lock because we need an owned + /// This lock is acquired in [`Timeline::gc`] and [`Timeline::compact`]. + /// This is an `Arc` lock because we need an owned /// lock guard in functions that will be spawned to tokio I/O pool (which requires `'static`). - /// - /// [`Tenant::delete_timeline`]: super::Tenant::delete_timeline + /// Note that [`DeleteTimelineFlow`] uses `delete_progress` field. pub(super) layer_removal_cs: Arc>, // Needed to ensure that we can't create a branch at a point that was already garbage collected @@ -283,7 +283,7 @@ pub struct Timeline { /// Prevent two tasks from deleting the timeline at the same time. If held, the /// timeline is being deleted. If 'true', the timeline has already been deleted. - pub delete_lock: Arc>, + pub delete_progress: Arc>, eviction_task_timeline_state: tokio::sync::Mutex, @@ -1453,7 +1453,7 @@ impl Timeline { eviction_task_timeline_state: tokio::sync::Mutex::new( EvictionTaskTimelineState::default(), ), - delete_lock: Arc::new(tokio::sync::Mutex::new(false)), + delete_progress: Arc::new(tokio::sync::Mutex::new(DeleteTimelineFlow::default())), initial_logical_size_can_start, initial_logical_size_attempt: Mutex::new(initial_logical_size_attempt), @@ -1918,6 +1918,15 @@ impl Timeline { } fn try_spawn_size_init_task(self: &Arc, lsn: Lsn, ctx: &RequestContext) { + let state = self.current_state(); + if matches!( + state, + TimelineState::Broken { .. } | TimelineState::Stopping + ) { + // Can happen when timeline detail endpoint is used when deletion is ongoing (or its broken). 
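+            // Computing the initial logical size here would be pointless and could race with
+            // the ongoing deletion, so skip spawning the task.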
+ return; + } + let permit = match Arc::clone(&self.current_logical_size.initial_size_computation) .try_acquire_owned() { diff --git a/pageserver/src/tenant/timeline/uninit.rs b/pageserver/src/tenant/timeline/uninit.rs index b8cc65f4b1..5a15e86458 100644 --- a/pageserver/src/tenant/timeline/uninit.rs +++ b/pageserver/src/tenant/timeline/uninit.rs @@ -2,13 +2,9 @@ use std::{collections::hash_map::Entry, fs, path::PathBuf, sync::Arc}; use anyhow::Context; use tracing::{error, info, info_span, warn}; -use utils::{crashsafe, id::TimelineId, lsn::Lsn}; +use utils::{crashsafe, fs_ext, id::TimelineId, lsn::Lsn}; -use crate::{ - context::RequestContext, - import_datadir, - tenant::{ignore_absent_files, Tenant}, -}; +use crate::{context::RequestContext, import_datadir, tenant::Tenant}; use super::Timeline; @@ -141,7 +137,7 @@ impl Drop for UninitializedTimeline<'_> { pub(crate) fn cleanup_timeline_directory(uninit_mark: TimelineUninitMark) { let timeline_path = &uninit_mark.timeline_path; - match ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { + match fs_ext::ignore_absent_files(|| fs::remove_dir_all(timeline_path)) { Ok(()) => { info!("Timeline dir {timeline_path:?} removed successfully, removing the uninit mark") } @@ -185,7 +181,7 @@ impl TimelineUninitMark { let uninit_mark_parent = uninit_mark_file .parent() .with_context(|| format!("Uninit mark file {uninit_mark_file:?} has no parent"))?; - ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { + fs_ext::ignore_absent_files(|| fs::remove_file(uninit_mark_file)).with_context(|| { format!("Failed to remove uninit mark file at path {uninit_mark_file:?}") })?; crashsafe::fsync(uninit_mark_parent).context("Failed to fsync uninit mark parent")?; diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py index ad89ebad00..f8a4423ffa 100644 --- a/test_runner/fixtures/pageserver/utils.py +++ b/test_runner/fixtures/pageserver/utils.py @@ -194,14 +194,18 @@ def wait_for_upload_queue_empty( def wait_timeline_detail_404( - pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId + pageserver_http: PageserverHttpClient, + tenant_id: TenantId, + timeline_id: TimelineId, + wait_longer: bool = False, ): last_exc = None - for _ in range(2): + iterations = 10 if wait_longer else 2 + for _ in range(iterations): time.sleep(0.250) try: data = pageserver_http.timeline_detail(tenant_id, timeline_id) - log.error(f"detail {data}") + log.info(f"detail {data}") except PageserverApiException as e: log.debug(e) if e.status_code == 404: @@ -216,7 +220,8 @@ def timeline_delete_wait_completed( pageserver_http: PageserverHttpClient, tenant_id: TenantId, timeline_id: TimelineId, + wait_longer: bool = False, # Use when running with RemoteStorageKind.REAL_S3 **delete_args, ): pageserver_http.timeline_delete(tenant_id=tenant_id, timeline_id=timeline_id, **delete_args) - wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id) + wait_timeline_detail_404(pageserver_http, tenant_id, timeline_id, wait_longer) diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py index a4c5bf626a..9226ca21d2 100644 --- a/test_runner/regress/test_timeline_delete.py +++ b/test_runner/regress/test_timeline_delete.py @@ -1,3 +1,4 @@ +import enum import os import queue import shutil @@ -11,9 +12,12 @@ from fixtures.log_helper import log from fixtures.neon_fixtures import ( NeonEnv, NeonEnvBuilder, + PgBin, RemoteStorageKind, S3Storage, 
available_remote_storages, + last_flush_lsn_upload, + wait_for_last_flush_lsn, ) from fixtures.pageserver.http import PageserverApiException from fixtures.pageserver.utils import ( @@ -117,59 +121,183 @@ def test_timeline_delete(neon_simple_env: NeonEnv): ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id) +class Check(enum.Enum): + RETRY_WITHOUT_RESTART = enum.auto() + RETRY_WITH_RESTART = enum.auto() + + +DELETE_FAILPOINTS = [ + "timeline-delete-before-index-deleted-at", + "timeline-delete-before-schedule", + "timeline-delete-before-rm", + "timeline-delete-during-rm", + "timeline-delete-after-rm", + "timeline-delete-before-index-delete", + "timeline-delete-after-index-delete", + "timeline-delete-after-rm-metadata", + "timeline-delete-after-rm-dir", +] + + +def combinations(): + result = [] + + remotes = [RemoteStorageKind.NOOP, RemoteStorageKind.MOCK_S3] + if os.getenv("ENABLE_REAL_S3_REMOTE_STORAGE"): + remotes.append(RemoteStorageKind.REAL_S3) + + for remote_storage_kind in remotes: + for delete_failpoint in DELETE_FAILPOINTS: + if remote_storage_kind == RemoteStorageKind.NOOP and delete_failpoint in ( + "timeline-delete-before-index-delete", + "timeline-delete-after-index-delete", + ): + # the above failpoints are not relevant for config without remote storage + continue + + result.append((remote_storage_kind, delete_failpoint)) + return result + + # cover the two cases: remote storage configured vs not configured -@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS]) -def test_delete_timeline_post_rm_failure( - neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind +@pytest.mark.parametrize("remote_storage_kind, failpoint", combinations()) +@pytest.mark.parametrize("check", list(Check)) +def test_delete_timeline_exercise_crash_safety_failpoints( + neon_env_builder: NeonEnvBuilder, + remote_storage_kind: RemoteStorageKind, + failpoint: str, + check: Check, + pg_bin: PgBin, ): """ - If there is a failure after removing the timeline directory, the delete operation - should be retryable. + If there is a failure during deletion in one of the associated failpoints (or crash restart happens at this point) the delete operation + should be retryable and should be successfully resumed. + + We iterate over failpoints list, changing failpoint to the next one. + + 1. Set settings to generate many layers + 2. Create branch. + 3. Insert something + 4. Go with the test. + 5. Iterate over failpoints + 6. Execute delete for each failpoint + 7. Ensure failpoint is hit + 8. Retry or restart without the failpoint and check the result. 
""" if remote_storage_kind is not None: neon_env_builder.enable_remote_storage( - remote_storage_kind, "test_delete_timeline_post_rm_failure" + remote_storage_kind, "test_delete_timeline_exercise_crash_safety_failpoints" ) - env = neon_env_builder.init_start() - assert env.initial_timeline - - env.pageserver.allowed_errors.append(".*Error: failpoint: timeline-delete-after-rm") - env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") + env = neon_env_builder.init_start( + initial_tenant_conf={ + "gc_period": "0s", + "compaction_period": "0s", + "checkpoint_distance": f"{1024 ** 2}", + "image_creation_threshold": "100", + } + ) ps_http = env.pageserver.http_client() - failpoint_name = "timeline-delete-after-rm" - ps_http.configure_failpoints((failpoint_name, "return")) + timeline_id = env.neon_cli.create_timeline("delete") + with env.endpoints.create_start("delete") as endpoint: + # generate enough layers + pg_bin.run(["pgbench", "-i", "-I dtGvp", "-s1", endpoint.connstr()]) + if remote_storage_kind is RemoteStorageKind.NOOP: + wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, timeline_id) + else: + last_flush_lsn_upload(env, endpoint, env.initial_tenant, timeline_id) - ps_http.timeline_delete(env.initial_tenant, env.initial_timeline) - wait_until_timeline_state( - pageserver_http=ps_http, - tenant_id=env.initial_tenant, - timeline_id=env.initial_timeline, - expected_state="Broken", - iterations=2, # effectively try immediately and retry once in one second - ) - - # FIXME: #4719 - # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" - - at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*" - env.pageserver.allowed_errors.append(at_failpoint_log_message) + env.pageserver.allowed_errors.append(f".*{timeline_id}.*failpoint: {failpoint}") + # It appears when we stopped flush loop during deletion and then pageserver is stopped env.pageserver.allowed_errors.append( - f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}" + ".*freeze_and_flush_on_shutdown.*failed to freeze and flush: cannot flush frozen layers when flush_loop is not running, state is Exited" ) - - # retry without failpoint, it should succeed - ps_http.configure_failpoints((failpoint_name, "off")) - - # this should succeed - # this also checks that delete can be retried even when timeline is in Broken state - timeline_delete_wait_completed(ps_http, env.initial_tenant, env.initial_timeline) + # This happens when we fail before scheduling background operation. + # Timeline is left in stopping state and retry tries to stop it again. env.pageserver.allowed_errors.append( - f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*" + ".*Ignoring new state, equal to the existing one: Stopping" ) + # This happens when we retry delete requests for broken timelines + env.pageserver.allowed_errors.append(".*Ignoring state update Stopping for broken timeline") + # This happens when timeline remains are cleaned up during loading + env.pageserver.allowed_errors.append(".*Timeline dir entry become invalid.*") + # In one of the branches we poll for tenant to become active. Polls can generate this log message: + env.pageserver.allowed_errors.append(f".*Tenant {env.initial_tenant} is not active*") + + ps_http.configure_failpoints((failpoint, "return")) + + # These failpoints are earlier than background task is spawned. + # so they result in api request failure. 
+ if failpoint in ( + "timeline-delete-before-index-deleted-at", + "timeline-delete-before-schedule", + ): + with pytest.raises(PageserverApiException, match=failpoint): + ps_http.timeline_delete(env.initial_tenant, timeline_id) + + else: + ps_http.timeline_delete(env.initial_tenant, timeline_id) + timeline_info = wait_until_timeline_state( + pageserver_http=ps_http, + tenant_id=env.initial_tenant, + timeline_id=timeline_id, + expected_state="Broken", + iterations=2, # effectively try immediately and retry once in one second + ) + + reason = timeline_info["state"]["Broken"]["reason"] + log.info(f"timeline broken: {reason}") + + # failpoint may not be the only error in the stack + assert reason.endswith(f"failpoint: {failpoint}"), reason + + wait_longer = remote_storage_kind is RemoteStorageKind.REAL_S3 + if check is Check.RETRY_WITH_RESTART: + env.pageserver.stop() + env.pageserver.start() + if failpoint == "timeline-delete-before-index-deleted-at": + # We crashed before persisting this to remote storage, need to retry delete request + + # Wait till tenant is loaded. Shouldnt take longer than 2 seconds (we shouldnt block tenant loading) + wait_until_tenant_active(ps_http, env.initial_tenant, iterations=2) + + timeline_delete_wait_completed(ps_http, env.initial_tenant, timeline_id) + else: + # Pageserver should've resumed deletion after restart. + wait_timeline_detail_404( + ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer + ) + elif check is Check.RETRY_WITHOUT_RESTART: + # this should succeed + # this also checks that delete can be retried even when timeline is in Broken state + ps_http.configure_failpoints((failpoint, "off")) + + timeline_delete_wait_completed( + ps_http, env.initial_tenant, timeline_id, wait_longer=wait_longer + ) + + # Check remote is impty + if remote_storage_kind is RemoteStorageKind.MOCK_S3: + assert_prefix_empty( + neon_env_builder, + prefix="/".join( + ( + "tenants", + str(env.initial_tenant), + "timelines", + str(timeline_id), + ) + ), + ) + + timeline_dir = env.timeline_dir(env.initial_tenant, timeline_id) + # Check local is empty + assert not timeline_dir.exists() + # Check no delete mark present + assert not (timeline_dir.parent / f"{timeline_id}.___deleted").exists() @pytest.mark.parametrize("remote_storage_kind", available_remote_storages()) @@ -327,7 +455,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild ) ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id) - wait_until_timeline_state( + timeline_info = wait_until_timeline_state( pageserver_http=ps_http, tenant_id=env.initial_tenant, timeline_id=leaf_timeline_id, @@ -335,8 +463,7 @@ def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuild iterations=2, # effectively try immediately and retry once in one second ) - # FIXME: #4719 - # timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-after-rm" + assert timeline_info["state"]["Broken"]["reason"] == "failpoint: timeline-delete-before-rm" assert leaf_timeline_path.exists(), "the failpoint didn't work" @@ -588,6 +715,7 @@ def test_timeline_delete_works_for_remote_smoke( assert tenant_id == env.initial_tenant assert main_timeline_id == env.initial_timeline + assert env.initial_timeline is not None timeline_ids = [env.initial_timeline] for i in range(2): branch_timeline_id = env.neon_cli.create_branch(f"new{i}", "main") From bcc2aee704100425de3f10e760fca88b689774b0 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Tue, 25 Jul 2023 11:08:24 -0400 Subject: 
[PATCH 04/37] proxy: add tests for batch http sql (#4793) This PR adds an integration test case for batch HTTP SQL endpoint. https://github.com/neondatabase/neon/pull/4654/ should be merged first before we land this PR. --------- Signed-off-by: Alex Chi Z --- proxy/src/http/sql_over_http.rs | 2 +- test_runner/regress/test_proxy.py | 53 ++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index 4ed650a392..1f83fcfc68 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -223,7 +223,7 @@ pub async fn handle( } } transaction.commit().await?; - Ok(Value::Array(results)) + Ok(json!({ "results": results })) } }; diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index 1f6dcd39e9..d5bf98109c 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -1,6 +1,6 @@ import json import subprocess -from typing import Any, List, Optional +from typing import Any, List, Optional, Tuple import psycopg2 import pytest @@ -260,3 +260,54 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy): rows = q("select 1 as n, 'a' as s, '{1,2,3}'::int4[] as arr", True, True)["rows"] assert rows == [["1", "a", "{1,2,3}"]] + + +def test_sql_over_http_batch(static_proxy: NeonProxy): + static_proxy.safe_psql("create role http with login password 'http' superuser") + + def qq(queries: List[Tuple[str, Optional[List[Any]]]]) -> Any: + connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" + response = requests.post( + f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", + data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))), + headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + verify=str(static_proxy.test_output_dir / "proxy.crt"), + ) + assert response.status_code == 200 + return response.json()["results"] + + result = qq( + [ + ("select 42 as answer", None), + ("select $1 as answer", [42]), + ("select $1 * 1 as answer", [42]), + ("select $1::int[] as answer", [[1, 2, 3]]), + ("select $1::json->'a' as answer", [{"a": {"b": 42}}]), + ("select * from pg_class limit 1", None), + ("create table t(id serial primary key, val int)", None), + ("insert into t(val) values (10), (20), (30) returning id", None), + ("select * from t", None), + ("drop table t", None), + ] + ) + + assert result[0]["rows"] == [{"answer": 42}] + assert result[1]["rows"] == [{"answer": "42"}] + assert result[2]["rows"] == [{"answer": 42}] + assert result[3]["rows"] == [{"answer": [1, 2, 3]}] + assert result[4]["rows"] == [{"answer": {"b": 42}}] + assert len(result[5]["rows"]) == 1 + res = result[6] + assert res["command"] == "CREATE" + assert res["rowCount"] is None + res = result[7] + assert res["command"] == "INSERT" + assert res["rowCount"] == 3 + assert res["rows"] == [{"id": 1}, {"id": 2}, {"id": 3}] + res = result[8] + assert res["command"] == "SELECT" + assert res["rowCount"] == 3 + res = result[9] + assert res["command"] == "DROP" + assert res["rowCount"] is None + assert len(result) == 10 From 2ebd2ce2b6dbc143d2ec4715d3d4caa2cfb5d640 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 25 Jul 2023 16:57:42 +0100 Subject: [PATCH 05/37] proxy: record connection type (#4802) ## Problem We want to measure how many users are using TCP/WS connections. 
We also want to measure how long it takes to establish a connection with the compute node. I plan to also add a separate counter for HTTP requests, but because of pooling this needs to be disambiguated against new HTTP compute connections ## Summary of changes * record connection type (ws/tcp) in the connection counters. * record connection latency including retry latency --- proxy/src/proxy.rs | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index d317d382a7..2cdd1582ac 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -17,7 +17,9 @@ use anyhow::{bail, Context}; use async_trait::async_trait; use futures::TryFutureExt; use hyper::StatusCode; -use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; +use metrics::{ + exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec, +}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::{error::Error, io, ops::ControlFlow, sync::Arc}; @@ -38,18 +40,30 @@ const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; -static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter!( +static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( "proxy_accepted_connections_total", - "Number of TCP client connections accepted." + "Number of TCP client connections accepted.", + &["protocol"], ) .unwrap() }); -static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter!( +static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( "proxy_closed_connections_total", - "Number of TCP client connections closed." + "Number of TCP client connections closed.", + &["protocol"], + ) + .unwrap() +}); + +static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_compute_connection_latency_seconds", + "Time it took for proxy to establish a connection to the compute endpoint", + // largest bucket = 2^16 * 0.5ms = 32s + exponential_buckets(0.0005, 2.0, 16).unwrap(), ) .unwrap() }); @@ -137,6 +151,13 @@ pub enum ClientMode { /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { + fn protocol_label(&self) -> &'static str { + match self { + ClientMode::Tcp => "tcp", + ClientMode::Websockets { .. } => "ws", + } + } + fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, @@ -176,9 +197,11 @@ pub async fn handle_client( mode: ClientMode, ) -> anyhow::Result<()> { // The `closed` counter will increase when this future is destroyed. - NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); + NUM_CONNECTIONS_ACCEPTED_COUNTER + .with_label_values(&[mode.protocol_label()]) + .inc(); scopeguard::defer! 
{ - NUM_CONNECTIONS_CLOSED_COUNTER.inc(); + NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc(); } let tls = config.tls_config.as_ref(); @@ -380,6 +403,8 @@ where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { + let _timer = COMPUTE_CONNECTION_LATENCY.start_timer(); + mechanism.update_connect_config(&mut node_info.config); let mut num_retries = 0; From 520046f5bdf5b76c73eeafa0093127644843ccd3 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Tue, 25 Jul 2023 19:44:18 -0400 Subject: [PATCH 06/37] cold starts: Add sync-safekeepers fast path (#4804) --- compute_tools/src/compute.rs | 112 ++++++++++++++++++++++-- compute_tools/src/lib.rs | 1 + compute_tools/src/sync_sk.rs | 95 ++++++++++++++++++++ libs/compute_api/src/responses.rs | 1 + safekeeper/src/handler.rs | 38 ++++++++ test_runner/performance/test_startup.py | 1 + 6 files changed, 243 insertions(+), 5 deletions(-) create mode 100644 compute_tools/src/sync_sk.rs diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index b33f4f05dd..73e7a4e5e9 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -8,9 +8,11 @@ use std::sync::{Condvar, Mutex}; use anyhow::{Context, Result}; use chrono::{DateTime, Utc}; +use futures::stream::FuturesUnordered; +use futures::StreamExt; use postgres::{Client, NoTls}; use tokio_postgres; -use tracing::{info, instrument, warn}; +use tracing::{error, info, instrument, warn}; use utils::id::{TenantId, TimelineId}; use utils::lsn::Lsn; @@ -21,6 +23,7 @@ use utils::measured_stream::MeasuredReader; use crate::config; use crate::pg_helpers::*; use crate::spec::*; +use crate::sync_sk::{check_if_synced, ping_safekeeper}; /// Compute node info shared across several `compute_ctl` threads. 
pub struct ComputeNode { @@ -309,6 +312,101 @@ impl ComputeNode { Ok(()) } + pub async fn check_safekeepers_synced_async( + &self, + compute_state: &ComputeState, + ) -> Result> { + // Construct a connection config for each safekeeper + let pspec: ParsedSpec = compute_state + .pspec + .as_ref() + .expect("spec must be set") + .clone(); + let sk_connstrs: Vec = pspec.spec.safekeeper_connstrings.clone(); + let sk_configs = sk_connstrs.into_iter().map(|connstr| { + // Format connstr + let connstr = format!("postgresql://no_user@{}", connstr); + let options = format!( + "-c timeline_id={} tenant_id={}", + pspec.timeline_id, pspec.tenant_id + ); + + // Construct client + let mut config = tokio_postgres::Config::from_str(&connstr).unwrap(); + config.options(&options); + if let Some(storage_auth_token) = pspec.storage_auth_token.clone() { + config.password(storage_auth_token); + } + + config + }); + + // Create task set to query all safekeepers + let mut tasks = FuturesUnordered::new(); + let quorum = sk_configs.len() / 2 + 1; + for config in sk_configs { + let timeout = tokio::time::Duration::from_millis(100); + let task = tokio::time::timeout(timeout, ping_safekeeper(config)); + tasks.push(tokio::spawn(task)); + } + + // Get a quorum of responses or errors + let mut responses = Vec::new(); + let mut join_errors = Vec::new(); + let mut task_errors = Vec::new(); + let mut timeout_errors = Vec::new(); + while let Some(response) = tasks.next().await { + match response { + Ok(Ok(Ok(r))) => responses.push(r), + Ok(Ok(Err(e))) => task_errors.push(e), + Ok(Err(e)) => timeout_errors.push(e), + Err(e) => join_errors.push(e), + }; + if responses.len() >= quorum { + break; + } + if join_errors.len() + task_errors.len() + timeout_errors.len() >= quorum { + break; + } + } + + // In case of error, log and fail the check, but don't crash. + // We're playing it safe because these errors could be transient + // and we don't yet retry. Also being careful here allows us to + // be backwards compatible with safekeepers that don't have the + // TIMELINE_STATUS API yet. + if responses.len() < quorum { + error!( + "failed sync safekeepers check {:?} {:?} {:?}", + join_errors, task_errors, timeout_errors + ); + return Ok(None); + } + + Ok(check_if_synced(responses)) + } + + // Fast path for sync_safekeepers. If they're already synced we get the lsn + // in one roundtrip. If not, we should do a full sync_safekeepers. + pub fn check_safekeepers_synced(&self, compute_state: &ComputeState) -> Result> { + let start_time = Utc::now(); + + // Run actual work with new tokio runtime + let rt = tokio::runtime::Builder::new_current_thread() + .enable_all() + .build() + .expect("failed to create rt"); + let result = rt.block_on(self.check_safekeepers_synced_async(compute_state)); + + // Record runtime + self.state.lock().unwrap().metrics.sync_sk_check_ms = Utc::now() + .signed_duration_since(start_time) + .to_std() + .unwrap() + .as_millis() as u64; + result + } + // Run `postgres` in a special mode with `--sync-safekeepers` argument // and return the reported LSN back to the caller. #[instrument(skip_all)] @@ -371,10 +469,14 @@ impl ComputeNode { // cannot sync safekeepers. 
let lsn = match spec.mode { ComputeMode::Primary => { - info!("starting safekeepers syncing"); - let lsn = self - .sync_safekeepers(pspec.storage_auth_token.clone()) - .with_context(|| "failed to sync safekeepers")?; + info!("checking if safekeepers are synced"); + let lsn = if let Ok(Some(lsn)) = self.check_safekeepers_synced(compute_state) { + lsn + } else { + info!("starting safekeepers syncing"); + self.sync_safekeepers(pspec.storage_auth_token.clone()) + .with_context(|| "failed to sync safekeepers")? + }; info!("safekeepers synced at LSN {}", lsn); lsn } diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs index 24811f75ee..1d7b09f095 100644 --- a/compute_tools/src/lib.rs +++ b/compute_tools/src/lib.rs @@ -13,3 +13,4 @@ pub mod monitor; pub mod params; pub mod pg_helpers; pub mod spec; +pub mod sync_sk; diff --git a/compute_tools/src/sync_sk.rs b/compute_tools/src/sync_sk.rs new file mode 100644 index 0000000000..dd4ea94db2 --- /dev/null +++ b/compute_tools/src/sync_sk.rs @@ -0,0 +1,95 @@ +// Utils for running sync_safekeepers +use anyhow::Result; +use tracing::info; +use utils::lsn::Lsn; + +#[derive(Copy, Clone, Debug)] +pub enum TimelineStatusResponse { + NotFound, + Ok(TimelineStatusOkResponse), +} + +#[derive(Copy, Clone, Debug)] +pub struct TimelineStatusOkResponse { + flush_lsn: Lsn, + commit_lsn: Lsn, +} + +/// Get a safekeeper's metadata for our timeline +pub async fn ping_safekeeper(config: tokio_postgres::Config) -> Result { + // TODO add retries + + // Connect + info!("connecting to {:?}", config); + let (client, conn) = config.connect(tokio_postgres::NoTls).await?; + tokio::spawn(async move { + if let Err(e) = conn.await { + eprintln!("connection error: {}", e); + } + }); + + // Query + info!("querying {:?}", config); + let result = client.simple_query("TIMELINE_STATUS").await?; + + // Parse result + info!("done with {:?}", config); + if let postgres::SimpleQueryMessage::Row(row) = &result[0] { + use std::str::FromStr; + let response = TimelineStatusResponse::Ok(TimelineStatusOkResponse { + flush_lsn: Lsn::from_str(row.get("flush_lsn").unwrap())?, + commit_lsn: Lsn::from_str(row.get("commit_lsn").unwrap())?, + }); + Ok(response) + } else { + // Timeline doesn't exist + Ok(TimelineStatusResponse::NotFound) + } +} + +/// Given a quorum of responses, check if safekeepers are synced at some Lsn +pub fn check_if_synced(responses: Vec) -> Option { + // Check if all responses are ok + let ok_responses: Vec = responses + .iter() + .filter_map(|r| match r { + TimelineStatusResponse::Ok(ok_response) => Some(ok_response), + _ => None, + }) + .cloned() + .collect(); + if ok_responses.len() < responses.len() { + info!( + "not synced. Only {} out of {} know about this timeline", + ok_responses.len(), + responses.len() + ); + return None; + } + + // Get the min and the max of everything + let commit: Vec = ok_responses.iter().map(|r| r.commit_lsn).collect(); + let flush: Vec = ok_responses.iter().map(|r| r.flush_lsn).collect(); + let commit_max = commit.iter().max().unwrap(); + let commit_min = commit.iter().min().unwrap(); + let flush_max = flush.iter().max().unwrap(); + let flush_min = flush.iter().min().unwrap(); + + // Check that all values are equal + if commit_min != commit_max { + info!("not synced. {:?} {:?}", commit_min, commit_max); + return None; + } + if flush_min != flush_max { + info!("not synced. {:?} {:?}", flush_min, flush_max); + return None; + } + + // Check that commit == flush + if commit_max != flush_max { + info!("not synced. 
{:?} {:?}", commit_max, flush_max); + return None; + } + + Some(*commit_max) +} diff --git a/libs/compute_api/src/responses.rs b/libs/compute_api/src/responses.rs index 6124c81f50..f2865b71ec 100644 --- a/libs/compute_api/src/responses.rs +++ b/libs/compute_api/src/responses.rs @@ -70,6 +70,7 @@ where pub struct ComputeMetrics { pub wait_for_spec_ms: u64, pub sync_safekeepers_ms: u64, + pub sync_sk_check_ms: u64, pub basebackup_ms: u64, pub basebackup_bytes: u64, pub start_postgres_ms: u64, diff --git a/safekeeper/src/handler.rs b/safekeeper/src/handler.rs index 5fe9db9628..f11075a736 100644 --- a/safekeeper/src/handler.rs +++ b/safekeeper/src/handler.rs @@ -11,6 +11,7 @@ use crate::auth::check_permission; use crate::json_ctrl::{handle_json_ctrl, AppendLogicalMessage}; use crate::metrics::{TrafficMetrics, PG_QUERIES_FINISHED, PG_QUERIES_RECEIVED}; +use crate::timeline::TimelineError; use crate::wal_service::ConnectionId; use crate::{GlobalTimelines, SafeKeeperConf}; use postgres_backend::QueryError; @@ -45,6 +46,7 @@ enum SafekeeperPostgresCommand { StartWalPush, StartReplication { start_lsn: Lsn }, IdentifySystem, + TimelineStatus, JSONCtrl { cmd: AppendLogicalMessage }, } @@ -64,6 +66,8 @@ fn parse_cmd(cmd: &str) -> anyhow::Result { Ok(SafekeeperPostgresCommand::StartReplication { start_lsn }) } else if cmd.starts_with("IDENTIFY_SYSTEM") { Ok(SafekeeperPostgresCommand::IdentifySystem) + } else if cmd.starts_with("TIMELINE_STATUS") { + Ok(SafekeeperPostgresCommand::TimelineStatus) } else if cmd.starts_with("JSON_CTRL") { let cmd = cmd.strip_prefix("JSON_CTRL").context("invalid prefix")?; Ok(SafekeeperPostgresCommand::JSONCtrl { @@ -78,6 +82,7 @@ fn cmd_to_string(cmd: &SafekeeperPostgresCommand) -> &str { match cmd { SafekeeperPostgresCommand::StartWalPush => "START_WAL_PUSH", SafekeeperPostgresCommand::StartReplication { .. } => "START_REPLICATION", + SafekeeperPostgresCommand::TimelineStatus => "TIMELINE_STATUS", SafekeeperPostgresCommand::IdentifySystem => "IDENTIFY_SYSTEM", SafekeeperPostgresCommand::JSONCtrl { .. 
} => "JSON_CTRL", } @@ -219,6 +224,7 @@ impl postgres_backend::Handler .await } SafekeeperPostgresCommand::IdentifySystem => self.handle_identify_system(pgb).await, + SafekeeperPostgresCommand::TimelineStatus => self.handle_timeline_status(pgb).await, SafekeeperPostgresCommand::JSONCtrl { ref cmd } => { handle_json_ctrl(self, pgb, cmd).await } @@ -263,6 +269,38 @@ impl SafekeeperPostgresHandler { check_permission(claims, tenant_id) } + async fn handle_timeline_status( + &mut self, + pgb: &mut PostgresBackend, + ) -> Result<(), QueryError> { + // Get timeline, handling "not found" error + let tli = match GlobalTimelines::get(self.ttid) { + Ok(tli) => Ok(Some(tli)), + Err(TimelineError::NotFound(_)) => Ok(None), + Err(e) => Err(QueryError::Other(e.into())), + }?; + + // Write row description + pgb.write_message_noflush(&BeMessage::RowDescription(&[ + RowDescriptor::text_col(b"flush_lsn"), + RowDescriptor::text_col(b"commit_lsn"), + ]))?; + + // Write row if timeline exists + if let Some(tli) = tli { + let (inmem, _state) = tli.get_state().await; + let flush_lsn = tli.get_flush_lsn().await; + let commit_lsn = inmem.commit_lsn; + pgb.write_message_noflush(&BeMessage::DataRow(&[ + Some(flush_lsn.to_string().as_bytes()), + Some(commit_lsn.to_string().as_bytes()), + ]))?; + } + + pgb.write_message_noflush(&BeMessage::CommandComplete(b"TIMELINE_STATUS"))?; + Ok(()) + } + /// /// Handle IDENTIFY_SYSTEM replication command /// diff --git a/test_runner/performance/test_startup.py b/test_runner/performance/test_startup.py index 875be3b7b0..fade78504a 100644 --- a/test_runner/performance/test_startup.py +++ b/test_runner/performance/test_startup.py @@ -61,6 +61,7 @@ def test_startup_simple(neon_env_builder: NeonEnvBuilder, zenbenchmark: NeonBenc durations = { "wait_for_spec_ms": f"{i}_wait_for_spec", "sync_safekeepers_ms": f"{i}_sync_safekeepers", + "sync_sk_check_ms": f"{i}_sync_sk_check", "basebackup_ms": f"{i}_basebackup", "start_postgres_ms": f"{i}_start_postgres", "config_ms": f"{i}_config", From 700d92952955f5ad3dbf87657dd4d39f63d889b7 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 26 Jul 2023 14:05:18 +0300 Subject: [PATCH 07/37] Init Timeline in Stopping state in create_timeline_struct when Cause::Delete (#4780) See https://github.com/neondatabase/neon/pull/4552#discussion_r1258368127 for context. 
TLDR: use CreateTimelineCause to infer desired state instead of using .set_stopping after initialization --- pageserver/src/tenant.rs | 26 +++++++++++++++++--------- pageserver/src/tenant/delete.rs | 5 ++--- pageserver/src/tenant/timeline.rs | 3 ++- 3 files changed, 21 insertions(+), 13 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 67447bc45c..8f13055cc1 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -2012,19 +2012,24 @@ impl Tenant { init_order: Option<&InitializationOrder>, cause: CreateTimelineCause, ) -> anyhow::Result> { - if matches!(cause, CreateTimelineCause::Load) { - let ancestor_id = new_metadata.ancestor_timeline(); - anyhow::ensure!( - ancestor_id == ancestor.as_ref().map(|t| t.timeline_id), - "Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found" - ); - } + let state = match cause { + CreateTimelineCause::Load => { + let ancestor_id = new_metadata.ancestor_timeline(); + anyhow::ensure!( + ancestor_id == ancestor.as_ref().map(|t| t.timeline_id), + "Timeline's {new_timeline_id} ancestor {ancestor_id:?} was not found" + ); + TimelineState::Loading + } + CreateTimelineCause::Delete => TimelineState::Stopping, + }; let initial_logical_size_can_start = init_order.map(|x| &x.initial_logical_size_can_start); let initial_logical_size_attempt = init_order.map(|x| &x.initial_logical_size_attempt); let pg_version = new_metadata.pg_version(); - Ok(Timeline::new( + + let timeline = Timeline::new( self.conf, Arc::clone(&self.tenant_conf), new_metadata, @@ -2036,7 +2041,10 @@ impl Tenant { pg_version, initial_logical_size_can_start.cloned(), initial_logical_size_attempt.cloned(), - )) + state, + ); + + Ok(timeline) } fn new( diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/delete.rs index 02d1c997a3..eeaaf15c32 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/delete.rs @@ -403,6 +403,8 @@ impl DeleteTimelineFlow { remote_client: Option, init_order: Option<&InitializationOrder>, ) -> anyhow::Result<()> { + // Note: here we even skip populating layer map. Timeline is essentially uninitialized. + // RemoteTimelineClient is the only functioning part. let timeline = tenant .create_timeline_struct( timeline_id, @@ -422,9 +424,6 @@ impl DeleteTimelineFlow { .expect("cannot happen because we're the only owner"), ); - // Note: here we even skip populating layer map. Timeline is essentially uninitialized. - // RemoteTimelineClient is the only functioning part. - timeline.set_state(TimelineState::Stopping); // We meed to do this because when console retries delete request we shouldnt answer with 404 // because 404 means successful deletion. 
{ diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index af9edbf95e..1002595909 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1360,9 +1360,10 @@ impl Timeline { pg_version: u32, initial_logical_size_can_start: Option, initial_logical_size_attempt: Option, + state: TimelineState, ) -> Arc { let disk_consistent_lsn = metadata.disk_consistent_lsn(); - let (state, _) = watch::channel(TimelineState::Loading); + let (state, _) = watch::channel(state); let (layer_flush_start_tx, _) = tokio::sync::watch::channel(0); let (layer_flush_done_tx, _) = tokio::sync::watch::channel((0, Ok(()))); From 916a5871a6ded9942ef3d6a8169504a64e8016a1 Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Wed, 26 Jul 2023 08:10:49 -0400 Subject: [PATCH 08/37] compute_ctl: Parse sk connstring (#4809) --- compute_tools/src/compute.rs | 26 ++++++++++++++++++++++---- compute_tools/src/sync_sk.rs | 13 ++++++++----- 2 files changed, 30 insertions(+), 9 deletions(-) diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index 73e7a4e5e9..ceda189364 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -89,6 +89,7 @@ pub struct ParsedSpec { pub tenant_id: TenantId, pub timeline_id: TimelineId, pub pageserver_connstr: String, + pub safekeeper_connstrings: Vec, pub storage_auth_token: Option, } @@ -106,6 +107,21 @@ impl TryFrom for ParsedSpec { .clone() .or_else(|| spec.cluster.settings.find("neon.pageserver_connstring")) .ok_or("pageserver connstr should be provided")?; + let safekeeper_connstrings = if spec.safekeeper_connstrings.is_empty() { + if matches!(spec.mode, ComputeMode::Primary) { + spec.cluster + .settings + .find("neon.safekeepers") + .ok_or("safekeeper connstrings should be provided")? 
+ .split(',') + .map(|str| str.to_string()) + .collect() + } else { + vec![] + } + } else { + spec.safekeeper_connstrings.clone() + }; let storage_auth_token = spec.storage_auth_token.clone(); let tenant_id: TenantId = if let Some(tenant_id) = spec.tenant_id { tenant_id @@ -131,6 +147,7 @@ impl TryFrom for ParsedSpec { Ok(ParsedSpec { spec, pageserver_connstr, + safekeeper_connstrings, storage_auth_token, tenant_id, timeline_id, @@ -322,9 +339,10 @@ impl ComputeNode { .as_ref() .expect("spec must be set") .clone(); - let sk_connstrs: Vec = pspec.spec.safekeeper_connstrings.clone(); + let sk_connstrs: Vec = pspec.safekeeper_connstrings.clone(); let sk_configs = sk_connstrs.into_iter().map(|connstr| { // Format connstr + let id = connstr.clone(); let connstr = format!("postgresql://no_user@{}", connstr); let options = format!( "-c timeline_id={} tenant_id={}", @@ -338,15 +356,15 @@ impl ComputeNode { config.password(storage_auth_token); } - config + (id, config) }); // Create task set to query all safekeepers let mut tasks = FuturesUnordered::new(); let quorum = sk_configs.len() / 2 + 1; - for config in sk_configs { + for (id, config) in sk_configs { let timeout = tokio::time::Duration::from_millis(100); - let task = tokio::time::timeout(timeout, ping_safekeeper(config)); + let task = tokio::time::timeout(timeout, ping_safekeeper(id, config)); tasks.push(tokio::spawn(task)); } diff --git a/compute_tools/src/sync_sk.rs b/compute_tools/src/sync_sk.rs index dd4ea94db2..22b7027b93 100644 --- a/compute_tools/src/sync_sk.rs +++ b/compute_tools/src/sync_sk.rs @@ -15,12 +15,15 @@ pub struct TimelineStatusOkResponse { commit_lsn: Lsn, } -/// Get a safekeeper's metadata for our timeline -pub async fn ping_safekeeper(config: tokio_postgres::Config) -> Result { +/// Get a safekeeper's metadata for our timeline. The id is only used for logging +pub async fn ping_safekeeper( + id: String, + config: tokio_postgres::Config, +) -> Result { // TODO add retries // Connect - info!("connecting to {:?}", config); + info!("connecting to {}", id); let (client, conn) = config.connect(tokio_postgres::NoTls).await?; tokio::spawn(async move { if let Err(e) = conn.await { @@ -29,11 +32,11 @@ pub async fn ping_safekeeper(config: tokio_postgres::Config) -> Result Date: Wed, 26 Jul 2023 08:24:03 -0400 Subject: [PATCH 09/37] Upload Test Remote Extensions (#4792) We need some real extensions in S3 to accurately test the code for handling remote extensions. In this PR we just upload three extensions (anon, kq_imcx and postgis), which is enough for testing purposes for now. In addition to creating and uploading the extension archives, we must generate a file `ext_index.json` which specifies important metadata about the extensions. 
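For illustration only, here is a minimal sketch of how such an index might be assembled from the
per-extension control files. The layout it assumes (`/extensions/<name>/*.control`, `<name>.tar.zst`
archives, `/ext_index.json` output) mirrors the Dockerfile changes below, but the exact schema and
the helper name `build_ext_index` are assumptions; the authoritative logic is
`scripts/combine_control_files.py` added in this patch.

```python
import json
from pathlib import Path

def build_ext_index(extensions_dir: Path) -> dict:
    # One entry per extension: its archive name plus the raw contents of its
    # control file(s), so compute nodes can decide what to download lazily.
    index = {}
    for ext_dir in sorted(p for p in extensions_dir.iterdir() if p.is_dir()):
        control_files = {c.name: c.read_text() for c in ext_dir.glob("*.control")}
        index[ext_dir.name] = {
            "archive": f"{ext_dir.name}.tar.zst",
            "control_data": control_files,
        }
    return index

if __name__ == "__main__":
    index = build_ext_index(Path("/extensions"))
    Path("/ext_index.json").write_text(json.dumps(index, indent=2))
```
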
--------- Co-authored-by: Anastasia Lubennikova Co-authored-by: Alexander Bayandin --- .dockerignore | 1 + .github/workflows/build_and_test.yml | 46 ++++----------------- Dockerfile.compute-node | 62 +++++++++++++++++++--------- scripts/combine_control_files.py | 33 +++++++++++++++ 4 files changed, 84 insertions(+), 58 deletions(-) create mode 100644 scripts/combine_control_files.py diff --git a/.dockerignore b/.dockerignore index a6e11805e9..960588b6f2 100644 --- a/.dockerignore +++ b/.dockerignore @@ -21,4 +21,5 @@ !workspace_hack/ !neon_local/ !scripts/ninstall.sh +!scripts/combine_control_files.py !vm-cgconfig.conf diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index 18dfc458b5..27bad61639 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -955,22 +955,15 @@ jobs: version: [ v14, v15 ] env: - # While on transition period we extract public extensions from compute-node image and custom extensions from extensions image. - # Later all the extensions will be moved to extensions image. - EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:latest - COMPUTE_NODE_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:latest + EXTENSIONS_IMAGE: ${{ github.ref_name == 'release' && '093970136003' || '369495373322'}}.dkr.ecr.eu-central-1.amazonaws.com/extensions-${{ matrix.version }}:${{ needs.tag.outputs.build-tag }} AWS_ACCESS_KEY_ID: ${{ github.ref_name == 'release' && secrets.AWS_ACCESS_KEY_PROD || secrets.AWS_ACCESS_KEY_DEV }} AWS_SECRET_ACCESS_KEY: ${{ github.ref_name == 'release' && secrets.AWS_SECRET_KEY_PROD || secrets.AWS_SECRET_KEY_DEV }} - S3_BUCKETS: | - ${{ github.ref_name == 'release' && - 'neon-prod-extensions-ap-southeast-1 neon-prod-extensions-eu-central-1 neon-prod-extensions-us-east-1 neon-prod-extensions-us-east-2 neon-prod-extensions-us-west-2' || - 'neon-dev-extensions-eu-central-1 neon-dev-extensions-eu-west-1 neon-dev-extensions-us-east-2' }} + S3_BUCKETS: ${{ github.ref_name == 'release' && vars.S3_EXTENSIONS_BUCKETS_PROD || vars.S3_EXTENSIONS_BUCKETS_DEV }} steps: - name: Pull postgres-extensions image run: | docker pull ${EXTENSIONS_IMAGE} - docker pull ${COMPUTE_NODE_IMAGE} - name: Create postgres-extensions container id: create-container @@ -978,46 +971,23 @@ jobs: EID=$(docker create ${EXTENSIONS_IMAGE} true) echo "EID=${EID}" >> $GITHUB_OUTPUT - CID=$(docker create ${COMPUTE_NODE_IMAGE} true) - echo "CID=${CID}" >> $GITHUB_OUTPUT - - name: Extract postgres-extensions from container run: | - rm -rf ./extensions-to-upload ./custom-extensions # Just in case + rm -rf ./extensions-to-upload # Just in case + mkdir -p extensions-to-upload - # In compute image we have a bit different directory layout - mkdir -p extensions-to-upload/share - docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/share/extension ./extensions-to-upload/share/extension - docker cp ${{ steps.create-container.outputs.CID }}:/usr/local/lib ./extensions-to-upload/lib - - # Delete Neon extensitons (they always present on compute-node image) - rm -rf ./extensions-to-upload/share/extension/neon* - rm -rf ./extensions-to-upload/lib/neon* - - # Delete leftovers from the extension build step - rm -rf ./extensions-to-upload/lib/pgxs - rm -rf ./extensions-to-upload/lib/pkgconfig - - docker cp ${{ 
steps.create-container.outputs.EID }}:/extensions ./custom-extensions - for EXT_NAME in $(ls ./custom-extensions); do - mkdir -p ./extensions-to-upload/${EXT_NAME}/share - - mv ./custom-extensions/${EXT_NAME}/share/extension ./extensions-to-upload/${EXT_NAME}/share/extension - mv ./custom-extensions/${EXT_NAME}/lib ./extensions-to-upload/${EXT_NAME}/lib - done + docker cp ${{ steps.create-container.outputs.EID }}:/extensions/ ./extensions-to-upload/ + docker cp ${{ steps.create-container.outputs.EID }}:/ext_index.json ./extensions-to-upload/ - name: Upload postgres-extensions to S3 - # TODO: Reenable step after switching to the new extensions format (tar-gzipped + index.json) - if: false run: | - for BUCKET in $(echo ${S3_BUCKETS}); do + for BUCKET in $(echo ${S3_BUCKETS:-[]} | jq --raw-output '.[]'); do aws s3 cp --recursive --only-show-errors ./extensions-to-upload s3://${BUCKET}/${{ needs.tag.outputs.build-tag }}/${{ matrix.version }} done - name: Cleanup - if: ${{ always() && (steps.create-container.outputs.CID || steps.create-container.outputs.EID) }} + if: ${{ always() && steps.create-container.outputs.EID }} run: | - docker rm ${{ steps.create-container.outputs.CID }} || true docker rm ${{ steps.create-container.outputs.EID }} || true deploy: diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 495ef25526..7d60458a3e 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -13,7 +13,7 @@ FROM debian:bullseye-slim AS build-deps RUN apt update && \ apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \ zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \ - libicu-dev libxslt1-dev liblz4-dev libzstd-dev + libicu-dev libxslt1-dev liblz4-dev libzstd-dev zstd ######################################################################################### # @@ -77,6 +77,7 @@ ENV PATH "/usr/local/pgsql/bin:$PATH" RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \ echo "9a2a219da005a1730a39d1959a1c7cec619b1efb009b65be80ffc25bad299068 postgis.tar.gz" | sha256sum --check && \ mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . 
&& \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ ./autogen.sh && \ ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ @@ -89,17 +90,28 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postg echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/postgis_topology.control && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer.control && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/address_standardizer_data_us.control && \ + mkdir -p /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis_raster.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis_sfcgal.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis_tiger_geocoder.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/postgis_topology.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/address_standardizer.control /extensions/postgis && \ + cp /usr/local/pgsql/share/extension/address_standardizer_data_us.control /extensions/postgis RUN wget https://github.com/pgRouting/pgrouting/archive/v3.4.2.tar.gz -O pgrouting.tar.gz && \ echo "cac297c07d34460887c4f3b522b35c470138760fe358e351ad1db4edb6ee306e pgrouting.tar.gz" | sha256sum --check && \ mkdir pgrouting-src && cd pgrouting-src && tar xvzf ../pgrouting.tar.gz --strip-components=1 -C . && \ - mkdir build && \ - cd build && \ + mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrouting.control && \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ + cp /usr/local/pgsql/share/extension/pgrouting.control /extensions/postgis && \ + sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ + comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/postgis.tar.zst -T - ######################################################################################### # @@ -419,12 +431,16 @@ RUN apt-get update && \ wget https://github.com/ketteq-neon/postgres-exts/archive/e0bd1a9d9313d7120c1b9c7bb15c48c0dede4c4e.tar.gz -O kq_imcx.tar.gz && \ echo "dc93a97ff32d152d32737ba7e196d9687041cda15e58ab31344c2f2de8855336 kq_imcx.tar.gz" | sha256sum --check && \ mkdir kq_imcx-src && cd kq_imcx-src && tar xvzf ../kq_imcx.tar.gz --strip-components=1 -C . && \ - mkdir build && \ - cd build && \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ + mkdir build && cd build && \ cmake -DCMAKE_BUILD_TYPE=Release .. 
&& \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ - echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control + echo 'trusted = true' >> /usr/local/pgsql/share/extension/kq_imcx.control && \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ + mkdir -p /extensions/kq_imcx && cp /usr/local/pgsql/share/extension/kq_imcx.control /extensions/kq_imcx && \ + sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ + comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/kq_imcx.tar.zst -T - ######################################################################################### # @@ -553,16 +569,17 @@ RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b26 FROM build-deps AS pg-anon-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ -# Kaniko doesn't allow to do `${from#/usr/local/pgsql/}`, so we use `${from:17}` instead ENV PATH "/usr/local/pgsql/bin/:$PATH" RUN wget https://gitlab.com/dalibo/postgresql_anonymizer/-/archive/1.1.0/postgresql_anonymizer-1.1.0.tar.gz -O pg_anon.tar.gz && \ echo "08b09d2ff9b962f96c60db7e6f8e79cf7253eb8772516998fc35ece08633d3ad pg_anon.tar.gz" | sha256sum --check && \ mkdir pg_anon-src && cd pg_anon-src && tar xvzf ../pg_anon.tar.gz --strip-components=1 -C . && \ - find /usr/local/pgsql -type f | sort > /before.txt && \ + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /before.txt &&\ make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \ echo 'trusted = true' >> /usr/local/pgsql/share/extension/anon.control && \ - find /usr/local/pgsql -type f | sort > /after.txt && \ - /bin/bash -c 'for from in $(comm -13 /before.txt /after.txt); do to=/extensions/anon/${from:17} && mkdir -p $(dirname ${to}) && cp -a ${from} ${to}; done' + find /usr/local/pgsql -type f | sed 's|^/usr/local/pgsql/||' > /after.txt &&\ + mkdir -p /extensions/anon && cp /usr/local/pgsql/share/extension/anon.control /extensions/anon && \ + sort -o /before.txt /before.txt && sort -o /after.txt /after.txt && \ + comm -13 /before.txt /after.txt | tar --directory=/usr/local/pgsql --zstd -cf /extensions/anon.tar.zst -T - ######################################################################################### # @@ -754,16 +771,21 @@ RUN rm /usr/local/pgsql/lib/lib*.a # Extenstion only # ######################################################################################### +FROM python:3.9-slim-bullseye AS generate-ext-index +ARG PG_VERSION +ARG BUILD_TAG +# copy the control files here +COPY --from=kq-imcx-pg-build /extensions/ /extensions/ +COPY --from=pg-anon-pg-build /extensions/ /extensions/ +COPY --from=postgis-build /extensions/ /extensions/ +COPY scripts/combine_control_files.py ./combine_control_files.py +RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} + FROM scratch AS postgres-extensions # After the transition this layer will include all extensitons. 
-# As for now, it's only for new custom ones -# -# # Default extensions -# COPY --from=postgres-cleanup-layer /usr/local/pgsql/share/extension /usr/local/pgsql/share/extension -# COPY --from=postgres-cleanup-layer /usr/local/pgsql/lib /usr/local/pgsql/lib -# Custom extensions -COPY --from=pg-anon-pg-build /extensions/anon/lib/ /extensions/anon/lib -COPY --from=pg-anon-pg-build /extensions/anon/share/extension /extensions/anon/share/extension +# As for now, it's only a couple for testing purposses +COPY --from=generate-ext-index /extensions/*.tar.zst /extensions/ +COPY --from=generate-ext-index /ext_index.json /ext_index.json ######################################################################################### # diff --git a/scripts/combine_control_files.py b/scripts/combine_control_files.py new file mode 100644 index 0000000000..37db522ddd --- /dev/null +++ b/scripts/combine_control_files.py @@ -0,0 +1,33 @@ +#! /usr/bin/env python3 +# Script to generate ext_index.json metadata file +# that stores content of the control files and location of extension archives +# for all extensions in extensions subdir. +import argparse +import json +from pathlib import Path + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="generate ext_index.json") + parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version") + parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image") + args = parser.parse_args() + pg_version = args.pg_version + BUILD_TAG = args.BUILD_TAG + + ext_index = {} + EXT_PATH = Path("extensions") + for extension in EXT_PATH.iterdir(): + if extension.is_dir(): + control_data = {} + for control_file in extension.glob("*.control"): + if control_file.suffix != ".control": + continue + with open(control_file, "r") as f: + control_data[control_file.name] = f.read() + ext_index[extension.name] = { + "control_data": control_data, + "archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst", + } + + with open("ext_index.json", "w") as f: + json.dump(ext_index, f) From 86a61b318bb186693718723dc8c7d8187e1c5b73 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 26 Jul 2023 14:32:56 +0100 Subject: [PATCH 10/37] Bump certifi from 2022.12.7 to 2023.7.22 (#4815) --- poetry.lock | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/poetry.lock b/poetry.lock index a8ea410734..74cec84a40 100644 --- a/poetry.lock +++ b/poetry.lock @@ -740,13 +740,13 @@ typing-extensions = ">=4.1.0" [[package]] name = "certifi" -version = "2022.12.7" +version = "2023.7.22" description = "Python package for providing Mozilla's CA Bundle." 
optional = false python-versions = ">=3.6" files = [ - {file = "certifi-2022.12.7-py3-none-any.whl", hash = "sha256:4ad3232f5e926d6718ec31cfc1fcadfde020920e278684144551c91769c7bc18"}, - {file = "certifi-2022.12.7.tar.gz", hash = "sha256:35824b4c3a97115964b408844d64aa14db1cc518f6562e8d7261699d1350a9e3"}, + {file = "certifi-2023.7.22-py3-none-any.whl", hash = "sha256:92d6037539857d8206b8f6ae472e8b77db8058fec5937a1ef3f54304089edbb9"}, + {file = "certifi-2023.7.22.tar.gz", hash = "sha256:539cc1d13202e33ca466e88b2807e29f4c13049d6d87031a3c110744495cb082"}, ] [[package]] From b98419ee56891ce99f6da808ea2aaece298e8b5a Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Wed, 26 Jul 2023 15:19:18 +0100 Subject: [PATCH 11/37] Fix allure report overwriting for different Postgres versions (#4806) ## Problem We've got an example of Allure reports from 2 different runners for the same build that started to upload at the exact second, making one overwrite another ## Summary of changes - Use the Postgres version to distinguish artifacts (along with the build type) --- .github/actions/run-python-test-set/action.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml index ceb6f4aa90..60ccc56738 100644 --- a/.github/actions/run-python-test-set/action.yml +++ b/.github/actions/run-python-test-set/action.yml @@ -209,4 +209,4 @@ runs: uses: ./.github/actions/allure-report-store with: report-dir: /tmp/test_output/allure/results - unique-key: ${{ inputs.build_type }} + unique-key: ${{ inputs.build_type }}-${{ inputs.pg_version }} From 35370f967f248445f51fb1c2e2814309a8403f00 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 26 Jul 2023 16:03:51 +0100 Subject: [PATCH 12/37] proxy: add some connection init logs (#4812) ## Problem The first session event we emit is after we receive the first startup packet from the client. This means we can't detect any issues between TCP open and handling of the first PG packet ## Summary of changes Add some new logs for websocket upgrade and connection handling --- proxy/src/http/websocket.rs | 4 +++- proxy/src/proxy.rs | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index 5b7a87bc11..4b6e15dc3a 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -181,13 +181,15 @@ async fn ws_handler( // Check if the request is a websocket upgrade request. if hyper_tungstenite::is_upgrade_request(&request) { + info!(session_id = ?session_id, "performing websocket upgrade"); + let (response, websocket) = hyper_tungstenite::upgrade(&mut request, None) .map_err(|e| ApiError::BadRequest(e.into()))?; tokio::spawn(async move { if let Err(e) = serve_websocket(websocket, config, &cancel_map, session_id, host).await { - error!("error in websocket connection: {e:?}"); + error!(session_id = ?session_id, "error in websocket connection: {e:?}"); } }); diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 2cdd1582ac..e37e7dc44b 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -196,6 +196,11 @@ pub async fn handle_client( stream: S, mode: ClientMode, ) -> anyhow::Result<()> { + info!( + protocol = mode.protocol_label(), + "handling interactive connection from client" + ); + // The `closed` counter will increase when this future is destroyed. 
NUM_CONNECTIONS_ACCEPTED_COUNTER .with_label_values(&[mode.protocol_label()]) From 5705413d901cab8b5a0bef6de65239bb29f0cde2 Mon Sep 17 00:00:00 2001 From: arpad-m Date: Wed, 26 Jul 2023 17:20:09 +0200 Subject: [PATCH 13/37] Use OnceLock instead of manually implementing it (#4805) ## Problem In https://github.com/neondatabase/neon/issues/4743 , I'm trying to make more of the pageserver async, but in order for that to happen, I need to be able to persist the result of `ImageLayer::load` across await points. For that to happen, the return value needs to be `Send`. ## Summary of changes Use `OnceLock` in the image layer instead of manually implementing it with booleans, locks and `Option`. Part of #4743 --- .../src/tenant/storage_layer/image_layer.rs | 90 +++++-------------- 1 file changed, 23 insertions(+), 67 deletions(-) diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index 53cff824e3..f758347d9a 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -38,6 +38,7 @@ use crate::{IMAGE_FILE_MAGIC, STORAGE_FORMAT_VERSION, TEMP_FILE_SUFFIX}; use anyhow::{bail, ensure, Context, Result}; use bytes::Bytes; use hex; +use once_cell::sync::OnceCell; use pageserver_api::models::{HistoricLayerInfo, LayerAccessKind}; use rand::{distributions::Alphanumeric, Rng}; use serde::{Deserialize, Serialize}; @@ -47,7 +48,6 @@ use std::io::{Seek, SeekFrom}; use std::ops::Range; use std::os::unix::prelude::FileExt; use std::path::{Path, PathBuf}; -use std::sync::{RwLock, RwLockReadGuard}; use tracing::*; use utils::{ @@ -117,7 +117,7 @@ pub struct ImageLayer { access_stats: LayerAccessStats, - inner: RwLock, + inner: OnceCell, } impl std::fmt::Debug for ImageLayer { @@ -134,21 +134,17 @@ impl std::fmt::Debug for ImageLayer { } pub struct ImageLayerInner { - /// If false, the 'index' has not been loaded into memory yet. - loaded: bool, - // values copied from summary index_start_blk: u32, index_root_blk: u32, - /// Reader object for reading blocks from the file. (None if not loaded yet) - file: Option>, + /// Reader object for reading blocks from the file. + file: FileBlockReader, } impl std::fmt::Debug for ImageLayerInner { fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { f.debug_struct("ImageLayerInner") - .field("loaded", &self.loaded) .field("index_start_blk", &self.index_start_blk) .field("index_root_blk", &self.index_root_blk) .finish() @@ -175,7 +171,7 @@ impl Layer for ImageLayer { } let inner = self.load(LayerAccessKind::Dump, ctx)?; - let file = inner.file.as_ref().unwrap(); + let file = &inner.file; let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file); @@ -203,7 +199,7 @@ impl Layer for ImageLayer { let inner = self.load(LayerAccessKind::GetValueReconstructData, ctx)?; - let file = inner.file.as_ref().unwrap(); + let file = &inner.file; let tree_reader = DiskBtreeReader::new(inner.index_start_blk, inner.index_root_blk, file); let mut keybuf: [u8; KEY_SIZE] = [0u8; KEY_SIZE]; @@ -322,52 +318,26 @@ impl ImageLayer { /// Open the underlying file and read the metadata into memory, if it's /// not loaded already. 
/// - fn load( - &self, - access_kind: LayerAccessKind, - ctx: &RequestContext, - ) -> Result> { + fn load(&self, access_kind: LayerAccessKind, ctx: &RequestContext) -> Result<&ImageLayerInner> { self.access_stats .record_access(access_kind, ctx.task_kind()); loop { - // Quick exit if already loaded - let inner = self.inner.read().unwrap(); - if inner.loaded { + if let Some(inner) = self.inner.get() { return Ok(inner); } - - // Need to open the file and load the metadata. Upgrade our lock to - // a write lock. (Or rather, release and re-lock in write mode.) - drop(inner); - let mut inner = self.inner.write().unwrap(); - if !inner.loaded { - self.load_inner(&mut inner).with_context(|| { - format!("Failed to load image layer {}", self.path().display()) - })? - } else { - // Another thread loaded it while we were not holding the lock. - } - - // We now have the file open and loaded. There's no function to do - // that in the std library RwLock, so we have to release and re-lock - // in read mode. (To be precise, the lock guard was moved in the - // above call to `load_inner`, so it's already been released). And - // while we do that, another thread could unload again, so we have - // to re-check and retry if that happens. - drop(inner); + self.inner + .get_or_try_init(|| self.load_inner()) + .with_context(|| format!("Failed to load image layer {}", self.path().display()))?; } } - fn load_inner(&self, inner: &mut ImageLayerInner) -> Result<()> { + fn load_inner(&self) -> Result { let path = self.path(); // Open the file if it's not open already. - if inner.file.is_none() { - let file = VirtualFile::open(&path) - .with_context(|| format!("Failed to open file '{}'", path.display()))?; - inner.file = Some(FileBlockReader::new(file)); - } - let file = inner.file.as_mut().unwrap(); + let file = VirtualFile::open(&path) + .with_context(|| format!("Failed to open file '{}'", path.display()))?; + let file = FileBlockReader::new(file); let summary_blk = file.read_blk(0)?; let actual_summary = Summary::des_prefix(summary_blk.as_ref())?; @@ -395,10 +365,11 @@ impl ImageLayer { } } - inner.index_start_blk = actual_summary.index_start_blk; - inner.index_root_blk = actual_summary.index_root_blk; - inner.loaded = true; - Ok(()) + Ok(ImageLayerInner { + index_start_blk: actual_summary.index_start_blk, + index_root_blk: actual_summary.index_root_blk, + file, + }) } /// Create an ImageLayer struct representing an existing file on disk @@ -422,12 +393,7 @@ impl ImageLayer { ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. lsn: filename.lsn, access_stats, - inner: RwLock::new(ImageLayerInner { - loaded: false, - file: None, - index_start_blk: 0, - index_root_blk: 0, - }), + inner: OnceCell::new(), } } @@ -454,12 +420,7 @@ impl ImageLayer { ), // Now we assume image layer ALWAYS covers the full range. This may change in the future. 
lsn: summary.lsn, access_stats: LayerAccessStats::empty_will_record_residence_event_later(), - inner: RwLock::new(ImageLayerInner { - file: None, - loaded: false, - index_start_blk: 0, - index_root_blk: 0, - }), + inner: OnceCell::new(), }) } @@ -620,12 +581,7 @@ impl ImageLayerWriterInner { desc, lsn: self.lsn, access_stats: LayerAccessStats::empty_will_record_residence_event_later(), - inner: RwLock::new(ImageLayerInner { - loaded: false, - file: None, - index_start_blk, - index_root_blk, - }), + inner: OnceCell::new(), }; // fsync the file From 231d7a76166ede0141b86bbe91c4dce58ad3cb2c Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Wed, 26 Jul 2023 16:34:46 +0100 Subject: [PATCH 14/37] proxy: retry compute wake in auth (#4817) ## Problem wake_compute can fail sometimes but is eligible for retries. We retry during the main connect, but not during auth. ## Summary of changes retry wake_compute during auth flow if there was an error talking to control plane, or if there was a temporary error in waking the compute node --- proxy/src/auth/backend/classic.rs | 13 ++++++- proxy/src/console/provider.rs | 19 ++++++++++ proxy/src/proxy.rs | 63 +++++++++++++++---------------- 3 files changed, 62 insertions(+), 33 deletions(-) diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 6753e7ed7f..4d20b9bf90 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -1,8 +1,11 @@ +use std::ops::ControlFlow; + use super::AuthSuccess; use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra}, + proxy::{try_wake, NUM_RETRIES_CONNECT}, sasl, scram, stream::PqStream, }; @@ -48,7 +51,15 @@ pub(super) async fn authenticate( } }; - let mut node = api.wake_compute(extra, creds).await?; + let mut num_retries = 0; + let mut node = loop { + num_retries += 1; + match try_wake(api, extra, creds).await? { + ControlFlow::Break(n) => break n, + ControlFlow::Continue(_) if num_retries < NUM_RETRIES_CONNECT => continue, + ControlFlow::Continue(e) => return Err(e.into()), + } + }; if let Some(keys) = scram_keys { use tokio_postgres::config::AuthKeys; node.config.auth_keys(AuthKeys::ScramSha256(keys)); diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs index 3eaed1b82b..37190c76b8 100644 --- a/proxy/src/console/provider.rs +++ b/proxy/src/console/provider.rs @@ -14,6 +14,7 @@ pub mod errors { use crate::{ error::{io_error, UserFacingError}, http, + proxy::ShouldRetry, }; use thiserror::Error; @@ -72,6 +73,24 @@ pub mod errors { } } + impl ShouldRetry for ApiError { + fn could_retry(&self) -> bool { + match self { + // retry some transport errors + Self::Transport(io) => io.could_retry(), + // retry some temporary failures because the compute was in a bad state + // (bad request can be returned when the endpoint was in transition) + Self::Console { + status: http::StatusCode::BAD_REQUEST | http::StatusCode::LOCKED, + .. + } => true, + // retry server errors + Self::Console { status, .. 
} if status.is_server_error() => true, + _ => false, + } + } + } + impl From for ApiError { fn from(e: reqwest::Error) -> Self { io_error(e).into() diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index e37e7dc44b..b0c5a41efb 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -6,17 +6,12 @@ use crate::{ cancellation::{self, CancelMap}, compute::{self, PostgresConnection}, config::{ProxyConfig, TlsConfig}, - console::{ - self, - errors::{ApiError, WakeComputeError}, - messages::MetricsAuxInfo, - }, + console::{self, errors::WakeComputeError, messages::MetricsAuxInfo}, stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; use async_trait::async_trait; use futures::TryFutureExt; -use hyper::StatusCode; use metrics::{ exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec, }; @@ -33,7 +28,7 @@ use utils::measured_stream::MeasuredStream; /// Number of times we should retry the `/proxy_wake_compute` http request. /// Retry duration is BASE_RETRY_WAIT_DURATION * 1.5^n -const NUM_RETRIES_CONNECT: u32 = 10; +pub const NUM_RETRIES_CONNECT: u32 = 10; const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2); const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100); @@ -418,13 +413,22 @@ where loop { match state { ConnectionState::Invalid(config, err) => { - match try_wake(&config, extra, creds).await { - // we can't wake up the compute node - Ok(None) => return Err(err.into()), + let wake_res = match creds { + auth::BackendType::Console(api, creds) => { + try_wake(api.as_ref(), extra, creds).await + } + auth::BackendType::Postgres(api, creds) => { + try_wake(api.as_ref(), extra, creds).await + } + // nothing to do? + auth::BackendType::Link(_) => return Err(err.into()), + }; + + match wake_res { // there was an error communicating with the control plane Err(e) => return Err(e.into()), // failed to wake up but we can continue to retry - Ok(Some(ControlFlow::Continue(()))) => { + Ok(ControlFlow::Continue(_)) => { state = ConnectionState::Invalid(config, err); let wait_duration = retry_after(num_retries); num_retries += 1; @@ -434,7 +438,8 @@ where continue; } // successfully woke up a compute node and can break the wakeup loop - Ok(Some(ControlFlow::Break(mut node_info))) => { + Ok(ControlFlow::Break(mut node_info)) => { + node_info.config.reuse_password(&config); mechanism.update_connect_config(&mut node_info.config); state = ConnectionState::Cached(node_info) } @@ -470,28 +475,22 @@ where } /// Attempts to wake up the compute node. -/// * Returns Ok(Some(true)) if there was an error waking but retries are acceptable -/// * Returns Ok(Some(false)) if the wakeup succeeded -/// * Returns Ok(None) or Err(e) if there was an error -async fn try_wake( - config: &compute::ConnCfg, +/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable +/// * Returns Ok(Break(node)) if the wakeup succeeded +/// * Returns Err(e) if there was an error +pub async fn try_wake( + api: &impl console::Api, extra: &console::ConsoleReqExtra<'_>, - creds: &auth::BackendType<'_, auth::ClientCredentials<'_>>, -) -> Result>, WakeComputeError> { + creds: &auth::ClientCredentials<'_>, +) -> Result, WakeComputeError> { info!("compute node's state has likely changed; requesting a wake-up"); - match creds.wake_compute(extra).await { - // retry wake if the compute was in an invalid state - Err(WakeComputeError::ApiError(ApiError::Console { - status: StatusCode::BAD_REQUEST, - .. 
- })) => Ok(Some(ControlFlow::Continue(()))), - // Update `node_info` and try again. - Ok(Some(mut new)) => { - new.config.reuse_password(config); - Ok(Some(ControlFlow::Break(new))) - } - Err(e) => Err(e), - Ok(None) => Ok(None), + match api.wake_compute(extra, creds).await { + Err(err) => match &err { + WakeComputeError::ApiError(api) if api.could_retry() => Ok(ControlFlow::Continue(err)), + _ => Err(err), + }, + // Ready to try again. + Ok(new) => Ok(ControlFlow::Break(new)), } } From 874c31976ee43ba7c752fe3eb3a48ed6fcb0a7a8 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Wed, 26 Jul 2023 18:39:32 +0300 Subject: [PATCH 15/37] dedup cleanup fs traces (#4778) This is a follow up for discussion: https://github.com/neondatabase/neon/pull/4552#discussion_r1253417777 see context there --- pageserver/src/tenant.rs | 116 ++++++++++++++++++------------ pageserver/src/tenant/metadata.rs | 29 ++++---- 2 files changed, 84 insertions(+), 61 deletions(-) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index 8f13055cc1..a674eba98f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -23,7 +23,6 @@ use tokio_util::sync::CancellationToken; use tracing::*; use utils::completion; use utils::crashsafe::path_with_suffix_extension; -use utils::fs_ext; use std::cmp::min; use std::collections::hash_map::Entry; @@ -32,6 +31,7 @@ use std::collections::HashMap; use std::fs; use std::fs::File; use std::fs::OpenOptions; +use std::io; use std::io::Write; use std::ops::Bound::Included; use std::path::Path; @@ -47,6 +47,7 @@ use std::time::{Duration, Instant}; use self::config::TenantConf; use self::delete::DeleteTimelineFlow; +use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; use self::timeline::uninit::TimelineUninitMark; @@ -338,7 +339,7 @@ pub enum CreateTimelineError { struct TenantDirectoryScan { sorted_timelines_to_load: Vec<(TimelineId, TimelineMetadata)>, - timelines_to_resume_deletion: Vec<(TimelineId, TimelineMetadata)>, + timelines_to_resume_deletion: Vec<(TimelineId, Option)>, } enum CreateTimelineCause { @@ -818,7 +819,8 @@ impl Tenant { // Note timelines_to_resume_deletion needs to be separate because it can be not sortable // from the point of `tree_sort_timelines`. I e some parents can be missing because deletion // completed in non topological order (for example because parent has smaller number of layer files in it) - let mut timelines_to_resume_deletion: Vec<(TimelineId, TimelineMetadata)> = vec![]; + let mut timelines_to_resume_deletion: Vec<(TimelineId, Option)> = vec![]; + let timelines_dir = self.conf.timelines_path(&self.tenant_id); for entry in @@ -868,7 +870,6 @@ impl Tenant { } } else if crate::is_delete_mark(&timeline_dir) { // If metadata exists, load as usual, continue deletion - // If metadata doesnt exist remove timeline dir and delete mark let timeline_id = TimelineId::try_from(timeline_dir.file_stem()).with_context(|| { format!( @@ -877,33 +878,37 @@ impl Tenant { ) })?; - let metadata_path = self.conf.metadata_path(&self.tenant_id, &timeline_id); - if metadata_path.exists() { - // Remote deletion did not finish. Need to resume. 
- timelines_to_resume_deletion.push(( - timeline_id, - load_metadata(self.conf, &self.tenant_id, &timeline_id)?, - )); - continue; - } + match load_metadata(self.conf, &self.tenant_id, &timeline_id) { + Ok(metadata) => { + timelines_to_resume_deletion.push((timeline_id, Some(metadata))) + } + Err(e) => match &e { + LoadMetadataError::Read(r) => { + if r.kind() != io::ErrorKind::NotFound { + return Err(anyhow::anyhow!(e)).with_context(|| { + format!("Failed to load metadata for timeline_id {timeline_id}") + }); + } - // Missing metadata means that timeline directory should be empty at this point. - // Remove delete mark afterwards. - // Note that failure during the process wont prevent tenant from successfully loading. - // TODO: this is very much similar to DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces - // but here we're inside spawn_blocking. - if let Err(e) = fs_ext::ignore_absent_files(|| { - fs::remove_dir(self.conf.timeline_path(&self.tenant_id, &timeline_id)) - }) - .context("remove deleted timeline dir") - .and_then(|_| fs::remove_file(&timeline_dir).context("remove delete mark")) - { - warn!( - "cannot clean up deleted timeline dir at: {} error: {:#}", - timeline_dir.display(), - e - ); - }; + // If metadata doesnt exist it means that we've crashed without + // completing cleanup_remaining_timeline_fs_traces in DeleteTimelineFlow. + // So save timeline_id for later call to `DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces`. + // We cant do it here because the method is async so we'd need block_on + // and here we're in spawn_blocking. cleanup_remaining_timeline_fs_traces uses fs operations + // so that basically results in a cycle: + // spawn_blocking + // - block_on + // - spawn_blocking + // which can lead to running out of threads in blocing pool. + timelines_to_resume_deletion.push((timeline_id, None)); + } + _ => { + return Err(anyhow::anyhow!(e)).with_context(|| { + format!("Failed to load metadata for timeline_id {timeline_id}") + }) + } + }, + } } else { if !timeline_dir.exists() { warn!( @@ -1022,21 +1027,37 @@ impl Tenant { } // Resume deletion ones with deleted_mark - for (timeline_id, local_metadata) in scan.timelines_to_resume_deletion { - if let Err(e) = self - .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true) - .await - { - match e { - LoadLocalTimelineError::Load(source) => { - // We tried to load deleted timeline, this is a bug. - return Err(anyhow::anyhow!(source).context( - "This is a bug. We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}" - )); + for (timeline_id, maybe_local_metadata) in scan.timelines_to_resume_deletion { + match maybe_local_metadata { + None => { + // See comment in `scan_and_sort_timelines_dir`. + if let Err(e) = + DeleteTimelineFlow::cleanup_remaining_timeline_fs_traces(self, timeline_id) + .await + { + warn!( + "cannot clean up deleted timeline dir timeline_id: {} error: {:#}", + timeline_id, e + ); } - LoadLocalTimelineError::ResumeDeletion(source) => { - // Make sure resumed deletion wont fail loading for entire tenant. - error!("Failed to resume timeline deletion: {source:#}") + } + Some(local_metadata) => { + if let Err(e) = self + .load_local_timeline(timeline_id, local_metadata, init_order, ctx, true) + .await + { + match e { + LoadLocalTimelineError::Load(source) => { + // We tried to load deleted timeline, this is a bug. + return Err(anyhow::anyhow!(source).context( + "This is a bug. 
We tried to load deleted timeline which is wrong and loading failed. Timeline: {timeline_id}" + )); + } + LoadLocalTimelineError::ResumeDeletion(source) => { + // Make sure resumed deletion wont fail loading for entire tenant. + error!("Failed to resume timeline deletion: {source:#}") + } + } } } } @@ -3814,9 +3835,9 @@ mod tests { .await .err() .expect("should fail"); - // get all the stack with all .context, not tonly the last one + // get all the stack with all .context, not only the last one let message = format!("{err:#}"); - let expected = "Failed to parse metadata bytes from path"; + let expected = "failed to load metadata"; assert!( message.contains(expected), "message '{message}' expected to contain {expected}" @@ -3833,7 +3854,8 @@ mod tests { } assert!( found_error_message, - "didn't find the corrupted metadata error" + "didn't find the corrupted metadata error in {}", + message ); Ok(()) diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs index d52bb66e76..a27602c335 100644 --- a/pageserver/src/tenant/metadata.rs +++ b/pageserver/src/tenant/metadata.rs @@ -9,10 +9,11 @@ //! [`remote_timeline_client`]: super::remote_timeline_client use std::fs::{File, OpenOptions}; -use std::io::Write; +use std::io::{self, Write}; use anyhow::{bail, ensure, Context}; use serde::{Deserialize, Serialize}; +use thiserror::Error; use tracing::info_span; use utils::bin_ser::SerializeError; use utils::{ @@ -267,24 +268,24 @@ pub fn save_metadata( Ok(()) } +#[derive(Error, Debug)] +pub enum LoadMetadataError { + #[error(transparent)] + Read(#[from] io::Error), + + #[error(transparent)] + Decode(#[from] anyhow::Error), +} + pub fn load_metadata( conf: &'static PageServerConf, tenant_id: &TenantId, timeline_id: &TimelineId, -) -> anyhow::Result { +) -> Result { let metadata_path = conf.metadata_path(tenant_id, timeline_id); - let metadata_bytes = std::fs::read(&metadata_path).with_context(|| { - format!( - "Failed to read metadata bytes from path {}", - metadata_path.display() - ) - })?; - TimelineMetadata::from_bytes(&metadata_bytes).with_context(|| { - format!( - "Failed to parse metadata bytes from path {}", - metadata_path.display() - ) - }) + let metadata_bytes = std::fs::read(metadata_path)?; + + Ok(TimelineMetadata::from_bytes(&metadata_bytes)?) 
} #[cfg(test)] From 48ce95533c11a9979a3255b180c251f48fadb0b6 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Wed, 26 Jul 2023 20:20:12 +0300 Subject: [PATCH 16/37] test: allow normal warnings in test_threshold_based_eviction (#4801) See: https://neon-github-public-dev.s3.amazonaws.com/reports/main/5654328815/index.html#suites/3fc871d9ee8127d8501d607e03205abb/3482458eba88c021 --- pageserver/src/metrics.rs | 99 +++++++++++++------ .../regress/test_threshold_based_eviction.py | 6 ++ 2 files changed, 74 insertions(+), 31 deletions(-) diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs index c233f36da6..84f2d923e2 100644 --- a/pageserver/src/metrics.rs +++ b/pageserver/src/metrics.rs @@ -73,7 +73,7 @@ pub static STORAGE_TIME_COUNT_PER_TIMELINE: Lazy = Lazy::new(|| { // Buckets for background operations like compaction, GC, size calculation const STORAGE_OP_BUCKETS: &[f64] = &[0.010, 0.100, 1.0, 10.0, 100.0, 1000.0]; -pub static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { +pub(crate) static STORAGE_TIME_GLOBAL: Lazy = Lazy::new(|| { register_histogram_vec!( "pageserver_storage_operations_seconds_global", "Time spent on storage operations", @@ -93,7 +93,7 @@ pub(crate) static READ_NUM_FS_LAYERS: Lazy = Lazy::new(|| { }); // Metrics collected on operations on the storage repository. -pub static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { +pub(crate) static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_getpage_reconstruct_seconds", "Time spent in reconstruct_value (reconstruct a page from deltas)", @@ -102,7 +102,7 @@ pub static RECONSTRUCT_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { +pub(crate) static MATERIALIZED_PAGE_CACHE_HIT_DIRECT: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_materialized_cache_hits_direct_total", "Number of cache hits from materialized page cache without redo", @@ -119,7 +119,7 @@ pub(crate) static GET_RECONSTRUCT_DATA_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { +pub(crate) static MATERIALIZED_PAGE_CACHE_HIT: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_materialized_cache_hits_total", "Number of cache hits from materialized page cache", @@ -280,7 +280,7 @@ static REMOTE_PHYSICAL_SIZE: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { +pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_remote_ondemand_downloaded_layers_total", "Total on-demand downloaded layers" @@ -288,7 +288,7 @@ pub static REMOTE_ONDEMAND_DOWNLOADED_LAYERS: Lazy = Lazy::new(|| { .unwrap() }); -pub static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy = Lazy::new(|| { +pub(crate) static REMOTE_ONDEMAND_DOWNLOADED_BYTES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_remote_ondemand_downloaded_bytes_total", "Total bytes of layers on-demand downloaded", @@ -327,7 +327,7 @@ pub(crate) static BROKEN_TENANTS_SET: Lazy = Lazy::new(|| { .expect("Failed to register pageserver_tenant_states_count metric") }); -pub static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(|| { +pub(crate) static TENANT_SYNTHETIC_SIZE_METRIC: Lazy = Lazy::new(|| { register_uint_gauge_vec!( "pageserver_tenant_synthetic_cached_size_bytes", "Synthetic size of each tenant in bytes", @@ -385,7 +385,7 @@ static EVICTIONS_WITH_LOW_RESIDENCE_DURATION: Lazy = 
Lazy::new(|| .expect("failed to define a metric") }); -pub static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| { +pub(crate) static UNEXPECTED_ONDEMAND_DOWNLOADS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_unexpected_ondemand_downloads_count", "Number of unexpected on-demand downloads. \ @@ -690,7 +690,7 @@ pub(crate) static REMOTE_OPERATION_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { +pub(crate) static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_tenant_task_events", "Number of task start/stop/fail events.", @@ -699,7 +699,7 @@ pub static TENANT_TASK_EVENTS: Lazy = Lazy::new(|| { .expect("Failed to register tenant_task_events metric") }); -pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { +pub(crate) static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_background_loop_period_overrun_count", "Incremented whenever warn_when_period_overrun() logs a warning.", @@ -710,7 +710,7 @@ pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy = Lazy::new // walreceiver metrics -pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_walreceiver_started_connections_total", "Number of started walreceiver connections" @@ -718,7 +718,7 @@ pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { register_int_gauge!( "pageserver_walreceiver_active_managers", "Number of active walreceiver managers" @@ -726,7 +726,7 @@ pub static WALRECEIVER_ACTIVE_MANAGERS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_walreceiver_switches_total", "Number of walreceiver manager change_connection calls", @@ -735,7 +735,7 @@ pub static WALRECEIVER_SWITCHES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_walreceiver_broker_updates_total", "Number of received broker updates in walreceiver" @@ -743,7 +743,7 @@ pub static WALRECEIVER_BROKER_UPDATES: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { +pub(crate) static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { register_int_counter_vec!( "pageserver_walreceiver_candidates_events_total", "Number of walreceiver candidate events", @@ -752,10 +752,10 @@ pub static WALRECEIVER_CANDIDATES_EVENTS: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WALRECEIVER_CANDIDATES_ADDED: Lazy = +pub(crate) static WALRECEIVER_CANDIDATES_ADDED: Lazy = Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["add"])); -pub static WALRECEIVER_CANDIDATES_REMOVED: Lazy = +pub(crate) static WALRECEIVER_CANDIDATES_REMOVED: Lazy = Lazy::new(|| WALRECEIVER_CANDIDATES_EVENTS.with_label_values(&["remove"])); // Metrics collected on WAL redo operations @@ -802,7 +802,7 @@ macro_rules! 
redo_bytes_histogram_count_buckets { }; } -pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { +pub(crate) static WAL_REDO_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_seconds", "Time spent on WAL redo", @@ -811,7 +811,7 @@ pub static WAL_REDO_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { +pub(crate) static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_wait_seconds", "Time spent waiting for access to the Postgres WAL redo process", @@ -820,7 +820,7 @@ pub static WAL_REDO_WAIT_TIME: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { +pub(crate) static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_records_histogram", "Histogram of number of records replayed per redo in the Postgres WAL redo process", @@ -829,7 +829,7 @@ pub static WAL_REDO_RECORDS_HISTOGRAM: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { +pub(crate) static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { register_histogram!( "pageserver_wal_redo_bytes_histogram", "Histogram of number of records replayed per redo sent to Postgres", @@ -838,7 +838,8 @@ pub static WAL_REDO_BYTES_HISTOGRAM: Lazy = Lazy::new(|| { .expect("failed to define a metric") }); -pub static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { +// FIXME: isn't this already included by WAL_REDO_RECORDS_HISTOGRAM which has _count? +pub(crate) static WAL_REDO_RECORD_COUNTER: Lazy = Lazy::new(|| { register_int_counter!( "pageserver_replayed_wal_records_total", "Number of WAL records replayed in WAL redo process" @@ -1394,15 +1395,51 @@ impl>, O, E> Future for MeasuredRemoteOp { } pub fn preinitialize_metrics() { - // We want to alert on this metric increasing. - // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0. - assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0); - UNEXPECTED_ONDEMAND_DOWNLOADS.reset(); + // Python tests need these and on some we do alerting. + // + // FIXME(4813): make it so that we have no top level metrics as this fn will easily fall out of + // order: + // - global metrics reside in a Lazy + // - access via crate::metrics::PS_METRICS.materialized_page_cache_hit.inc() + // - could move the statics into TimelineMetrics::new()? - // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels. - BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset(); + // counters + [ + &MATERIALIZED_PAGE_CACHE_HIT, + &MATERIALIZED_PAGE_CACHE_HIT_DIRECT, + &UNEXPECTED_ONDEMAND_DOWNLOADS, + &WALRECEIVER_STARTED_CONNECTIONS, + &WALRECEIVER_BROKER_UPDATES, + &WALRECEIVER_CANDIDATES_ADDED, + &WALRECEIVER_CANDIDATES_REMOVED, + ] + .into_iter() + .for_each(|c| { + Lazy::force(c); + }); - // Python tests need these. 
- MATERIALIZED_PAGE_CACHE_HIT_DIRECT.get(); - MATERIALIZED_PAGE_CACHE_HIT.get(); + // countervecs + [&BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT] + .into_iter() + .for_each(|c| { + Lazy::force(c); + }); + + // gauges + WALRECEIVER_ACTIVE_MANAGERS.get(); + + // histograms + [ + &READ_NUM_FS_LAYERS, + &RECONSTRUCT_TIME, + &WAIT_LSN_TIME, + &WAL_REDO_TIME, + &WAL_REDO_WAIT_TIME, + &WAL_REDO_RECORDS_HISTOGRAM, + &WAL_REDO_BYTES_HISTOGRAM, + ] + .into_iter() + .for_each(|h| { + Lazy::force(h); + }); } diff --git a/test_runner/regress/test_threshold_based_eviction.py b/test_runner/regress/test_threshold_based_eviction.py index c7083d92be..4d8c87e09e 100644 --- a/test_runner/regress/test_threshold_based_eviction.py +++ b/test_runner/regress/test_threshold_based_eviction.py @@ -38,6 +38,12 @@ def test_threshold_based_eviction( env = neon_env_builder.init_start() env.pageserver.allowed_errors.append(metrics_refused_log_line) + # these can happen whenever we run consumption metrics collection + env.pageserver.allowed_errors.append(r".*failed to calculate logical size at \S+: cancelled") + env.pageserver.allowed_errors.append( + r".*failed to calculate synthetic size for tenant \S+: failed to calculate some logical_sizes" + ) + tenant_id, timeline_id = env.initial_tenant, env.initial_timeline assert isinstance(timeline_id, TimelineId) From b9a7a661d0967251e826db21f6f8315880ff6857 Mon Sep 17 00:00:00 2001 From: Alek Westover Date: Wed, 26 Jul 2023 15:55:55 -0400 Subject: [PATCH 17/37] add list of public extensions and lookup table for libraries (#4807) --- Dockerfile.compute-node | 4 ++- scripts/combine_control_files.py | 49 +++++++++++++++++++++++++++++++- 2 files changed, 51 insertions(+), 2 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 7d60458a3e..2c4465fc82 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -774,12 +774,14 @@ RUN rm /usr/local/pgsql/lib/lib*.a FROM python:3.9-slim-bullseye AS generate-ext-index ARG PG_VERSION ARG BUILD_TAG +RUN apt update && apt install -y zstd + # copy the control files here COPY --from=kq-imcx-pg-build /extensions/ /extensions/ COPY --from=pg-anon-pg-build /extensions/ /extensions/ COPY --from=postgis-build /extensions/ /extensions/ COPY scripts/combine_control_files.py ./combine_control_files.py -RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} +RUN python3 ./combine_control_files.py ${PG_VERSION} ${BUILD_TAG} --public_extensions="anon,postgis" FROM scratch AS postgres-extensions # After the transition this layer will include all extensitons. diff --git a/scripts/combine_control_files.py b/scripts/combine_control_files.py index 37db522ddd..0350e4721d 100644 --- a/scripts/combine_control_files.py +++ b/scripts/combine_control_files.py @@ -4,17 +4,49 @@ # for all extensions in extensions subdir. import argparse import json +import subprocess from pathlib import Path +""" +# ext_index.json example: +{ + "public_extensions": [ + "anon" + ], + "library_index": { + "anon": "anon", + "kq_imcx": "kq_imcx" + // would be more complicated for something like postgis where multiple library names all map to postgis + }, + "extension_data": { + "kq_imcx": { + "control_data": { + "kq_imcx.control": "# This file is generated content from add_postgresql_extension.\n# No point in modifying it, it will be overwritten anyway.\n\n# Default version, always set\ndefault_version = '0.1'\n\n# Module pathname generated from target shared library name. 
Use\n# MODULE_PATHNAME in script file.\nmodule_pathname = '$libdir/kq_imcx.so'\n\n# Comment for extension. Set using COMMENT option. Can be set in\n# script file as well.\ncomment = 'ketteQ In-Memory Calendar Extension (IMCX)'\n\n# Encoding for script file. Set using ENCODING option.\n#encoding = ''\n\n# Required extensions. Set using REQUIRES option (multi-valued).\n#requires = ''\ntrusted = true\n" + }, + "archive_path": "5648391853/v15/extensions/kq_imcx.tar.zst" + }, + "anon": { + "control_data": { + "anon.control": "# PostgreSQL Anonymizer (anon) extension \ncomment = 'Data anonymization tools' \ndefault_version = '1.1.0' \ndirectory='extension/anon' \nrelocatable = false \nrequires = 'pgcrypto' \nsuperuser = false \nmodule_pathname = '$libdir/anon' \ntrusted = true \n" + }, + "archive_path": "5648391853/v15/extensions/anon.tar.zst" + } + } +} +""" + if __name__ == "__main__": parser = argparse.ArgumentParser(description="generate ext_index.json") parser.add_argument("pg_version", type=str, choices=["v14", "v15"], help="pg_version") parser.add_argument("BUILD_TAG", type=str, help="BUILD_TAG for this compute image") + parser.add_argument("--public_extensions", type=str, help="list of public extensions") args = parser.parse_args() pg_version = args.pg_version BUILD_TAG = args.BUILD_TAG + public_ext_list = args.public_extensions.split(",") ext_index = {} + library_index = {} EXT_PATH = Path("extensions") for extension in EXT_PATH.iterdir(): if extension.is_dir(): @@ -28,6 +60,21 @@ if __name__ == "__main__": "control_data": control_data, "archive_path": f"{BUILD_TAG}/{pg_version}/extensions/{extension.name}.tar.zst", } + elif extension.suffix == ".zst": + file_list = ( + str(subprocess.check_output(["tar", "tf", str(extension)]), "utf-8") + .strip() + .split("\n") + ) + for file in file_list: + if file.endswith(".so") and file.startswith("lib/"): + lib_name = file[4:-3] + library_index[lib_name] = extension.name.replace(".tar.zst", "") + all_data = { + "public_extensions": public_ext_list, + "library_index": library_index, + "extension_data": ext_index, + } with open("ext_index.json", "w") as f: - json.dump(ext_index, f) + json.dump(all_data, f) From 395bd9174efbeada5960dbc73498d15ef00f1841 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 27 Jul 2023 10:22:44 +0300 Subject: [PATCH 18/37] test: allow future image layer warning (#4818) https://neon-github-public-dev.s3.amazonaws.com/reports/main/5670795960/index.html#suites/837740b64a53e769572c4ed7b7a7eeeb/5a73fa4a69399123/retries Allow it because we are doing immediate stop. --- test_runner/regress/test_remote_storage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 7c04ed9017..43f0fb718f 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -348,6 +348,9 @@ def test_remote_storage_upload_queue_retries( # XXX: should vary this test to selectively fail just layer uploads, index uploads, deletions # but how do we validate the result after restore? + # these are always possible when we do an immediate stop. perhaps something with compacting has changed since. 
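A minimal sketch of how the generated ext_index.json, with the layout shown in the docstring above, could be consumed; the index path, library name, and helper name below are assumed placeholders for illustration:

# Illustrative sketch only: resolve a shared library name to the extension
# archive that provides it, based on the ext_index.json layout documented above.
import json

def archive_for_library(index_path: str, library_name: str) -> str:
    with open(index_path) as f:
        index = json.load(f)
    # library_index maps a library name (a lib/<name>.so entry with the
    # "lib/" prefix and ".so" suffix stripped) to the extension that ships it.
    ext_name = index["library_index"][library_name]
    # extension_data holds the control file contents and the S3-relative
    # path of the tar.zst archive for that extension.
    return index["extension_data"][ext_name]["archive_path"]

# e.g. archive_for_library("ext_index.json", "anon")
#      would return something like "<BUILD_TAG>/v15/extensions/anon.tar.zst"
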
+ env.pageserver.allowed_errors.append(r".*found future (delta|image) layer.*") + env.pageserver.stop(immediate=True) env.endpoints.stop_all() From 3e425c40c01eaecc6513cccaf6fe06d741241fde Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 27 Jul 2023 15:40:53 +0300 Subject: [PATCH 19/37] fix(compute_ctl): remove stray variable in error message (#4823) error is not needed because anyhow will have the cause chain reported anyways. related to test_neon_cli_basics being flaky, but doesn't actually fix any flakyness, just the obvious stray `{e}`. --- control_plane/src/endpoint.rs | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/control_plane/src/endpoint.rs b/control_plane/src/endpoint.rs index 6df6e47f29..8b5c88bf01 100644 --- a/control_plane/src/endpoint.rs +++ b/control_plane/src/endpoint.rs @@ -564,9 +564,7 @@ impl Endpoint { } Err(e) => { if attempt == MAX_ATTEMPTS { - return Err(e).context( - "timed out waiting to connect to compute_ctl HTTP; last error: {e}", - ); + return Err(e).context("timed out waiting to connect to compute_ctl HTTP"); } } } From cafbe8237ed6a1a79a08bd97657cf83b45e8b3c5 Mon Sep 17 00:00:00 2001 From: Dmitry Rodionov Date: Thu, 27 Jul 2023 15:52:36 +0300 Subject: [PATCH 20/37] Move tenant/delete.rs to tenant/timeline/delete.rs (#4825) move tenant/delete.rs to tenant/timeline/delete.rs to prepare for appearance of tenant deletion routines in tenant/delete.rs --- pageserver/src/tenant.rs | 3 +-- pageserver/src/tenant/mgr.rs | 2 +- pageserver/src/tenant/timeline.rs | 3 ++- pageserver/src/tenant/{ => timeline}/delete.rs | 14 ++++++++------ 4 files changed, 12 insertions(+), 10 deletions(-) rename pageserver/src/tenant/{ => timeline}/delete.rs (98%) diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs index a674eba98f..1b287ee10f 100644 --- a/pageserver/src/tenant.rs +++ b/pageserver/src/tenant.rs @@ -46,7 +46,6 @@ use std::sync::{Mutex, RwLock}; use std::time::{Duration, Instant}; use self::config::TenantConf; -use self::delete::DeleteTimelineFlow; use self::metadata::LoadMetadataError; use self::metadata::TimelineMetadata; use self::remote_timeline_client::RemoteTimelineClient; @@ -70,6 +69,7 @@ use crate::tenant::storage_layer::ImageLayer; use crate::tenant::storage_layer::Layer; use crate::InitializationOrder; +use crate::tenant::timeline::delete::DeleteTimelineFlow; use crate::tenant::timeline::uninit::cleanup_timeline_directory; use crate::virtual_file::VirtualFile; use crate::walredo::PostgresRedoManager; @@ -117,7 +117,6 @@ mod remote_timeline_client; pub mod storage_layer; pub mod config; -pub mod delete; pub mod mgr; pub mod tasks; pub mod upload_queue; diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs index aeecc88602..25c5e3f2e0 100644 --- a/pageserver/src/tenant/mgr.rs +++ b/pageserver/src/tenant/mgr.rs @@ -26,7 +26,7 @@ use crate::{InitializationOrder, IGNORED_TENANT_FILE_NAME}; use utils::fs_ext::PathExt; use utils::id::{TenantId, TimelineId}; -use super::delete::DeleteTimelineFlow; +use super::timeline::delete::DeleteTimelineFlow; /// The tenants known to the pageserver. /// The enum variants are used to distinguish the different states that the pageserver can be in. 
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 1002595909..4f9c0b08b8 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1,3 +1,4 @@ +pub mod delete; mod eviction_task; pub mod layer_manager; mod logical_size; @@ -79,6 +80,7 @@ use crate::METADATA_FILE_NAME; use crate::ZERO_PAGE; use crate::{is_temporary, task_mgr}; +use self::delete::DeleteTimelineFlow; pub(super) use self::eviction_task::EvictionTaskTenantState; use self::eviction_task::EvictionTaskTimelineState; use self::layer_manager::LayerManager; @@ -86,7 +88,6 @@ use self::logical_size::LogicalSize; use self::walreceiver::{WalReceiver, WalReceiverConf}; use super::config::TenantConf; -use super::delete::DeleteTimelineFlow; use super::remote_timeline_client::index::IndexPart; use super::remote_timeline_client::RemoteTimelineClient; use super::storage_layer::{ diff --git a/pageserver/src/tenant/delete.rs b/pageserver/src/tenant/timeline/delete.rs similarity index 98% rename from pageserver/src/tenant/delete.rs rename to pageserver/src/tenant/timeline/delete.rs index eeaaf15c32..d99ca2d292 100644 --- a/pageserver/src/tenant/delete.rs +++ b/pageserver/src/tenant/timeline/delete.rs @@ -15,15 +15,17 @@ use utils::{ use crate::{ config::PageServerConf, task_mgr::{self, TaskKind}, - tenant::{remote_timeline_client, DeleteTimelineError}, + tenant::{ + metadata::TimelineMetadata, + remote_timeline_client::{ + self, PersistIndexPartWithDeletedFlagError, RemoteTimelineClient, + }, + CreateTimelineCause, DeleteTimelineError, Tenant, + }, InitializationOrder, }; -use super::{ - metadata::TimelineMetadata, - remote_timeline_client::{PersistIndexPartWithDeletedFlagError, RemoteTimelineClient}, - CreateTimelineCause, Tenant, Timeline, -}; +use super::Timeline; /// Now that the Timeline is in Stopping state, request all the related tasks to shut down. async fn stop_tasks(timeline: &Timeline) -> Result<(), DeleteTimelineError> { From 67d2fa6dec41af962655198b13f9f8f3a7dc6ef4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Thu, 27 Jul 2023 19:13:58 +0300 Subject: [PATCH 21/37] test: fix `test_neon_cli_basics` flakyness without making it better for future (#4827) The test was starting two endpoints on the same branch as discovered by @petuhovskiy. The fix is to allow passing branch-name from the python side over to neon_local, which already accepted it. Split from #4824, which will handle making this more misuse resistant. 
--- test_runner/fixtures/neon_fixtures.py | 4 ++++ test_runner/regress/test_neon_local_cli.py | 6 ++++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index eafc061ab9..5e4553f52d 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1504,6 +1504,7 @@ class NeonCli(AbstractNeonCli): safekeepers: Optional[List[int]] = None, tenant_id: Optional[TenantId] = None, lsn: Optional[Lsn] = None, + branch_name: Optional[str] = None, ) -> "subprocess.CompletedProcess[str]": args = [ "endpoint", @@ -1517,8 +1518,11 @@ class NeonCli(AbstractNeonCli): args.append(f"--lsn={lsn}") args.extend(["--pg-port", str(pg_port)]) args.extend(["--http-port", str(http_port)]) + if safekeepers is not None: args.extend(["--safekeepers", (",".join(map(str, safekeepers)))]) + if branch_name is not None: + args.extend(["--branch-name", branch_name]) if endpoint_id is not None: args.append(endpoint_id) diff --git a/test_runner/regress/test_neon_local_cli.py b/test_runner/regress/test_neon_local_cli.py index 3314e7fbf6..6dd47de8cf 100644 --- a/test_runner/regress/test_neon_local_cli.py +++ b/test_runner/regress/test_neon_local_cli.py @@ -16,11 +16,13 @@ def test_neon_cli_basics(neon_env_builder: NeonEnvBuilder, port_distributor: Por endpoint_id="ep-basic-main", pg_port=pg_port, http_port=http_port ) - env.neon_cli.create_branch(new_branch_name="migration_check") + branch_name = "migration-check" + + env.neon_cli.create_branch(new_branch_name=branch_name) pg_port = port_distributor.get_port() http_port = port_distributor.get_port() env.neon_cli.endpoint_start( - endpoint_id="ep-migration_check", pg_port=pg_port, http_port=http_port + f"ep-{branch_name}", pg_port, http_port, branch_name=branch_name ) finally: env.neon_cli.stop() From 3681fc39fdc39beda5c97e6ab3ff3880e0fac4be Mon Sep 17 00:00:00 2001 From: Alek Westover Date: Fri, 28 Jul 2023 10:03:18 -0400 Subject: [PATCH 22/37] modify `relative_path_to_s3_object` logic for `prefix=None` (#4795) see added unit tests for more description --- libs/remote_storage/src/s3_bucket.rs | 80 +++++++++++++++++++++-- libs/remote_storage/tests/test_real_s3.rs | 2 +- test_runner/fixtures/neon_fixtures.py | 2 +- 3 files changed, 75 insertions(+), 9 deletions(-) diff --git a/libs/remote_storage/src/s3_bucket.rs b/libs/remote_storage/src/s3_bucket.rs index 43d818dfb9..37a6bf23e8 100644 --- a/libs/remote_storage/src/s3_bucket.rs +++ b/libs/remote_storage/src/s3_bucket.rs @@ -200,13 +200,17 @@ impl S3Bucket { ) } - fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { - let mut full_path = self.prefix_in_bucket.clone().unwrap_or_default(); - for segment in path.0.iter() { - full_path.push(REMOTE_STORAGE_PREFIX_SEPARATOR); - full_path.push_str(segment.to_str().unwrap_or_default()); + pub fn relative_path_to_s3_object(&self, path: &RemotePath) -> String { + assert_eq!(std::path::MAIN_SEPARATOR, REMOTE_STORAGE_PREFIX_SEPARATOR); + let path_string = path + .get_path() + .to_string_lossy() + .trim_end_matches(REMOTE_STORAGE_PREFIX_SEPARATOR) + .to_string(); + match &self.prefix_in_bucket { + Some(prefix) => prefix.clone() + "/" + &path_string, + None => path_string, } - full_path } async fn download_object(&self, request: GetObjectRequest) -> Result { @@ -427,10 +431,12 @@ impl RemoteStorage for S3Bucket { } async fn download(&self, from: &RemotePath) -> Result { + // if prefix is not none then download file `prefix/from` + // if prefix is none then 
download file `from` self.download_object(GetObjectRequest { bucket: self.bucket_name.clone(), key: self.relative_path_to_s3_object(from), - ..GetObjectRequest::default() + range: None, }) .await } @@ -523,3 +529,63 @@ impl RemoteStorage for S3Bucket { Ok(()) } } + +#[cfg(test)] +mod tests { + use std::num::NonZeroUsize; + use std::path::Path; + + use crate::{RemotePath, S3Bucket, S3Config}; + + #[test] + fn relative_path() { + let all_paths = vec!["", "some/path", "some/path/"]; + let all_paths: Vec = all_paths + .iter() + .map(|x| RemotePath::new(Path::new(x)).expect("bad path")) + .collect(); + let prefixes = [ + None, + Some(""), + Some("test/prefix"), + Some("test/prefix/"), + Some("/test/prefix/"), + ]; + let expected_outputs = vec![ + vec!["", "some/path", "some/path"], + vec!["/", "/some/path", "/some/path"], + vec![ + "test/prefix/", + "test/prefix/some/path", + "test/prefix/some/path", + ], + vec![ + "test/prefix/", + "test/prefix/some/path", + "test/prefix/some/path", + ], + vec![ + "test/prefix/", + "test/prefix/some/path", + "test/prefix/some/path", + ], + ]; + + for (prefix_idx, prefix) in prefixes.iter().enumerate() { + let config = S3Config { + bucket_name: "bucket".to_owned(), + bucket_region: "region".to_owned(), + prefix_in_bucket: prefix.map(str::to_string), + endpoint: None, + concurrency_limit: NonZeroUsize::new(100).unwrap(), + max_keys_per_list_response: Some(5), + }; + let storage = S3Bucket::new(&config).expect("remote storage init"); + for (test_path_idx, test_path) in all_paths.iter().enumerate() { + let result = storage.relative_path_to_s3_object(test_path); + let expected = expected_outputs[prefix_idx][test_path_idx]; + assert_eq!(result, expected); + } + } + } +} diff --git a/libs/remote_storage/tests/test_real_s3.rs b/libs/remote_storage/tests/test_real_s3.rs index 0917bab39c..982c01a9be 100644 --- a/libs/remote_storage/tests/test_real_s3.rs +++ b/libs/remote_storage/tests/test_real_s3.rs @@ -19,7 +19,7 @@ static LOGGING_DONE: OnceCell<()> = OnceCell::new(); const ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME: &str = "ENABLE_REAL_S3_REMOTE_STORAGE"; -const BASE_PREFIX: &str = "test/"; +const BASE_PREFIX: &str = "test"; /// Tests that S3 client can list all prefixes, even if the response come paginated and requires multiple S3 queries. /// Uses real S3 and requires [`ENABLE_REAL_S3_REMOTE_STORAGE_ENV_VAR_NAME`] and related S3 cred env vars specified. diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 5e4553f52d..fb78d69d67 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -542,7 +542,7 @@ class S3Storage: access_key: str secret_key: str endpoint: Optional[str] = None - prefix_in_bucket: Optional[str] = None + prefix_in_bucket: Optional[str] = "" def access_env_vars(self) -> Dict[str, str]: return { From 9fdd3a4a1e6e4c0059cef2c6218ef84fcef657d0 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 28 Jul 2023 16:00:55 +0100 Subject: [PATCH 23/37] test_runner: add amcheck to test_compatibility (#4772) Run `pg_amcheck` in forward and backward compatibility tests to catch some data corruption. 
## Summary of changes - Add amcheck compiling to Makefile - Add `pg_amcheck` to test_compatibility --- Makefile | 2 ++ test_runner/regress/test_compatibility.py | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/Makefile b/Makefile index ae979b8b4c..0768b64502 100644 --- a/Makefile +++ b/Makefile @@ -108,6 +108,8 @@ postgres-%: postgres-configure-% \ $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pg_buffercache install +@echo "Compiling pageinspect $*" $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/pageinspect install + +@echo "Compiling amcheck $*" + $(MAKE) -C $(POSTGRES_INSTALL_DIR)/build/$*/contrib/amcheck install .PHONY: postgres-clean-% postgres-clean-%: diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index 6c592d3249..f353c9d6c9 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -438,6 +438,14 @@ def check_neon_works( test_output_dir / "dump-from-wal.filediff", ) + # TODO: Run pg_amcheck unconditionally after the next release + try: + pg_bin.run(["psql", connstr, "--command", "CREATE EXTENSION IF NOT EXISTS amcheck"]) + except subprocess.CalledProcessError: + log.info("Extension amcheck is not available, skipping pg_amcheck") + else: + pg_bin.run_capture(["pg_amcheck", connstr, "--install-missing", "--verbose"]) + # Check that we can interract with the data pg_bin.run_capture(["pgbench", "--time=10", "--progress=2", connstr]) From 7374634845f27c4380c5b395ff1fc00aa3a5f4d4 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Fri, 28 Jul 2023 16:15:31 +0100 Subject: [PATCH 24/37] test_runner: clean up test_compatibility (#4770) ## Problem We have some amount of outdated logic in test_compatibility, that we don't need anymore. 
## Summary of changes - Remove `PR4425_ALLOWED_DIFF` and tune `dump_differs` method to accept allowed diffs in the future (a cleanup after https://github.com/neondatabase/neon/pull/4425) - Remote etcd related code (a cleanup after https://github.com/neondatabase/neon/pull/2733) - Don't set `preserve_database_files` --- test_runner/regress/test_compatibility.py | 111 +++++++--------------- 1 file changed, 33 insertions(+), 78 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index f353c9d6c9..dd2556350f 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -4,7 +4,7 @@ import shutil import subprocess import tempfile from pathlib import Path -from typing import Any, Optional +from typing import Any, List, Optional import pytest import toml # TODO: replace with tomllib for Python >= 3.11 @@ -14,7 +14,6 @@ from fixtures.neon_fixtures import ( NeonEnvBuilder, PgBin, PortDistributor, - parse_project_git_version_output, ) from fixtures.pageserver.http import PageserverHttpClient from fixtures.pageserver.utils import ( @@ -63,7 +62,6 @@ def test_create_snapshot( neon_env_builder.pg_version = pg_version neon_env_builder.num_safekeepers = 3 neon_env_builder.enable_local_fs_remote_storage() - neon_env_builder.preserve_database_files = True env = neon_env_builder.init_start() endpoint = env.endpoints.create_start("main") @@ -276,14 +274,6 @@ def prepare_snapshot( pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port( pageserver_config["listen_pg_addr"] ) - # since storage_broker these are overridden by neon_local during pageserver - # start; remove both to prevent unknown options during etcd -> - # storage_broker migration. TODO: remove once broker is released - pageserver_config.pop("broker_endpoint", None) - pageserver_config.pop("broker_endpoints", None) - etcd_broker_endpoints = [f"http://localhost:{port_distributor.get_port()}/"] - if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0": - pageserver_config["broker_endpoints"] = etcd_broker_endpoints # old etcd version # Older pageserver versions had just one `auth_type` setting. Now there # are separate settings for pg and http ports. We don't use authentication @@ -301,20 +291,8 @@ def prepare_snapshot( snapshot_config_toml = repo_dir / "config" snapshot_config = toml.load(snapshot_config_toml) - # Provide up/downgrade etcd <-> storage_broker to make forward/backward - # compatibility test happy. TODO: leave only the new part once broker is released. 
- if get_neon_version(neon_binpath) == "49da498f651b9f3a53b56c7c0697636d880ddfe0": - # old etcd version - snapshot_config["etcd_broker"] = { - "etcd_binary_path": shutil.which("etcd"), - "broker_endpoints": etcd_broker_endpoints, - } - snapshot_config.pop("broker", None) - else: - # new storage_broker version - broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}" - snapshot_config["broker"] = {"listen_addr": broker_listen_addr} - snapshot_config.pop("etcd_broker", None) + broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}" + snapshot_config["broker"] = {"listen_addr": broker_listen_addr} snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port( snapshot_config["pageserver"]["listen_http_addr"] @@ -350,12 +328,6 @@ def prepare_snapshot( ), f"there're files referencing `test_create_snapshot/repo`, this path should be replaced with {repo_dir}:\n{rv.stdout}" -# get git SHA of neon binary -def get_neon_version(neon_binpath: Path): - out = subprocess.check_output([neon_binpath / "neon_local", "--version"]).decode("utf-8") - return parse_project_git_version_output(out) - - def check_neon_works( repo_dir: Path, neon_target_binpath: Path, @@ -381,7 +353,6 @@ def check_neon_works( config.pg_version = pg_version config.initial_tenant = snapshot_config["default_tenant_id"] config.pg_distrib_dir = pg_distrib_dir - config.preserve_database_files = True # Use the "target" binaries to launch the storage nodes config_target = config @@ -453,10 +424,15 @@ def check_neon_works( assert not initial_dump_differs, "initial dump differs" -def dump_differs(first: Path, second: Path, output: Path) -> bool: +def dump_differs( + first: Path, second: Path, output: Path, allowed_diffs: Optional[List[str]] = None +) -> bool: """ Runs diff(1) command on two SQL dumps and write the output to the given output file. - Returns True if the dumps differ, False otherwise. + The function supports allowed diffs, if the diff is in the allowed_diffs list, it's not considered as a difference. + See the example of it in https://github.com/neondatabase/neon/pull/4425/files#diff-15c5bfdd1d5cc1411b9221091511a60dd13a9edf672bdfbb57dd2ef8bb7815d6 + + Returns True if the dumps differ and produced diff is not allowed, False otherwise (in most cases we want it to return False). 
""" with output.open("w") as stdout: @@ -474,51 +450,30 @@ def dump_differs(first: Path, second: Path, output: Path) -> bool: differs = res.returncode != 0 - # TODO: Remove after https://github.com/neondatabase/neon/pull/4425 is merged, and a couple of releases are made - if differs: - with tempfile.NamedTemporaryFile(mode="w") as tmp: - tmp.write(PR4425_ALLOWED_DIFF) - tmp.flush() + allowed_diffs = allowed_diffs or [] + if differs and len(allowed_diffs) > 0: + for allowed_diff in allowed_diffs: + with tempfile.NamedTemporaryFile(mode="w") as tmp: + tmp.write(allowed_diff) + tmp.flush() - allowed = subprocess.run( - [ - "diff", - "--unified", # Make diff output more readable - r"--ignore-matching-lines=^---", # Ignore diff headers - r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers - "--ignore-matching-lines=^@@", # Ignore diff blocks location - "--ignore-matching-lines=^ *$", # Ignore lines with only spaces - "--ignore-matching-lines=^ --.*", # Ignore the " --" lines for compatibility with PG14 - "--ignore-blank-lines", - str(output), - str(tmp.name), - ], - ) + allowed = subprocess.run( + [ + "diff", + "--unified", # Make diff output more readable + r"--ignore-matching-lines=^---", # Ignore diff headers + r"--ignore-matching-lines=^\+\+\+", # Ignore diff headers + "--ignore-matching-lines=^@@", # Ignore diff blocks location + "--ignore-matching-lines=^ *$", # Ignore lines with only spaces + "--ignore-matching-lines=^ --.*", # Ignore SQL comments in diff + "--ignore-blank-lines", + str(output), + str(tmp.name), + ], + ) - differs = allowed.returncode != 0 + differs = allowed.returncode != 0 + if not differs: + break return differs - - -PR4425_ALLOWED_DIFF = """ ---- /tmp/test_output/test_backward_compatibility[release-pg15]/compatibility_snapshot/dump.sql 2023-06-08 18:12:45.000000000 +0000 -+++ /tmp/test_output/test_backward_compatibility[release-pg15]/dump.sql 2023-06-13 07:25:35.211733653 +0000 -@@ -13,12 +13,20 @@ - - CREATE ROLE cloud_admin; - ALTER ROLE cloud_admin WITH SUPERUSER INHERIT CREATEROLE CREATEDB LOGIN REPLICATION BYPASSRLS; -+CREATE ROLE neon_superuser; -+ALTER ROLE neon_superuser WITH NOSUPERUSER INHERIT CREATEROLE CREATEDB NOLOGIN NOREPLICATION NOBYPASSRLS; - - -- - -- User Configurations - -- - - -+-- -+-- Role memberships -+-- -+ -+GRANT pg_read_all_data TO neon_superuser GRANTED BY cloud_admin; -+GRANT pg_write_all_data TO neon_superuser GRANTED BY cloud_admin; -""" From 2fbdf26094e3a22fc239889d50a93ffede751ac9 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Fri, 28 Jul 2023 21:32:01 +0300 Subject: [PATCH 25/37] test: raise timeout to avoid flakyness (#4832) 2s timeout was too tight for our CI, [evidence](https://neon-github-public-dev.s3.amazonaws.com/reports/main/5669956577/index.html#/testresult/6388e31182cc2d6e). 15s might be better. Also cleanup code no longer needed after #4204. 
--- test_runner/regress/test_ondemand_download.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py index b92981a371..ca43df350d 100644 --- a/test_runner/regress/test_ondemand_download.py +++ b/test_runner/regress/test_ondemand_download.py @@ -690,10 +690,6 @@ def test_ondemand_download_failure_to_replace( pageserver_http = env.pageserver.http_client() - lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"]) - - wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn) - # remove layers so that they will be redownloaded pageserver_http.tenant_detach(tenant_id) pageserver_http.tenant_attach(tenant_id) @@ -704,8 +700,10 @@ def test_ondemand_download_failure_to_replace( # requesting details with non-incremental size should trigger a download of the only layer # this will need to be adjusted if an index for logical sizes is ever implemented with pytest.raises(PageserverApiException): - # error message is not useful - pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2) + # PageserverApiException is expected because of the failpoint (timeline_detail building does something) + # ReadTimeout can happen on our busy CI, but it should not, because there is no more busylooping + # but should it be added back, we would wait for 15s here. + pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=15) actual_message = ".* ERROR .*layermap-replace-notfound" assert env.pageserver.log_contains(actual_message) is not None From 4338eed8c4e29f422e58279c8179181910c568a0 Mon Sep 17 00:00:00 2001 From: Konstantin Knizhnik Date: Fri, 28 Jul 2023 22:06:03 +0300 Subject: [PATCH 26/37] Make it possible to grant self perfmissions to self created roles (#4821) ## Problem See: https://neondb.slack.com/archives/C04USJQNLD6/p1689973957908869 ## Summary of changes Bump Postgres version ## Checklist before requesting a review - [ ] I have performed a self-review of my code. - [ ] If it is a core feature, I have added thorough tests. - [ ] Do we need to implement analytics? if so did you add the relevant metrics to the dashboard? - [ ] If this PR requires public announcement, mark it with /release-notes label and add several sentences in this section. 
## Checklist before merging - [ ] Do not forget to reformat commit message to not include the above checklist --------- Co-authored-by: Konstantin Knizhnik --- vendor/postgres-v14 | 2 +- vendor/postgres-v15 | 2 +- vendor/revisions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/vendor/postgres-v14 b/vendor/postgres-v14 index 12c5dc8281..ebedb34d01 160000 --- a/vendor/postgres-v14 +++ b/vendor/postgres-v14 @@ -1 +1 @@ -Subproject commit 12c5dc8281d20b5bd636e1097eea80a7bc609591 +Subproject commit ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f diff --git a/vendor/postgres-v15 b/vendor/postgres-v15 index e3fbfc4d14..1220c8a63f 160000 --- a/vendor/postgres-v15 +++ b/vendor/postgres-v15 @@ -1 +1 @@ -Subproject commit e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e +Subproject commit 1220c8a63f00101829f9222a5821fc084b4384c7 diff --git a/vendor/revisions.json b/vendor/revisions.json index 18da5900a8..f5d7428cd9 100644 --- a/vendor/revisions.json +++ b/vendor/revisions.json @@ -1,4 +1,4 @@ { - "postgres-v15": "e3fbfc4d143b2d3c3c1813ce747f8af35aa9405e", - "postgres-v14": "12c5dc8281d20b5bd636e1097eea80a7bc609591" + "postgres-v15": "1220c8a63f00101829f9222a5821fc084b4384c7", + "postgres-v14": "ebedb34d01c8ac9c31e8ea4628b9854103a1dc8f" } From a8f3540f3d35a701239583389474904444393785 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Fri, 28 Jul 2023 19:10:55 -0400 Subject: [PATCH 27/37] proxy: add unit test for wake_compute (#4819) ## Problem ref https://github.com/neondatabase/neon/pull/4721, ref https://github.com/neondatabase/neon/issues/4709 ## Summary of changes This PR adds unit tests for wake_compute. The patch adds a new variant `Test` to auth backends. When `wake_compute` is called, we will verify if it is the exact operation sequence we are expecting. The operation sequence now contains 3 more operations: `Wake`, `WakeRetry`, and `WakeFail`. The unit tests for proxy connects are now complete and I'll continue work on WebSocket e2e test in future PRs. --------- Signed-off-by: Alex Chi Z --- proxy/src/auth/backend.rs | 15 +++++ proxy/src/proxy.rs | 35 ++++++----- proxy/src/proxy/tests.rs | 120 +++++++++++++++++++++++++++++++------- 3 files changed, 134 insertions(+), 36 deletions(-) diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs index 9322e4f9ff..ff73f2b625 100644 --- a/proxy/src/auth/backend.rs +++ b/proxy/src/auth/backend.rs @@ -53,6 +53,12 @@ pub enum BackendType<'a, T> { Postgres(Cow<'a, console::provider::mock::Api>, T), /// Authentication via a web browser. Link(Cow<'a, url::ApiUrl>), + /// Test backend. 
+ Test(&'a dyn TestBackend), +} + +pub trait TestBackend: Send + Sync + 'static { + fn wake_compute(&self) -> Result; } impl std::fmt::Display for BackendType<'_, ()> { @@ -62,6 +68,7 @@ impl std::fmt::Display for BackendType<'_, ()> { Console(endpoint, _) => fmt.debug_tuple("Console").field(&endpoint.url()).finish(), Postgres(endpoint, _) => fmt.debug_tuple("Postgres").field(&endpoint.url()).finish(), Link(url) => fmt.debug_tuple("Link").field(&url.as_str()).finish(), + Test(_) => fmt.debug_tuple("Test").finish(), } } } @@ -75,6 +82,7 @@ impl BackendType<'_, T> { Console(c, x) => Console(Cow::Borrowed(c), x), Postgres(c, x) => Postgres(Cow::Borrowed(c), x), Link(c) => Link(Cow::Borrowed(c)), + Test(x) => Test(*x), } } } @@ -89,6 +97,7 @@ impl<'a, T> BackendType<'a, T> { Console(c, x) => Console(c, f(x)), Postgres(c, x) => Postgres(c, f(x)), Link(c) => Link(c), + Test(x) => Test(x), } } } @@ -102,6 +111,7 @@ impl<'a, T, E> BackendType<'a, Result> { Console(c, x) => x.map(|x| Console(c, x)), Postgres(c, x) => x.map(|x| Postgres(c, x)), Link(c) => Ok(Link(c)), + Test(x) => Ok(Test(x)), } } } @@ -147,6 +157,7 @@ impl BackendType<'_, ClientCredentials<'_>> { Console(_, creds) => creds.project.clone(), Postgres(_, creds) => creds.project.clone(), Link(_) => Some("link".to_owned()), + Test(_) => Some("test".to_owned()), } } /// Authenticate the client via the requested backend, possibly using credentials. @@ -188,6 +199,9 @@ impl BackendType<'_, ClientCredentials<'_>> { .await? .map(CachedNodeInfo::new_uncached) } + Test(_) => { + unreachable!("this function should never be called in the test backend") + } }; info!("user successfully authenticated"); @@ -206,6 +220,7 @@ impl BackendType<'_, ClientCredentials<'_>> { Console(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await, Postgres(api, creds) => api.wake_compute(extra, creds).map_ok(Some).await, Link(_) => Ok(None), + Test(x) => x.wake_compute().map(Some), } } } diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index b0c5a41efb..6a0a65578c 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -6,7 +6,7 @@ use crate::{ cancellation::{self, CancelMap}, compute::{self, PostgresConnection}, config::{ProxyConfig, TlsConfig}, - console::{self, errors::WakeComputeError, messages::MetricsAuxInfo}, + console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api}, stream::{PqStream, Stream}, }; use anyhow::{bail, Context}; @@ -413,18 +413,18 @@ where loop { match state { ConnectionState::Invalid(config, err) => { + info!("compute node's state has likely changed; requesting a wake-up"); + let wake_res = match creds { - auth::BackendType::Console(api, creds) => { - try_wake(api.as_ref(), extra, creds).await - } - auth::BackendType::Postgres(api, creds) => { - try_wake(api.as_ref(), extra, creds).await - } + auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await, + auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await, // nothing to do? 
auth::BackendType::Link(_) => return Err(err.into()), + // test backend + auth::BackendType::Test(x) => x.wake_compute(), }; - match wake_res { + match handle_try_wake(wake_res) { // there was an error communicating with the control plane Err(e) => return Err(e.into()), // failed to wake up but we can continue to retry @@ -478,13 +478,10 @@ where /// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable /// * Returns Ok(Break(node)) if the wakeup succeeded /// * Returns Err(e) if there was an error -pub async fn try_wake( - api: &impl console::Api, - extra: &console::ConsoleReqExtra<'_>, - creds: &auth::ClientCredentials<'_>, +fn handle_try_wake( + result: Result, ) -> Result, WakeComputeError> { - info!("compute node's state has likely changed; requesting a wake-up"); - match api.wake_compute(extra, creds).await { + match result { Err(err) => match &err { WakeComputeError::ApiError(api) if api.could_retry() => Ok(ControlFlow::Continue(err)), _ => Err(err), @@ -494,6 +491,16 @@ pub async fn try_wake( } } +/// Attempts to wake up the compute node. +pub async fn try_wake( + api: &impl console::Api, + extra: &console::ConsoleReqExtra<'_>, + creds: &auth::ClientCredentials<'_>, +) -> Result, WakeComputeError> { + info!("compute node's state has likely changed; requesting a wake-up"); + handle_try_wake(api.wake_compute(extra, creds).await) +} + pub trait ShouldRetry { fn could_retry(&self) -> bool; fn should_retry(&self, num_retries: u32) -> bool { diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 565f86eecc..27e14d442c 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -1,10 +1,10 @@ //! A group of high-level tests for connection establishing logic and auth. -use std::borrow::Cow; - +//! 
use super::*; +use crate::auth::backend::TestBackend; use crate::auth::ClientCredentials; use crate::console::{CachedNodeInfo, NodeInfo}; -use crate::{auth, sasl, scram}; +use crate::{auth, http, sasl, scram}; use async_trait::async_trait; use rstest::rstest; use tokio_postgres::config::SslMode; @@ -309,8 +309,11 @@ fn connect_compute_total_wait() { assert!(total_wait > tokio::time::Duration::from_secs(10)); } -#[derive(Clone, Copy)] +#[derive(Clone, Copy, Debug)] enum ConnectAction { + Wake, + WakeFail, + WakeRetry, Connect, Retry, Fail, @@ -321,6 +324,17 @@ struct TestConnectMechanism { sequence: Vec, } +impl TestConnectMechanism { + fn verify(&self) { + let counter = self.counter.lock().unwrap(); + assert_eq!( + *counter, + self.sequence.len(), + "sequence does not proceed to the end" + ); + } +} + impl TestConnectMechanism { fn new(sequence: Vec) -> Self { Self { @@ -370,30 +384,63 @@ impl ConnectMechanism for TestConnectMechanism { ConnectAction::Connect => Ok(TestConnection), ConnectAction::Retry => Err(TestConnectError { retryable: true }), ConnectAction::Fail => Err(TestConnectError { retryable: false }), + x => panic!("expecting action {:?}, connect is called instead", x), } } fn update_connect_config(&self, _conf: &mut compute::ConnCfg) {} } -fn helper_create_connect_info() -> ( - CachedNodeInfo, - console::ConsoleReqExtra<'static>, - auth::BackendType<'static, ClientCredentials<'static>>, -) { +impl TestBackend for TestConnectMechanism { + fn wake_compute(&self) -> Result { + let mut counter = self.counter.lock().unwrap(); + let action = self.sequence[*counter]; + *counter += 1; + match action { + ConnectAction::Wake => Ok(helper_create_cached_node_info()), + ConnectAction::WakeFail => { + let err = console::errors::ApiError::Console { + status: http::StatusCode::FORBIDDEN, + text: "TEST".into(), + }; + assert!(!err.could_retry()); + Err(console::errors::WakeComputeError::ApiError(err)) + } + ConnectAction::WakeRetry => { + let err = console::errors::ApiError::Console { + status: http::StatusCode::INTERNAL_SERVER_ERROR, + text: "TEST".into(), + }; + assert!(err.could_retry()); + Err(console::errors::WakeComputeError::ApiError(err)) + } + x => panic!("expecting action {:?}, wake_compute is called instead", x), + } + } +} + +fn helper_create_cached_node_info() -> CachedNodeInfo { let node = NodeInfo { config: compute::ConnCfg::new(), aux: Default::default(), allow_self_signed_compute: false, }; - let cache = CachedNodeInfo::new_uncached(node); + CachedNodeInfo::new_uncached(node) +} + +fn helper_create_connect_info( + mechanism: &TestConnectMechanism, +) -> ( + CachedNodeInfo, + console::ConsoleReqExtra<'static>, + auth::BackendType<'_, ClientCredentials<'static>>, +) { + let cache = helper_create_cached_node_info(); let extra = console::ConsoleReqExtra { session_id: uuid::Uuid::new_v4(), application_name: Some("TEST"), }; - let url = "https://TEST_URL".parse().unwrap(); - let api = console::provider::mock::Api::new(url); - let creds = auth::BackendType::Postgres(Cow::Owned(api), ClientCredentials::new_noop()); + let creds = auth::BackendType::Test(mechanism); (cache, extra, creds) } @@ -401,42 +448,46 @@ fn helper_create_connect_info() -> ( async fn connect_to_compute_success() { use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![Connect]); - let (cache, extra, creds) = helper_create_connect_info(); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap(); + mechanism.verify(); } 
#[tokio::test] async fn connect_to_compute_retry() { use ConnectAction::*; - let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Connect]); - let (cache, extra, creds) = helper_create_connect_info(); + let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Connect]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap(); + mechanism.verify(); } /// Test that we don't retry if the error is not retryable. #[tokio::test] async fn connect_to_compute_non_retry_1() { use ConnectAction::*; - let mechanism = TestConnectMechanism::new(vec![Retry, Retry, Fail]); - let (cache, extra, creds) = helper_create_connect_info(); + let mechanism = TestConnectMechanism::new(vec![Retry, Wake, Retry, Fail]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap_err(); + mechanism.verify(); } /// Even for non-retryable errors, we should retry at least once. #[tokio::test] async fn connect_to_compute_non_retry_2() { use ConnectAction::*; - let mechanism = TestConnectMechanism::new(vec![Fail, Retry, Connect]); - let (cache, extra, creds) = helper_create_connect_info(); + let mechanism = TestConnectMechanism::new(vec![Fail, Wake, Retry, Connect]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap(); + mechanism.verify(); } /// Retry for at most `NUM_RETRIES_CONNECT` times. @@ -445,11 +496,36 @@ async fn connect_to_compute_non_retry_3() { assert_eq!(NUM_RETRIES_CONNECT, 10); use ConnectAction::*; let mechanism = TestConnectMechanism::new(vec![ - Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, + Retry, Wake, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, Retry, /* the 11th time */ Retry, ]); - let (cache, extra, creds) = helper_create_connect_info(); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); connect_to_compute(&mechanism, cache, &extra, &creds) .await .unwrap_err(); + mechanism.verify(); +} + +/// Should retry wake compute. +#[tokio::test] +async fn wake_retry() { + use ConnectAction::*; + let mechanism = TestConnectMechanism::new(vec![Retry, WakeRetry, Wake, Connect]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); + connect_to_compute(&mechanism, cache, &extra, &creds) + .await + .unwrap(); + mechanism.verify(); +} + +/// Wake failed with a non-retryable error. +#[tokio::test] +async fn wake_non_retry() { + use ConnectAction::*; + let mechanism = TestConnectMechanism::new(vec![Retry, WakeFail]); + let (cache, extra, creds) = helper_create_connect_info(&mechanism); + connect_to_compute(&mechanism, cache, &extra, &creds) + .await + .unwrap_err(); + mechanism.verify(); } From 89ee8f202835a3889dfb327bd275f6df9ebd5cc4 Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 31 Jul 2023 10:43:12 +0300 Subject: [PATCH 28/37] fix: demote warnings, fix flakyness (#4837) `WARN ... found future (image|delta) layer` are not actionable log lines. They don't need to be warnings. `info!` is enough. This also fixes some known but not tracked flakyness in [`test_remote_timeline_client_calls_started_metric`][evidence]. [evidence]: https://neon-github-public-dev.s3.amazonaws.com/reports/pr-4829/5683495367/index.html#/testresult/34fe79e24729618b Closes #3369. Closes #4473. 
--- pageserver/src/tenant/timeline.rs | 8 ++++---- test_runner/regress/test_gc_cutoff.py | 4 ---- test_runner/regress/test_pageserver_restart.py | 4 ---- test_runner/regress/test_recovery.py | 4 ---- test_runner/regress/test_remote_storage.py | 3 --- 5 files changed, 4 insertions(+), 19 deletions(-) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 4f9c0b08b8..c7974e9c00 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -1600,7 +1600,7 @@ impl Timeline { if let Some(imgfilename) = ImageFileName::parse_str(&fname) { // create an ImageLayer struct for each image file. if imgfilename.lsn > disk_consistent_lsn { - warn!( + info!( "found future image layer {} on timeline {} disk_consistent_lsn is {}", imgfilename, self.timeline_id, disk_consistent_lsn ); @@ -1632,7 +1632,7 @@ impl Timeline { // is 102, then it might not have been fully flushed to disk // before crash. if deltafilename.lsn_range.end > disk_consistent_lsn + 1 { - warn!( + info!( "found future delta layer {} on timeline {} disk_consistent_lsn is {}", deltafilename, self.timeline_id, disk_consistent_lsn ); @@ -1774,7 +1774,7 @@ impl Timeline { match remote_layer_name { LayerFileName::Image(imgfilename) => { if imgfilename.lsn > up_to_date_disk_consistent_lsn { - warn!( + info!( "found future image layer {} on timeline {} remote_consistent_lsn is {}", imgfilename, self.timeline_id, up_to_date_disk_consistent_lsn ); @@ -1799,7 +1799,7 @@ impl Timeline { // is 102, then it might not have been fully flushed to disk // before crash. if deltafilename.lsn_range.end > up_to_date_disk_consistent_lsn + 1 { - warn!( + info!( "found future delta layer {} on timeline {} remote_consistent_lsn is {}", deltafilename, self.timeline_id, up_to_date_disk_consistent_lsn ); diff --git a/test_runner/regress/test_gc_cutoff.py b/test_runner/regress/test_gc_cutoff.py index 6e2a0622f1..f58abb4575 100644 --- a/test_runner/regress/test_gc_cutoff.py +++ b/test_runner/regress/test_gc_cutoff.py @@ -14,10 +14,6 @@ from fixtures.neon_fixtures import NeonEnvBuilder, PgBin def test_gc_cutoff(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin): env = neon_env_builder.init_start() - # These warnings are expected, when the pageserver is restarted abruptly - env.pageserver.allowed_errors.append(".*found future image layer.*") - env.pageserver.allowed_errors.append(".*found future delta layer.*") - pageserver_http = env.pageserver.http_client() # Use aggressive GC and checkpoint settings, so that we also exercise GC during the test diff --git a/test_runner/regress/test_pageserver_restart.py b/test_runner/regress/test_pageserver_restart.py index 6da5503fb1..b8ddbe3ec1 100644 --- a/test_runner/regress/test_pageserver_restart.py +++ b/test_runner/regress/test_pageserver_restart.py @@ -72,10 +72,6 @@ def test_pageserver_restart(neon_env_builder: NeonEnvBuilder): def test_pageserver_chaos(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() - # These warnings are expected, when the pageserver is restarted abruptly - env.pageserver.allowed_errors.append(".*found future image layer.*") - env.pageserver.allowed_errors.append(".*found future delta layer.*") - # Use a tiny checkpoint distance, to create a lot of layers quickly. # That allows us to stress the compaction and layer flushing logic more. 
tenant, _ = env.neon_cli.create_tenant( diff --git a/test_runner/regress/test_recovery.py b/test_runner/regress/test_recovery.py index 552825cf08..9d7a4a8fd6 100644 --- a/test_runner/regress/test_recovery.py +++ b/test_runner/regress/test_recovery.py @@ -15,10 +15,6 @@ def test_pageserver_recovery(neon_env_builder: NeonEnvBuilder): env = neon_env_builder.init_start() env.pageserver.is_testing_enabled_or_skip() - # These warnings are expected, when the pageserver is restarted abruptly - env.pageserver.allowed_errors.append(".*found future delta layer.*") - env.pageserver.allowed_errors.append(".*found future image layer.*") - # Create a branch for us env.neon_cli.create_branch("test_pageserver_recovery", "main") diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py index 43f0fb718f..7c04ed9017 100644 --- a/test_runner/regress/test_remote_storage.py +++ b/test_runner/regress/test_remote_storage.py @@ -348,9 +348,6 @@ def test_remote_storage_upload_queue_retries( # XXX: should vary this test to selectively fail just layer uploads, index uploads, deletions # but how do we validate the result after restore? - # these are always possible when we do an immediate stop. perhaps something with compacting has changed since. - env.pageserver.allowed_errors.append(r".*found future (delta|image) layer.*") - env.pageserver.stop(immediate=True) env.endpoints.stop_all() From e5183f85dc3a6b851429c183dfa1e21ed109ba6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Arpad=20M=C3=BCller?= Date: Mon, 31 Jul 2023 12:52:29 +0200 Subject: [PATCH 29/37] Make DiskBtreeReader::dump async (#4838) ## Problem `DiskBtreeReader::dump` calls `read_blk` internally, which we want to make async in the future. As it is currently relying on recursion, and async doesn't like recursion, we want to find an alternative to that and instead traverse the tree using a loop and a manual stack. ## Summary of changes * Make `DiskBtreeReader::dump` and all the places calling it async * Make `DiskBtreeReader::dump` non-recursive internally and use a stack instead. It now deparses the node in each iteration, which isn't optimal, but on the other hand it's hard to store the node as it is referencing the buffer. Self referential data are hard in Rust. For a dumping function, speed isn't a priority so we deparse the node multiple times now (up to branching factor many times). 
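As a rough, standalone sketch of the recursion-to-stack rewrite described in the bullet above (a toy `Node` type, not the real `OnDiskNode`/`BlockReader` machinery, so the per-frame `path`/`child_idx`/`key_off` bookkeeping of the actual change is omitted):

```rust
// Toy illustration only: the same tree dump written recursively and then
// iteratively with an explicit stack, which is the shape that plays nicely
// with async fns because it needs no recursion.
#[derive(Debug)]
struct Node {
    key: u32,
    children: Vec<Node>,
}

// Old shape: straightforward recursion.
fn dump_recursive(node: &Node, depth: usize) {
    println!("{:indent$}{}", "", node.key, indent = depth * 2);
    for child in &node.children {
        dump_recursive(child, depth + 1);
    }
}

// New shape: a manual stack of (node, depth) frames.
fn dump_iterative(root: &Node) {
    let mut stack = vec![(root, 0usize)];
    while let Some((node, depth)) = stack.pop() {
        println!("{:indent$}{}", "", node.key, indent = depth * 2);
        // Push children in reverse so they are printed in their original order.
        for child in node.children.iter().rev() {
            stack.push((child, depth + 1));
        }
    }
}

fn main() {
    let tree = Node {
        key: 1,
        children: vec![
            Node { key: 2, children: vec![Node { key: 4, children: vec![] }] },
            Node { key: 3, children: vec![] },
        ],
    };
    dump_recursive(&tree, 0); // prints 1, 2, 4, 3 with indentation
    dump_iterative(&tree);    // prints the same sequence
}
```

The real `dump` carries extra state on each frame (`path`, `child_idx`, `key_off`) and revisits a block once per remaining child, which is where the repeated deparsing mentioned above comes from.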
Part of https://github.com/neondatabase/neon/issues/4743 I have verified that output is unchanged by comparing the output of this command both before and after this patch: ``` cargo test -p pageserver -- particular_data --nocapture ``` --- pageserver/src/tenant/disk_btree.rs | 67 ++++++++++--------- .../src/tenant/storage_layer/delta_layer.rs | 2 +- .../src/tenant/storage_layer/image_layer.rs | 2 +- 3 files changed, 37 insertions(+), 34 deletions(-) diff --git a/pageserver/src/tenant/disk_btree.rs b/pageserver/src/tenant/disk_btree.rs index 734409a619..518286ddb0 100644 --- a/pageserver/src/tenant/disk_btree.rs +++ b/pageserver/src/tenant/disk_btree.rs @@ -390,39 +390,42 @@ where } #[allow(dead_code)] - pub fn dump(&self) -> Result<()> { - self.dump_recurse(self.root_blk, &[], 0) - } + pub async fn dump(&self) -> Result<()> { + let mut stack = Vec::new(); - fn dump_recurse(&self, blknum: u32, path: &[u8], depth: usize) -> Result<()> { - let blk = self.reader.read_blk(self.start_blk + blknum)?; - let buf: &[u8] = blk.as_ref(); + stack.push((self.root_blk, String::new(), 0, 0, 0)); - let node = OnDiskNode::::deparse(buf)?; + while let Some((blknum, path, depth, child_idx, key_off)) = stack.pop() { + let blk = self.reader.read_blk(self.start_blk + blknum)?; + let buf: &[u8] = blk.as_ref(); + let node = OnDiskNode::::deparse(buf)?; - print!("{:indent$}", "", indent = depth * 2); - println!( - "blk #{}: path {}: prefix {}, suffix_len {}", - blknum, - hex::encode(path), - hex::encode(node.prefix), - node.suffix_len - ); + if child_idx == 0 { + print!("{:indent$}", "", indent = depth * 2); + let path_prefix = stack + .iter() + .map(|(_blknum, path, ..)| path.as_str()) + .collect::(); + println!( + "blk #{blknum}: path {path_prefix}{path}: prefix {}, suffix_len {}", + hex::encode(node.prefix), + node.suffix_len + ); + } - let mut idx = 0; - let mut key_off = 0; - while idx < node.num_children { + if child_idx + 1 < node.num_children { + let key_off = key_off + node.suffix_len as usize; + stack.push((blknum, path.clone(), depth, child_idx + 1, key_off)); + } let key = &node.keys[key_off..key_off + node.suffix_len as usize]; - let val = node.value(idx as usize); + let val = node.value(child_idx as usize); + print!("{:indent$}", "", indent = depth * 2 + 2); println!("{}: {}", hex::encode(key), hex::encode(val.0)); if node.level > 0 { - let child_path = [path, node.prefix].concat(); - self.dump_recurse(val.to_blknum(), &child_path, depth + 1)?; + stack.push((val.to_blknum(), hex::encode(node.prefix), depth + 1, 0, 0)); } - idx += 1; - key_off += node.suffix_len as usize; } Ok(()) } @@ -754,8 +757,8 @@ mod tests { } } - #[test] - fn basic() -> Result<()> { + #[tokio::test] + async fn basic() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 6>::new(&mut disk); @@ -775,7 +778,7 @@ mod tests { let reader = DiskBtreeReader::new(0, root_offset, disk); - reader.dump()?; + reader.dump().await?; // Test the `get` function on all the keys. 
for (key, val) in all_data.iter() { @@ -835,8 +838,8 @@ mod tests { Ok(()) } - #[test] - fn lots_of_keys() -> Result<()> { + #[tokio::test] + async fn lots_of_keys() -> Result<()> { let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 8>::new(&mut disk); @@ -856,7 +859,7 @@ mod tests { let reader = DiskBtreeReader::new(0, root_offset, disk); - reader.dump()?; + reader.dump().await?; use std::sync::Mutex; @@ -994,8 +997,8 @@ mod tests { /// /// This test contains a particular data set, see disk_btree_test_data.rs /// - #[test] - fn particular_data() -> Result<()> { + #[tokio::test] + async fn particular_data() -> Result<()> { // Build a tree from it let mut disk = TestDisk::new(); let mut writer = DiskBtreeBuilder::<_, 26>::new(&mut disk); @@ -1022,7 +1025,7 @@ mod tests { })?; assert_eq!(count, disk_btree_test_data::TEST_DATA.len()); - reader.dump()?; + reader.dump().await?; Ok(()) } diff --git a/pageserver/src/tenant/storage_layer/delta_layer.rs b/pageserver/src/tenant/storage_layer/delta_layer.rs index 9585c04120..7574554b4e 100644 --- a/pageserver/src/tenant/storage_layer/delta_layer.rs +++ b/pageserver/src/tenant/storage_layer/delta_layer.rs @@ -256,7 +256,7 @@ impl Layer for DeltaLayer { file, ); - tree_reader.dump()?; + tree_reader.dump().await?; let mut cursor = file.block_cursor(); diff --git a/pageserver/src/tenant/storage_layer/image_layer.rs b/pageserver/src/tenant/storage_layer/image_layer.rs index f758347d9a..a5b9d8834e 100644 --- a/pageserver/src/tenant/storage_layer/image_layer.rs +++ b/pageserver/src/tenant/storage_layer/image_layer.rs @@ -175,7 +175,7 @@ impl Layer for ImageLayer { let tree_reader = DiskBtreeReader::<_, KEY_SIZE>::new(inner.index_start_blk, inner.index_root_blk, file); - tree_reader.dump()?; + tree_reader.dump().await?; tree_reader.visit(&[0u8; KEY_SIZE], VisitDirection::Forwards, |key, value| { println!("key: {} offset {}", hex::encode(key), value); From f0ad603693077cb4c4f9ae070248c6382c1219ff Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 31 Jul 2023 12:51:18 +0100 Subject: [PATCH 30/37] pageserver: add unit test for deleted_at in IndexPart (#4844) ## Problem Existing IndexPart unit tests only exercised the version 1 format (i.e. without deleted_at set). ## Summary of changes Add a test that sets version to 2, and sets a value for deleted_at. 
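For illustration only, the v1-versus-v2 behaviour the test pins down comes down to an optional timestamp field. This is a simplified, hypothetical struct (not the real `IndexPart` fields), assuming `serde`, `serde_json`, and `chrono` with its `serde` feature:

```rust
use chrono::NaiveDateTime;
use serde::Deserialize;

// Hypothetical, cut-down stand-in for IndexPart: only the fields needed to
// show that `deleted_at` is absent in version 1 and present in version 2.
#[derive(Debug, Deserialize)]
struct IndexPartSketch {
    version: u32,
    #[serde(default)]
    deleted_at: Option<NaiveDateTime>,
}

fn main() {
    let v1 = r#"{ "version": 1 }"#;
    let v2 = r#"{ "version": 2, "deleted_at": "2023-07-31T09:00:00.123" }"#;

    let old: IndexPartSketch = serde_json::from_str(v1).unwrap();
    let new: IndexPartSketch = serde_json::from_str(v2).unwrap();

    assert!(old.deleted_at.is_none());
    assert_eq!(
        new.deleted_at,
        Some(
            NaiveDateTime::parse_from_str("2023-07-31T09:00:00.123", "%Y-%m-%dT%H:%M:%S%.f")
                .unwrap()
        )
    );
    println!("{old:?}\n{new:?}");
}
```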
Closes https://github.com/neondatabase/neon/issues/4162 --- .../tenant/remote_timeline_client/index.rs | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs index c3f6dcadec..fdbf26e6ae 100644 --- a/pageserver/src/tenant/remote_timeline_client/index.rs +++ b/pageserver/src/tenant/remote_timeline_client/index.rs @@ -223,6 +223,45 @@ mod tests { assert_eq!(part, expected); } + #[test] + fn v2_indexpart_is_parsed_with_deleted_at() { + let example = r#"{ + "version":2, + "timeline_layers":["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9"], + "missing_layers":["This shouldn't fail deserialization"], + "layer_metadata":{ + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9": { "file_size": 25600000 }, + "000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51": { "file_size": 9007199254741001 } + }, + "disk_consistent_lsn":"0/16960E8", + "metadata_bytes":[112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0], + "deleted_at": "2023-07-31T09:00:00.123" + }"#; + + let expected = IndexPart { + // note this is not verified, could be anything, but exists for humans debugging.. could be the git version instead? + version: 2, + timeline_layers: HashSet::from(["000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap()]), + layer_metadata: HashMap::from([ + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__0000000001696070-00000000016960E9".parse().unwrap(), IndexLayerMetadata { + file_size: 25600000, + }), + ("000000000000000000000000000000000000-FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF__00000000016B59D8-00000000016B5A51".parse().unwrap(), IndexLayerMetadata { + // serde_json should always parse this but this might be a double with jq for + // example. 
+ file_size: 9007199254741001, + }) + ]), + disk_consistent_lsn: "0/16960E8".parse::().unwrap(), + metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(), + deleted_at: Some(chrono::NaiveDateTime::parse_from_str( + "2023-07-31T09:00:00.123000000", "%Y-%m-%dT%H:%M:%S.%f").unwrap()) + }; + + let part = serde_json::from_str::(example).unwrap(); + assert_eq!(part, expected); + } + #[test] fn empty_layers_are_parsed() { let empty_layers_json = r#"{ From eb7860312179ce8a9275dc50b90626747bfe35e5 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Mon, 31 Jul 2023 14:30:24 +0100 Subject: [PATCH 31/37] proxy: div by zero (#4845) ## Problem 1. In the CacheInvalid state loop, we weren't checking the `num_retries`. If this managed to get up to `32`, the retry_after procedure would compute 2^32 which would overflow to 0 and trigger a div by zero 2. When fixing the above, I started working on a flow diagram for the state machine logic and realised it was more complex than it had to be: a. We start in a `Cached` state b. `Cached`: call `connect_once`. After the first connect_once error, we always move to the `CacheInvalid` state, otherwise, we return the connection. c. `CacheInvalid`: we attempt to `wake_compute` and we either switch to Cached or we retry this step (or we error). d. `Cached`: call `connect_once`. We either retry this step or we have a connection (or we error) - After num_retries > 1 we never switch back to `CacheInvalid`. ## Summary of changes 1. Insert a `num_retries` check in the `handle_try_wake` procedure. Also using floats in the retry_after procedure to prevent the overflow entirely 2. Refactor connect_to_compute to be more linear in design. 
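A standalone sketch of the arithmetic behind point 1 above, assuming a base wait of 100 ms (the real `BASE_RETRY_WAIT_DURATION` in `proxy.rs` may differ): with `u32` powers the divisor wraps to zero once `num_retries` reaches 32, while the floating-point form of the same 1.5x growth factor cannot.

```rust
use std::time::Duration;

// Assumed value for the sketch; the real constant may differ.
const BASE_RETRY_WAIT_DURATION: Duration = Duration::from_millis(100);

// Old shape: integer 3^n / 2^n. `wrapping_pow` makes the wrap explicit here;
// plain `u32::pow` wraps the same way in release builds (overflow checks off),
// so 2^32 becomes 0 and the Duration division panics with a divide-by-zero.
fn retry_after_old(num_retries: u32) -> Duration {
    BASE_RETRY_WAIT_DURATION * 3_u32.wrapping_pow(num_retries) / 2_u32.wrapping_pow(num_retries)
}

// New shape: the same 1.5x growth factor in floating point, which can grow
// large but can never turn the computation into a division by zero.
fn retry_after_new(num_retries: u32) -> Duration {
    BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32))
}

fn main() {
    for n in [1, 2, 5, 10] {
        println!("n={n}: old={:?}, new={:?}", retry_after_old(n), retry_after_new(n));
    }
    // retry_after_old(32) would panic: 2_u32.wrapping_pow(32) == 0.
    println!("n=32: new={:?}", retry_after_new(32));
}
```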
--- proxy/src/auth/backend/classic.rs | 11 +-- proxy/src/proxy.rs | 140 +++++++++++++----------------- proxy/src/proxy/tests.rs | 2 +- 3 files changed, 66 insertions(+), 87 deletions(-) diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs index 4d20b9bf90..5ed0f747c2 100644 --- a/proxy/src/auth/backend/classic.rs +++ b/proxy/src/auth/backend/classic.rs @@ -5,7 +5,7 @@ use crate::{ auth::{self, AuthFlow, ClientCredentials}, compute, console::{self, AuthInfo, CachedNodeInfo, ConsoleReqExtra}, - proxy::{try_wake, NUM_RETRIES_CONNECT}, + proxy::handle_try_wake, sasl, scram, stream::PqStream, }; @@ -51,14 +51,15 @@ pub(super) async fn authenticate( } }; + info!("compute node's state has likely changed; requesting a wake-up"); let mut num_retries = 0; let mut node = loop { - num_retries += 1; - match try_wake(api, extra, creds).await? { + let wake_res = api.wake_compute(extra, creds).await; + match handle_try_wake(wake_res, num_retries)? { + ControlFlow::Continue(_) => num_retries += 1, ControlFlow::Break(n) => break n, - ControlFlow::Continue(_) if num_retries < NUM_RETRIES_CONNECT => continue, - ControlFlow::Continue(e) => return Err(e.into()), } + info!(num_retries, "retrying wake compute"); }; if let Some(keys) = scram_keys { use tokio_postgres::config::AuthKeys; diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index 6a0a65578c..5f59957b2d 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -347,11 +347,6 @@ async fn connect_to_compute_once( .await } -enum ConnectionState { - Cached(console::CachedNodeInfo), - Invalid(compute::ConnCfg, E), -} - #[async_trait] pub trait ConnectMechanism { type Connection; @@ -407,70 +402,67 @@ where mechanism.update_connect_config(&mut node_info.config); - let mut num_retries = 0; - let mut state = ConnectionState::::Cached(node_info); + // try once + let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + (invalidate_cache(node_info), e) + } + }; - loop { - match state { - ConnectionState::Invalid(config, err) => { - info!("compute node's state has likely changed; requesting a wake-up"); + let mut num_retries = 1; - let wake_res = match creds { - auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await, - auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await, - // nothing to do? - auth::BackendType::Link(_) => return Err(err.into()), - // test backend - auth::BackendType::Test(x) => x.wake_compute(), - }; + // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node + info!("compute node's state has likely changed; requesting a wake-up"); + let node_info = loop { + let wake_res = match creds { + auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await, + auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await, + // nothing to do? 
+ auth::BackendType::Link(_) => return Err(err.into()), + // test backend + auth::BackendType::Test(x) => x.wake_compute(), + }; - match handle_try_wake(wake_res) { - // there was an error communicating with the control plane - Err(e) => return Err(e.into()), - // failed to wake up but we can continue to retry - Ok(ControlFlow::Continue(_)) => { - state = ConnectionState::Invalid(config, err); - let wait_duration = retry_after(num_retries); - num_retries += 1; - - info!(num_retries, "retrying wake compute"); - time::sleep(wait_duration).await; - continue; - } - // successfully woke up a compute node and can break the wakeup loop - Ok(ControlFlow::Break(mut node_info)) => { - node_info.config.reuse_password(&config); - mechanism.update_connect_config(&mut node_info.config); - state = ConnectionState::Cached(node_info) - } - } + match handle_try_wake(wake_res, num_retries)? { + // failed to wake up but we can continue to retry + ControlFlow::Continue(_) => {} + // successfully woke up a compute node and can break the wakeup loop + ControlFlow::Break(mut node_info) => { + node_info.config.reuse_password(&config); + mechanism.update_connect_config(&mut node_info.config); + break node_info; } - ConnectionState::Cached(node_info) => { - match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { - Ok(res) => return Ok(res), - Err(e) => { - error!(error = ?e, "could not connect to compute node"); - if !e.should_retry(num_retries) { - return Err(e.into()); - } + } - // after the first connect failure, - // we should invalidate the cache and wake up a new compute node - if num_retries == 0 { - state = ConnectionState::Invalid(invalidate_cache(node_info), e); - } else { - state = ConnectionState::Cached(node_info); - } + let wait_duration = retry_after(num_retries); + num_retries += 1; - let wait_duration = retry_after(num_retries); - num_retries += 1; + time::sleep(wait_duration).await; + info!(num_retries, "retrying wake compute"); + }; - info!(num_retries, "retrying wake compute"); - time::sleep(wait_duration).await; - } + // now that we have a new node, try connect to it repeatedly. + // this can error for a few reasons, for instance: + // * DNS connection settings haven't quite propagated yet + info!("wake_compute success. attempting to connect"); + loop { + match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await { + Ok(res) => return Ok(res), + Err(e) => { + error!(error = ?e, "could not connect to compute node"); + if !e.should_retry(num_retries) { + return Err(e.into()); } } } + + let wait_duration = retry_after(num_retries); + num_retries += 1; + + time::sleep(wait_duration).await; + info!(num_retries, "retrying connect_once"); } } @@ -478,12 +470,15 @@ where /// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable /// * Returns Ok(Break(node)) if the wakeup succeeded /// * Returns Err(e) if there was an error -fn handle_try_wake( +pub fn handle_try_wake( result: Result, + num_retries: u32, ) -> Result, WakeComputeError> { match result { Err(err) => match &err { - WakeComputeError::ApiError(api) if api.could_retry() => Ok(ControlFlow::Continue(err)), + WakeComputeError::ApiError(api) if api.should_retry(num_retries) => { + Ok(ControlFlow::Continue(err)) + } _ => Err(err), }, // Ready to try again. @@ -491,22 +486,10 @@ fn handle_try_wake( } } -/// Attempts to wake up the compute node. 
-pub async fn try_wake( - api: &impl console::Api, - extra: &console::ConsoleReqExtra<'_>, - creds: &auth::ClientCredentials<'_>, -) -> Result, WakeComputeError> { - info!("compute node's state has likely changed; requesting a wake-up"); - handle_try_wake(api.wake_compute(extra, creds).await) -} - pub trait ShouldRetry { fn could_retry(&self) -> bool; fn should_retry(&self, num_retries: u32) -> bool { match self { - // retry all errors at least once - _ if num_retries == 0 => true, _ if num_retries >= NUM_RETRIES_CONNECT => false, err => err.could_retry(), } @@ -558,14 +541,9 @@ impl ShouldRetry for compute::ConnectionError { } } -pub fn retry_after(num_retries: u32) -> time::Duration { - match num_retries { - 0 => time::Duration::ZERO, - _ => { - // 3/2 = 1.5 which seems to be an ok growth factor heuristic - BASE_RETRY_WAIT_DURATION * 3_u32.pow(num_retries) / 2_u32.pow(num_retries) - } - } +fn retry_after(num_retries: u32) -> time::Duration { + // 1.5 seems to be an ok growth factor heuristic + BASE_RETRY_WAIT_DURATION.mul_f64(1.5_f64.powi(num_retries as i32)) } /// Finish client connection initialization: confirm auth success, send params, etc. diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs index 27e14d442c..5653ec94dc 100644 --- a/proxy/src/proxy/tests.rs +++ b/proxy/src/proxy/tests.rs @@ -302,7 +302,7 @@ async fn scram_auth_mock() -> anyhow::Result<()> { #[test] fn connect_compute_total_wait() { let mut total_wait = tokio::time::Duration::ZERO; - for num_retries in 0..10 { + for num_retries in 1..10 { total_wait += retry_after(num_retries); } assert!(total_wait < tokio::time::Duration::from_secs(12)); From 705ae2dce9b60e6597e8794a23ef9ac731e0cbc3 Mon Sep 17 00:00:00 2001 From: Yinnan Yao <35447132+yaoyinnan@users.noreply.github.com> Date: Mon, 31 Jul 2023 21:40:52 +0800 Subject: [PATCH 32/37] Fix error message for listen_pg_addr_tenant_only binding (#4787) ## Problem Wrong use of `conf.listen_pg_addr` in `error!()`. ## Summary of changes Use `listen_pg_addr_tenant_only` instead of `conf.listen_pg_addr`. 
Signed-off-by: yaoyinnan <35447132+yaoyinnan@users.noreply.github.com> --- safekeeper/src/bin/safekeeper.rs | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index abede2e44d..90eebd392c 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -234,7 +234,10 @@ async fn start_safekeeper(conf: SafeKeeperConf) -> Result<()> { listen_pg_addr_tenant_only ); let listener = tcp_listener::bind(listen_pg_addr_tenant_only.clone()).map_err(|e| { - error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); + error!( + "failed to bind to address {}: {}", + listen_pg_addr_tenant_only, e + ); e })?; Some(listener) From e1424647a0d730000b74b524fd4440918d152676 Mon Sep 17 00:00:00 2001 From: Vadim Kharitonov Date: Mon, 31 Jul 2023 19:23:18 +0200 Subject: [PATCH 33/37] Update pg_embedding to 0.3.1 version (#4811) --- Dockerfile.compute-node | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node index 2c4465fc82..9759faf733 100644 --- a/Dockerfile.compute-node +++ b/Dockerfile.compute-node @@ -551,10 +551,8 @@ FROM build-deps AS pg-embedding-pg-build COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/ ENV PATH "/usr/local/pgsql/bin/:$PATH" -# eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703 made on 15/07/2023 -# There is no release tag yet -RUN wget https://github.com/neondatabase/pg_embedding/archive/eeb3ba7c3a60c95b2604dd543c64b2f1bb4a3703.tar.gz -O pg_embedding.tar.gz && \ - echo "030846df723652f99a8689ce63b66fa0c23477a7fd723533ab8a6b28ab70730f pg_embedding.tar.gz" | sha256sum --check && \ +RUN wget https://github.com/neondatabase/pg_embedding/archive/refs/tags/0.3.1.tar.gz -O pg_embedding.tar.gz && \ + echo "c4ae84eef36fa8ec5868f6e061f39812f19ee5ba3604d428d40935685c7be512 pg_embedding.tar.gz" | sha256sum --check && \ mkdir pg_embedding-src && cd pg_embedding-src && tar xvzf ../pg_embedding.tar.gz --strip-components=1 -C . && \ make -j $(getconf _NPROCESSORS_ONLN) && \ make -j $(getconf _NPROCESSORS_ONLN) install && \ From 39e458f049ebf430f12139fd42284157cf136d20 Mon Sep 17 00:00:00 2001 From: Alexander Bayandin Date: Mon, 31 Jul 2023 18:49:46 +0100 Subject: [PATCH 34/37] test_compatibility: fix pg_tenant_only_port port collision (#4850) ## Problem Compatibility tests fail from time to time due to `pg_tenant_only_port` port collision (added in https://github.com/neondatabase/neon/pull/4731) ## Summary of changes - replace `pg_tenant_only_port` value in config with new port - remove old logic, than we don't need anymore - unify config overrides --- test_runner/regress/test_compatibility.py | 40 +++++++---------------- 1 file changed, 12 insertions(+), 28 deletions(-) diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py index dd2556350f..754d23a01a 100644 --- a/test_runner/regress/test_compatibility.py +++ b/test_runner/regress/test_compatibility.py @@ -257,28 +257,15 @@ def prepare_snapshot( shutil.rmtree(repo_dir / "pgdatadirs") os.mkdir(repo_dir / "endpoints") - # Remove wal-redo temp directory if it exists. Newer pageserver versions don't create - # them anymore, but old versions did. 
- for tenant in (repo_dir / "tenants").glob("*"): - wal_redo_dir = tenant / "wal-redo-datadir.___temp" - if wal_redo_dir.exists() and wal_redo_dir.is_dir(): - shutil.rmtree(wal_redo_dir) - # Update paths and ports in config files pageserver_toml = repo_dir / "pageserver.toml" pageserver_config = toml.load(pageserver_toml) pageserver_config["remote_storage"]["local_path"] = str(repo_dir / "local_fs_remote_storage") - pageserver_config["listen_http_addr"] = port_distributor.replace_with_new_port( - pageserver_config["listen_http_addr"] - ) - pageserver_config["listen_pg_addr"] = port_distributor.replace_with_new_port( - pageserver_config["listen_pg_addr"] - ) + for param in ("listen_http_addr", "listen_pg_addr", "broker_endpoint"): + pageserver_config[param] = port_distributor.replace_with_new_port(pageserver_config[param]) - # Older pageserver versions had just one `auth_type` setting. Now there - # are separate settings for pg and http ports. We don't use authentication - # in compatibility tests so just remove authentication related settings. - pageserver_config.pop("auth_type", None) + # We don't use authentication in compatibility tests + # so just remove authentication related settings. pageserver_config.pop("pg_auth_type", None) pageserver_config.pop("http_auth_type", None) @@ -290,19 +277,16 @@ def prepare_snapshot( snapshot_config_toml = repo_dir / "config" snapshot_config = toml.load(snapshot_config_toml) - - broker_listen_addr = f"127.0.0.1:{port_distributor.get_port()}" - snapshot_config["broker"] = {"listen_addr": broker_listen_addr} - - snapshot_config["pageserver"]["listen_http_addr"] = port_distributor.replace_with_new_port( - snapshot_config["pageserver"]["listen_http_addr"] - ) - snapshot_config["pageserver"]["listen_pg_addr"] = port_distributor.replace_with_new_port( - snapshot_config["pageserver"]["listen_pg_addr"] + for param in ("listen_http_addr", "listen_pg_addr"): + snapshot_config["pageserver"][param] = port_distributor.replace_with_new_port( + snapshot_config["pageserver"][param] + ) + snapshot_config["broker"]["listen_addr"] = port_distributor.replace_with_new_port( + snapshot_config["broker"]["listen_addr"] ) for sk in snapshot_config["safekeepers"]: - sk["http_port"] = port_distributor.replace_with_new_port(sk["http_port"]) - sk["pg_port"] = port_distributor.replace_with_new_port(sk["pg_port"]) + for param in ("http_port", "pg_port", "pg_tenant_only_port"): + sk[param] = port_distributor.replace_with_new_port(sk[param]) if pg_distrib_dir: snapshot_config["pg_distrib_dir"] = str(pg_distrib_dir) From ddbe1704540f3cc50e11dc42a0b1772e8a4f091b Mon Sep 17 00:00:00 2001 From: bojanserafimov Date: Mon, 31 Jul 2023 14:13:32 -0400 Subject: [PATCH 35/37] Prewarm compute nodes (#4828) --- compute_tools/src/bin/compute_ctl.rs | 7 +++++ compute_tools/src/compute.rs | 44 ++++++++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index 4953e5f331..9d15a203e5 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -193,6 +193,13 @@ fn main() -> Result<()> { if !spec_set { // No spec provided, hang waiting for it. info!("no compute spec provided, waiting"); + + // TODO this can stall startups in the unlikely event that we bind + // this compute node while it's busy prewarming. It's not too + // bad because it's just 100ms and unlikely, but it's an + // avoidable problem. 
+ compute.prewarm_postgres()?; + let mut state = compute.state.lock().unwrap(); while state.status != ComputeStatus::ConfigurationPending { state = compute.state_changed.wait(state).unwrap(); diff --git a/compute_tools/src/compute.rs b/compute_tools/src/compute.rs index ceda189364..254d367a71 100644 --- a/compute_tools/src/compute.rs +++ b/compute_tools/src/compute.rs @@ -532,6 +532,50 @@ impl ComputeNode { Ok(()) } + /// Start and stop a postgres process to warm up the VM for startup. + pub fn prewarm_postgres(&self) -> Result<()> { + info!("prewarming"); + + // Create pgdata + let pgdata = &format!("{}.warmup", self.pgdata); + create_pgdata(pgdata)?; + + // Run initdb to completion + info!("running initdb"); + let initdb_bin = Path::new(&self.pgbin).parent().unwrap().join("initdb"); + Command::new(initdb_bin) + .args(["-D", pgdata]) + .output() + .expect("cannot start initdb process"); + + // Write conf + use std::io::Write; + let conf_path = Path::new(pgdata).join("postgresql.conf"); + let mut file = std::fs::File::create(conf_path)?; + writeln!(file, "shared_buffers=65536")?; + writeln!(file, "port=51055")?; // Nobody should be connecting + writeln!(file, "shared_preload_libraries = 'neon'")?; + + // Start postgres + info!("starting postgres"); + let mut pg = Command::new(&self.pgbin) + .args(["-D", pgdata]) + .spawn() + .expect("cannot start postgres process"); + + // Stop it when it's ready + info!("waiting for postgres"); + wait_for_postgres(&mut pg, Path::new(pgdata))?; + pg.kill()?; + info!("sent kill signal"); + pg.wait()?; + info!("done prewarming"); + + // clean up + let _ok = fs::remove_dir_all(pgdata); + Ok(()) + } + /// Start Postgres as a child process and manage DBs/roles. /// After that this will hang waiting on the postmaster process to exit. #[instrument(skip_all)] From 326189d95080d85bb1c635da3511501836839b8d Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Mon, 31 Jul 2023 22:10:19 +0300 Subject: [PATCH 36/37] consumption_metrics: send timeline_written_size_delta (#4822) We want to have timeline_written_size_delta which is defined as difference to the previously sent `timeline_written_size` from the current `timeline_written_size`. Solution is to send it. On the first round `disk_consistent_lsn` is used which is captured during `load` time. After that an incremental "event" is sent on every collection. Incremental "events" are not part of deduplication. I've added some infrastructure to allow somewhat typesafe `EventType::Absolute` and `EventType::Incremental` factories per metrics, now that we have our first `EventType::Incremental` usage. 
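A minimal sketch of the delta bookkeeping described above (the `AbsolutePoint` and `DeltaEvent` shapes below are illustrative stand-ins, not the pageserver's actual event types):

```rust
use chrono::{DateTime, Utc};

// Illustrative stand-ins for the real consumption-metrics types.
struct AbsolutePoint {
    time: DateTime<Utc>,
    written_size: u64, // last_record_lsn, as bytes
}

struct DeltaEvent {
    start_time: DateTime<Utc>,
    stop_time: DateTime<Utc>,
    value: u64,
}

/// `timeline_written_size_delta`: growth since the previously sent point,
/// or since the value captured at load time on the first round.
fn written_size_delta(prev: &AbsolutePoint, now: &AbsolutePoint) -> Option<DeltaEvent> {
    // last_record_lsn normally only moves forward; use checked_sub as a guard anyway.
    now.written_size.checked_sub(prev.written_size).map(|delta| DeltaEvent {
        start_time: prev.time,
        stop_time: now.time,
        value: delta,
    })
}
```

Unlike the absolute metrics, such incremental events are not deduplicated against the cached values, since upstream uses the stream of deltas as a liveness signal.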
--- libs/consumption_metrics/src/lib.rs | 28 ++- pageserver/src/consumption_metrics.rs | 263 ++++++++++++++++++++------ pageserver/src/tenant/timeline.rs | 6 + 3 files changed, 243 insertions(+), 54 deletions(-) diff --git a/libs/consumption_metrics/src/lib.rs b/libs/consumption_metrics/src/lib.rs index 3aac00662d..3a1b396d63 100644 --- a/libs/consumption_metrics/src/lib.rs +++ b/libs/consumption_metrics/src/lib.rs @@ -5,7 +5,7 @@ use chrono::{DateTime, Utc}; use rand::Rng; use serde::Serialize; -#[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] +#[derive(Serialize, Debug, Clone, Copy, Eq, PartialEq, Ord, PartialOrd)] #[serde(tag = "type")] pub enum EventType { #[serde(rename = "absolute")] @@ -17,6 +17,32 @@ pub enum EventType { }, } +impl EventType { + pub fn absolute_time(&self) -> Option<&DateTime> { + use EventType::*; + match self { + Absolute { time } => Some(time), + _ => None, + } + } + + pub fn incremental_timerange(&self) -> Option>> { + // these can most likely be thought of as Range or RangeFull + use EventType::*; + match self { + Incremental { + start_time, + stop_time, + } => Some(start_time..stop_time), + _ => None, + } + } + + pub fn is_incremental(&self) -> bool { + matches!(self, EventType::Incremental { .. }) + } +} + #[derive(Serialize, Debug, Clone, Eq, PartialEq, Ord, PartialOrd)] pub struct Event { #[serde(flatten)] diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs index df44300fce..45b4be470b 100644 --- a/pageserver/src/consumption_metrics.rs +++ b/pageserver/src/consumption_metrics.rs @@ -7,7 +7,7 @@ use crate::context::{DownloadBehavior, RequestContext}; use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}; use crate::tenant::{mgr, LogicalSizeCalculationCause}; use anyhow; -use chrono::Utc; +use chrono::{DateTime, Utc}; use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE}; use pageserver_api::models::TenantState; use reqwest::Url; @@ -18,12 +18,6 @@ use std::time::Duration; use tracing::*; use utils::id::{NodeId, TenantId, TimelineId}; -const WRITTEN_SIZE: &str = "written_size"; -const SYNTHETIC_STORAGE_SIZE: &str = "synthetic_storage_size"; -const RESIDENT_SIZE: &str = "resident_size"; -const REMOTE_STORAGE_SIZE: &str = "remote_storage_size"; -const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size"; - const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60); #[serde_as] @@ -44,6 +38,121 @@ pub struct PageserverConsumptionMetricsKey { pub metric: &'static str, } +impl PageserverConsumptionMetricsKey { + const fn absolute_values(self) -> AbsoluteValueFactory { + AbsoluteValueFactory(self) + } + const fn incremental_values(self) -> IncrementalValueFactory { + IncrementalValueFactory(self) + } +} + +/// Helper type which each individual metric kind can return to produce only absolute values. +struct AbsoluteValueFactory(PageserverConsumptionMetricsKey); + +impl AbsoluteValueFactory { + fn now(self, val: u64) -> (PageserverConsumptionMetricsKey, (EventType, u64)) { + let key = self.0; + let time = Utc::now(); + (key, (EventType::Absolute { time }, val)) + } +} + +/// Helper type which each individual metric kind can return to produce only incremental values. 
+struct IncrementalValueFactory(PageserverConsumptionMetricsKey); + +impl IncrementalValueFactory { + #[allow(clippy::wrong_self_convention)] + fn from_previous_up_to( + self, + prev_end: DateTime, + up_to: DateTime, + val: u64, + ) -> (PageserverConsumptionMetricsKey, (EventType, u64)) { + let key = self.0; + // cannot assert prev_end < up_to because these are realtime clock based + ( + key, + ( + EventType::Incremental { + start_time: prev_end, + stop_time: up_to, + }, + val, + ), + ) + } + + fn key(&self) -> &PageserverConsumptionMetricsKey { + &self.0 + } +} + +// the static part of a PageserverConsumptionMetricsKey +impl PageserverConsumptionMetricsKey { + const fn written_size(tenant_id: TenantId, timeline_id: TimelineId) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: "written_size", + } + .absolute_values() + } + + /// Values will be the difference of the latest written_size (last_record_lsn) to what we + /// previously sent. + const fn written_size_delta( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> IncrementalValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: "written_size_bytes_delta", + } + .incremental_values() + } + + const fn timeline_logical_size( + tenant_id: TenantId, + timeline_id: TimelineId, + ) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: Some(timeline_id), + metric: "timeline_logical_size", + } + .absolute_values() + } + + const fn remote_storage_size(tenant_id: TenantId) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: "remote_storage_size", + } + .absolute_values() + } + + const fn resident_size(tenant_id: TenantId) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: "resident_size", + } + .absolute_values() + } + + const fn synthetic_size(tenant_id: TenantId) -> AbsoluteValueFactory { + PageserverConsumptionMetricsKey { + tenant_id, + timeline_id: None, + metric: "synthetic_storage_size", + } + .absolute_values() + } +} + /// Main thread that serves metrics collection pub async fn collect_metrics( metric_collection_endpoint: &Url, @@ -79,7 +188,7 @@ pub async fn collect_metrics( .timeout(DEFAULT_HTTP_REPORTING_TIMEOUT) .build() .expect("Failed to create http client with timeout"); - let mut cached_metrics: HashMap = HashMap::new(); + let mut cached_metrics = HashMap::new(); let mut prev_iteration_time: std::time::Instant = std::time::Instant::now(); loop { @@ -121,13 +230,13 @@ pub async fn collect_metrics( /// - refactor this function (chunking+sending part) to reuse it in proxy module; pub async fn collect_metrics_iteration( client: &reqwest::Client, - cached_metrics: &mut HashMap, + cached_metrics: &mut HashMap, metric_collection_endpoint: &reqwest::Url, node_id: NodeId, ctx: &RequestContext, send_cached: bool, ) { - let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, u64)> = Vec::new(); + let mut current_metrics: Vec<(PageserverConsumptionMetricsKey, (EventType, u64))> = Vec::new(); trace!( "starting collect_metrics_iteration. 
metric_collection_endpoint: {}", metric_collection_endpoint @@ -166,27 +275,80 @@ pub async fn collect_metrics_iteration( if timeline.is_active() { let timeline_written_size = u64::from(timeline.get_last_record_lsn()); - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: Some(timeline.timeline_id), - metric: WRITTEN_SIZE, + let (key, written_size_now) = + PageserverConsumptionMetricsKey::written_size(tenant_id, timeline.timeline_id) + .now(timeline_written_size); + + // last_record_lsn can only go up, right now at least, TODO: #2592 or related + // features might change this. + + let written_size_delta_key = PageserverConsumptionMetricsKey::written_size_delta( + tenant_id, + timeline.timeline_id, + ); + + // use this when available, because in a stream of incremental values, it will be + // accurate where as when last_record_lsn stops moving, we will only cache the last + // one of those. + let last_stop_time = + cached_metrics + .get(written_size_delta_key.key()) + .map(|(until, _val)| { + until + .incremental_timerange() + .expect("never create EventType::Absolute for written_size_delta") + .end + }); + + // by default, use the last sent written_size as the basis for + // calculating the delta. if we don't yet have one, use the load time value. + let prev = cached_metrics + .get(&key) + .map(|(prev_at, prev)| { + // use the prev time from our last incremental update, or default to latest + // absolute update on the first round. + let prev_at = prev_at + .absolute_time() + .expect("never create EventType::Incremental for written_size"); + let prev_at = last_stop_time.unwrap_or(prev_at); + (*prev_at, *prev) + }) + .unwrap_or_else(|| { + // if we don't have a previous point of comparison, compare to the load time + // lsn. + let (disk_consistent_lsn, loaded_at) = &timeline.loaded_at; + (DateTime::from(*loaded_at), disk_consistent_lsn.0) + }); + + // written_size_delta_bytes + current_metrics.extend( + if let Some(delta) = written_size_now.1.checked_sub(prev.1) { + let up_to = written_size_now + .0 + .absolute_time() + .expect("never create EventType::Incremental for written_size"); + let key_value = + written_size_delta_key.from_previous_up_to(prev.0, *up_to, delta); + Some(key_value) + } else { + None }, - timeline_written_size, - )); + ); + + // written_size + current_metrics.push((key, written_size_now)); let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id); match span.in_scope(|| timeline.get_current_logical_size(ctx)) { // Only send timeline logical size when it is fully calculated. 
Ok((size, is_exact)) if is_exact => { - current_metrics.push(( - PageserverConsumptionMetricsKey { + current_metrics.push( + PageserverConsumptionMetricsKey::timeline_logical_size( tenant_id, - timeline_id: Some(timeline.timeline_id), - metric: TIMELINE_LOGICAL_SIZE, - }, - size, - )); + timeline.timeline_id, + ) + .now(size), + ); } Ok((_, _)) => {} Err(err) => { @@ -205,14 +367,10 @@ pub async fn collect_metrics_iteration( match tenant.get_remote_size().await { Ok(tenant_remote_size) => { - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: None, - metric: REMOTE_STORAGE_SIZE, - }, - tenant_remote_size, - )); + current_metrics.push( + PageserverConsumptionMetricsKey::remote_storage_size(tenant_id) + .now(tenant_remote_size), + ); } Err(err) => { error!( @@ -222,14 +380,9 @@ pub async fn collect_metrics_iteration( } } - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: None, - metric: RESIDENT_SIZE, - }, - tenant_resident_size, - )); + current_metrics.push( + PageserverConsumptionMetricsKey::resident_size(tenant_id).now(tenant_resident_size), + ); // Note that this metric is calculated in a separate bgworker // Here we only use cached value, which may lag behind the real latest one @@ -237,23 +390,27 @@ pub async fn collect_metrics_iteration( if tenant_synthetic_size != 0 { // only send non-zeroes because otherwise these show up as errors in logs - current_metrics.push(( - PageserverConsumptionMetricsKey { - tenant_id, - timeline_id: None, - metric: SYNTHETIC_STORAGE_SIZE, - }, - tenant_synthetic_size, - )); + current_metrics.push( + PageserverConsumptionMetricsKey::synthetic_size(tenant_id) + .now(tenant_synthetic_size), + ); } } // Filter metrics, unless we want to send all metrics, including cached ones. // See: https://github.com/neondatabase/neon/issues/3485 if !send_cached { - current_metrics.retain(|(curr_key, curr_val)| match cached_metrics.get(curr_key) { - Some(val) => val != curr_val, - None => true, + current_metrics.retain(|(curr_key, (kind, curr_val))| { + if kind.is_incremental() { + // incremental values (currently only written_size_delta) should not get any cache + // deduplication because they will be used by upstream for "is still alive." + true + } else { + match cached_metrics.get(curr_key) { + Some((_, val)) => val != curr_val, + None => true, + } + } }); } @@ -272,8 +429,8 @@ pub async fn collect_metrics_iteration( chunk_to_send.clear(); // enrich metrics with type,timestamp and idempotency key before sending - chunk_to_send.extend(chunk.iter().map(|(curr_key, curr_val)| Event { - kind: EventType::Absolute { time: Utc::now() }, + chunk_to_send.extend(chunk.iter().map(|(curr_key, (when, curr_val))| Event { + kind: *when, metric: curr_key.metric, idempotency_key: idempotency_key(node_id.to_string()), value: *curr_val, diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index c7974e9c00..34211fb714 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -294,6 +294,10 @@ pub struct Timeline { /// Completion shared between all timelines loaded during startup; used to delay heavier /// background tasks until some logical sizes have been calculated. initial_logical_size_attempt: Mutex>, + + /// Load or creation time information about the disk_consistent_lsn and when the loading + /// happened. Used for consumption metrics. 
+ pub(crate) loaded_at: (Lsn, SystemTime), } pub struct WalReceiverInfo { @@ -1404,6 +1408,8 @@ impl Timeline { last_freeze_at: AtomicLsn::new(disk_consistent_lsn.0), last_freeze_ts: RwLock::new(Instant::now()), + loaded_at: (disk_consistent_lsn, SystemTime::now()), + ancestor_timeline: ancestor, ancestor_lsn: metadata.ancestor_lsn(), From 7b6c84945688b873801fe4f551840e1424044ba6 Mon Sep 17 00:00:00 2001 From: Alex Chi Z Date: Mon, 31 Jul 2023 19:59:11 -0400 Subject: [PATCH 37/37] support isolation level + read only for http batch sql (#4830) We will retrieve `neon-batch-isolation-level` and `neon-batch-read-only` from the http header, which sets the txn properties. https://github.com/neondatabase/serverless/pull/38#issuecomment-1653130981 --------- Signed-off-by: Alex Chi Z --- proxy/src/http/sql_over_http.rs | 47 ++++++++++++++++++++++++++++--- proxy/src/http/websocket.rs | 11 ++++++-- test_runner/regress/test_proxy.py | 27 +++++++++++++++--- 3 files changed, 75 insertions(+), 10 deletions(-) diff --git a/proxy/src/http/sql_over_http.rs b/proxy/src/http/sql_over_http.rs index 1f83fcfc68..a4eb13a10b 100644 --- a/proxy/src/http/sql_over_http.rs +++ b/proxy/src/http/sql_over_http.rs @@ -1,7 +1,9 @@ use std::sync::Arc; +use anyhow::bail; use futures::pin_mut; use futures::StreamExt; +use hashbrown::HashMap; use hyper::body::HttpBody; use hyper::http::HeaderName; use hyper::http::HeaderValue; @@ -12,6 +14,7 @@ use serde_json::Value; use tokio_postgres::types::Kind; use tokio_postgres::types::Type; use tokio_postgres::GenericClient; +use tokio_postgres::IsolationLevel; use tokio_postgres::Row; use url::Url; @@ -37,6 +40,8 @@ const MAX_REQUEST_SIZE: u64 = 1024 * 1024; // 1 MB static RAW_TEXT_OUTPUT: HeaderName = HeaderName::from_static("neon-raw-text-output"); static ARRAY_MODE: HeaderName = HeaderName::from_static("neon-array-mode"); static ALLOW_POOL: HeaderName = HeaderName::from_static("neon-pool-opt-in"); +static TXN_ISOLATION_LEVEL: HeaderName = HeaderName::from_static("neon-batch-isolation-level"); +static TXN_READ_ONLY: HeaderName = HeaderName::from_static("neon-batch-read-only"); static HEADER_VALUE_TRUE: HeaderValue = HeaderValue::from_static("true"); @@ -170,7 +175,7 @@ pub async fn handle( request: Request, sni_hostname: Option, conn_pool: Arc, -) -> anyhow::Result { +) -> anyhow::Result<(Value, HashMap)> { // // Determine the destination and connection params // @@ -185,6 +190,23 @@ pub async fn handle( // Allow connection pooling only if explicitly requested let allow_pool = headers.get(&ALLOW_POOL) == Some(&HEADER_VALUE_TRUE); + // isolation level and read only + + let txn_isolation_level_raw = headers.get(&TXN_ISOLATION_LEVEL).cloned(); + let txn_isolation_level = match txn_isolation_level_raw { + Some(ref x) => Some(match x.as_bytes() { + b"Serializable" => IsolationLevel::Serializable, + b"ReadUncommitted" => IsolationLevel::ReadUncommitted, + b"ReadCommitted" => IsolationLevel::ReadCommitted, + b"RepeatableRead" => IsolationLevel::RepeatableRead, + _ => bail!("invalid isolation level"), + }), + None => None, + }; + + let txn_read_only_raw = headers.get(&TXN_READ_ONLY).cloned(); + let txn_read_only = txn_read_only_raw.as_ref() == Some(&HEADER_VALUE_TRUE); + let request_content_length = match request.body().size_hint().upper() { Some(v) => v, None => MAX_REQUEST_SIZE + 1, @@ -208,10 +230,19 @@ pub async fn handle( // Now execute the query and return the result // let result = match payload { - Payload::Single(query) => query_to_json(&client, query, raw_output, 
array_mode).await, + Payload::Single(query) => query_to_json(&client, query, raw_output, array_mode) + .await + .map(|x| (x, HashMap::default())), Payload::Batch(queries) => { let mut results = Vec::new(); - let transaction = client.transaction().await?; + let mut builder = client.build_transaction(); + if let Some(isolation_level) = txn_isolation_level { + builder = builder.isolation_level(isolation_level); + } + if txn_read_only { + builder = builder.read_only(true); + } + let transaction = builder.start().await?; for query in queries { let result = query_to_json(&transaction, query, raw_output, array_mode).await; match result { @@ -223,7 +254,15 @@ pub async fn handle( } } transaction.commit().await?; - Ok(json!({ "results": results })) + let mut headers = HashMap::default(); + headers.insert( + TXN_READ_ONLY.clone(), + HeaderValue::try_from(txn_read_only.to_string())?, + ); + if let Some(txn_isolation_level_raw) = txn_isolation_level_raw { + headers.insert(TXN_ISOLATION_LEVEL.clone(), txn_isolation_level_raw); + } + Ok((json!({ "results": results }), headers)) } }; diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs index 4b6e15dc3a..fec76c74f4 100644 --- a/proxy/src/http/websocket.rs +++ b/proxy/src/http/websocket.rs @@ -6,6 +6,7 @@ use crate::{ }; use bytes::{Buf, Bytes}; use futures::{Sink, Stream, StreamExt}; +use hashbrown::HashMap; use hyper::{ server::{ accept, @@ -205,7 +206,7 @@ async fn ws_handler( Ok(_) => StatusCode::OK, Err(_) => StatusCode::BAD_REQUEST, }; - let json = match result { + let (json, headers) = match result { Ok(r) => r, Err(e) => { let message = format!("{:?}", e); @@ -216,7 +217,10 @@ async fn ws_handler( }, None => Value::Null, }; - json!({ "message": message, "code": code }) + ( + json!({ "message": message, "code": code }), + HashMap::default(), + ) } }; json_response(status_code, json).map(|mut r| { @@ -224,6 +228,9 @@ async fn ws_handler( "Access-Control-Allow-Origin", hyper::http::HeaderValue::from_static("*"), ); + for (k, v) in headers { + r.headers_mut().insert(k, v); + } r }) } else if request.uri().path() == "/sql" && request.method() == Method::OPTIONS { diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py index d5bf98109c..35334ec7b2 100644 --- a/test_runner/regress/test_proxy.py +++ b/test_runner/regress/test_proxy.py @@ -265,18 +265,23 @@ def test_sql_over_http_output_options(static_proxy: NeonProxy): def test_sql_over_http_batch(static_proxy: NeonProxy): static_proxy.safe_psql("create role http with login password 'http' superuser") - def qq(queries: List[Tuple[str, Optional[List[Any]]]]) -> Any: + def qq(queries: List[Tuple[str, Optional[List[Any]]]], read_only: bool = False) -> Any: connstr = f"postgresql://http:http@{static_proxy.domain}:{static_proxy.proxy_port}/postgres" response = requests.post( f"https://{static_proxy.domain}:{static_proxy.external_http_port}/sql", data=json.dumps(list(map(lambda x: {"query": x[0], "params": x[1] or []}, queries))), - headers={"Content-Type": "application/sql", "Neon-Connection-String": connstr}, + headers={ + "Content-Type": "application/sql", + "Neon-Connection-String": connstr, + "Neon-Batch-Isolation-Level": "Serializable", + "Neon-Batch-Read-Only": "true" if read_only else "false", + }, verify=str(static_proxy.test_output_dir / "proxy.crt"), ) assert response.status_code == 200 - return response.json()["results"] + return response.json()["results"], response.headers - result = qq( + result, headers = qq( [ ("select 42 as answer", None), ("select $1 
as answer", [42]), @@ -291,6 +296,9 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): ] ) + assert headers["Neon-Batch-Isolation-Level"] == "Serializable" + assert headers["Neon-Batch-Read-Only"] == "false" + assert result[0]["rows"] == [{"answer": 42}] assert result[1]["rows"] == [{"answer": "42"}] assert result[2]["rows"] == [{"answer": 42}] @@ -311,3 +319,14 @@ def test_sql_over_http_batch(static_proxy: NeonProxy): assert res["command"] == "DROP" assert res["rowCount"] is None assert len(result) == 10 + + result, headers = qq( + [ + ("select 42 as answer", None), + ], + True, + ) + assert headers["Neon-Batch-Isolation-Level"] == "Serializable" + assert headers["Neon-Batch-Read-Only"] == "true" + + assert result[0]["rows"] == [{"answer": 42}]
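For reference, a hypothetical Rust client using `reqwest` that exercises the new batch headers (an assumption for illustration only; the regression test above does the same via Python `requests`, and the endpoint, connection string, and payload below are placeholders, not values from this patch):

```rust
use reqwest::header::{HeaderMap, HeaderValue, CONTENT_TYPE};

#[tokio::main]
async fn main() -> Result<(), reqwest::Error> {
    let mut headers = HeaderMap::new();
    headers.insert(CONTENT_TYPE, HeaderValue::from_static("application/sql"));
    headers.insert(
        "Neon-Connection-String",
        HeaderValue::from_static("postgresql://user:password@example.host/postgres"),
    );
    // Run the whole batch as a serializable, read-only transaction.
    headers.insert("Neon-Batch-Isolation-Level", HeaderValue::from_static("Serializable"));
    headers.insert("Neon-Batch-Read-Only", HeaderValue::from_static("true"));

    let body = r#"[{"query": "select 42 as answer", "params": []}]"#;
    let resp = reqwest::Client::new()
        .post("https://example.host/sql")
        .headers(headers)
        .body(body)
        .send()
        .await?;
    println!("{}", resp.text().await?);
    Ok(())
}
```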