From 66b06e416a72971e87480e1d4dd90e9b217b352d Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 9 May 2023 17:08:02 +0300
Subject: [PATCH 01/23] Pass tracing context in env variables instead of the
 spec file. (#4174)

If compute_ctl is launched without a spec file, it fetches it from the
control plane with an HTTP request. We cannot get the startup tracing
context from the compute spec in that case, because we don't have it
available on start. We could still read the tracing context from the
compute spec after we have fetched it, but that would leave the fetch
itself out of the context. Pass the tracing context in environment
variables instead.
---
 compute_tools/src/bin/compute_ctl.rs | 74 ++++++++++++++++++----------
 libs/compute_api/src/spec.rs         |  3 --
 2 files changed, 49 insertions(+), 28 deletions(-)
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 36dbc382b5..2f515c9bf1 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -30,6 +30,7 @@
 //!             -b /usr/local/bin/postgres
 //! ```
 //!
+use std::collections::HashMap;
 use std::fs::File;
 use std::panic;
 use std::path::Path;
@@ -67,6 +68,54 @@ fn main() -> Result<()> {
     let spec_json = matches.get_one::<String>("spec");
     let spec_path = matches.get_one::<String>("spec-path");
 
+    // Extract OpenTelemetry context for the startup actions from the
+    // TRACEPARENT and TRACESTATE env variables, and attach it to the current
+    // tracing context.
+    //
+    // This is used to propagate the context for the 'start_compute' operation
+    // from the neon control plane. This allows linking together the wider
+    // 'start_compute' operation that creates the compute container, with the
+    // startup actions here within the container.
+    //
+    // There is no standard for passing context in env variables, but a lot of
+    // tools use TRACEPARENT/TRACESTATE, so we use that convention too. See
+    // https://github.com/open-telemetry/opentelemetry-specification/issues/740
+    //
+    // Switch to the startup context here, and exit it once the startup has
+    // completed and Postgres is up and running.
+    //
+    // If this pod is pre-created without binding it to any particular endpoint
+    // yet, this isn't the right place to enter the startup context. In that
+    // case, the control plane should pass the tracing context as part of the
+    // /configure API call.
+    //
+    // NOTE: This is supposed to only cover the *startup* actions. Once
+    // postgres is configured and up-and-running, we exit this span. Any other
+    // actions that are performed on incoming HTTP requests, for example, are
+    // performed in separate spans.
+    //
+    // XXX: If the pod is restarted, we perform the startup actions in the same
+    // context as the original startup actions, which probably doesn't make
+    // sense.
+    let mut startup_tracing_carrier: HashMap<String, String> = HashMap::new();
+    if let Ok(val) = std::env::var("TRACEPARENT") {
+        startup_tracing_carrier.insert("traceparent".to_string(), val);
+    }
+    if let Ok(val) = std::env::var("TRACESTATE") {
+        startup_tracing_carrier.insert("tracestate".to_string(), val);
+    }
+    let startup_context_guard = if !startup_tracing_carrier.is_empty() {
+        use opentelemetry::propagation::TextMapPropagator;
+        use opentelemetry::sdk::propagation::TraceContextPropagator;
+        let guard = TraceContextPropagator::new()
+            .extract(&startup_tracing_carrier)
+            .attach();
+        info!("startup tracing context attached");
+        Some(guard)
+    } else {
+        None
+    };
+
     let compute_id = matches.get_one::<String>("compute-id");
     let control_plane_uri = matches.get_one::<String>("control-plane-uri");
 
@@ -148,8 +197,6 @@ fn main() -> Result<()> {
 
     // We got all we need, update the state.
     let mut state = compute.state.lock().unwrap();
-    let pspec = state.pspec.as_ref().expect("spec must be set");
-    let startup_tracing_context = pspec.spec.startup_tracing_context.clone();
 
     // Record for how long we slept waiting for the spec.
     state.metrics.wait_for_spec_ms = Utc::now()
@@ -165,29 +212,6 @@ fn main() -> Result<()> {
     compute.state_changed.notify_all();
     drop(state);
 
-    // Extract OpenTelemetry context for the startup actions from the spec, and
-    // attach it to the current tracing context.
-    //
-    // This is used to propagate the context for the 'start_compute' operation
-    // from the neon control plane. This allows linking together the wider
-    // 'start_compute' operation that creates the compute container, with the
-    // startup actions here within the container.
-    //
-    // Switch to the startup context here, and exit it once the startup has
-    // completed and Postgres is up and running.
-    //
-    // NOTE: This is supposed to only cover the *startup* actions. Once
-    // postgres is configured and up-and-running, we exit this span. Any other
-    // actions that are performed on incoming HTTP requests, for example, are
-    // performed in separate spans.
-    let startup_context_guard = if let Some(ref carrier) = startup_tracing_context {
-        use opentelemetry::propagation::TextMapPropagator;
-        use opentelemetry::sdk::propagation::TraceContextPropagator;
-        Some(TraceContextPropagator::new().extract(carrier).attach())
-    } else {
-        None
-    };
-
     // Launch remaining service threads
     let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
     let _configurator_handle =
diff --git a/libs/compute_api/src/spec.rs b/libs/compute_api/src/spec.rs
index 87c0edd687..6072980ed8 100644
--- a/libs/compute_api/src/spec.rs
+++ b/libs/compute_api/src/spec.rs
@@ -5,7 +5,6 @@
 //! and connect it to the storage nodes.
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
-use std::collections::HashMap;
 use utils::lsn::Lsn;
 
 /// String type alias representing Postgres identifier and
@@ -31,8 +30,6 @@ pub struct ComputeSpec {
     pub mode: ComputeMode,
 
     pub storage_auth_token: Option<String>,
-
-    pub startup_tracing_context: Option<HashMap<String, String>>,
 }
 
 #[serde_as]

From 3ec52088dd53a762f86c2469063e2c0724802513 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Tue, 9 May 2023 18:16:22 +0200
Subject: [PATCH 02/23] eviction_task: tracing::instrument the imitate-access
 calls (#4180)

Currently, if we unexpectly download from the eviction task, the log
lines look like what we have in
https://github.com/neondatabase/neon/issues/4154

```
2023-05-04T14:42:57.586772Z  WARN eviction_task{tenant_id=$TENANT timeline_id=$TIMELINE}:eviction_iteration{policy_kind="LayerAccessThreshold"}: unexpectedly on-demand downloading remote layer remote $TIMELINE/000000067F000032AC0000400C00FFFFFFFF-000000067F000032AC000040140000000008__0000000001696070-0000000003DC76E9 for task kind Eviction
```

We know these are caused by the imitate accesses.
But we don't know which one (my bet is on update_gc_info).

I didn't want to pollute the other tasks' logs with the additional
spans, so, using `.instrument()` when we call non-eviction-task code.

refs https://github.com/neondatabase/neon/issues/4154
---
 .../src/tenant/timeline/eviction_task.rs      | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 58321762ea..523a5f8fa7 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -22,7 +22,7 @@ use std::{
 
 use tokio::time::Instant;
 use tokio_util::sync::CancellationToken;
-use tracing::{debug, error, info, instrument, warn};
+use tracing::{debug, error, info, info_span, instrument, warn, Instrument};
 
 use crate::{
     context::{DownloadBehavior, RequestContext},
@@ -276,6 +276,7 @@ impl Timeline {
         ControlFlow::Continue(())
     }
 
+    #[instrument(skip_all)]
     async fn imitate_layer_accesses(
         &self,
         p: &EvictionPolicyLayerAccessThreshold,
@@ -324,6 +325,7 @@ impl Timeline {
     }
 
     /// Recompute the values which would cause on-demand downloads during restart.
+    #[instrument(skip_all)]
     async fn imitate_timeline_cached_layer_accesses(
         &self,
         cancel: &CancellationToken,
@@ -332,7 +334,10 @@ impl Timeline {
         let lsn = self.get_last_record_lsn();
 
         // imitiate on-restart initial logical size
-        let size = self.calculate_logical_size(lsn, cancel.clone(), ctx).await;
+        let size = self
+            .calculate_logical_size(lsn, cancel.clone(), ctx)
+            .instrument(info_span!("calculate_logical_size"))
+            .await;
 
         match &size {
             Ok(_size) => {
@@ -347,7 +352,11 @@ impl Timeline {
         }
 
         // imitiate repartiting on first compactation
-        if let Err(e) = self.collect_keyspace(lsn, ctx).await {
+        if let Err(e) = self
+            .collect_keyspace(lsn, ctx)
+            .instrument(info_span!("collect_keyspace"))
+            .await
+        {
             // if this failed, we probably failed logical size because these use the same keys
             if size.is_err() {
                 // ignore, see above comment
@@ -360,6 +369,7 @@ impl Timeline {
     }
 
     // Imitate the synthetic size calculation done by the consumption_metrics module.
+    #[instrument(skip_all)]
     async fn imitate_synthetic_size_calculation_worker(
         &self,
         tenant: &Arc<Tenant>,
@@ -398,7 +408,8 @@ impl Timeline {
 
         let mut throwaway_cache = HashMap::new();
         let gather =
-            crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx);
+            crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx)
+                .instrument(info_span!("gather_inputs"));
 
         tokio::select! {
             _ = cancel.cancelled() => {}

From eb3a8be9334533a57b5b337e3c040d6bb1ae57b4 Mon Sep 17 00:00:00 2001
From: Dmitry Rodionov <dmitry@neon.tech>
Date: Wed, 10 May 2023 11:27:12 +0300
Subject: [PATCH 03/23] keep track of timeline deletion status in IndexPart to
 prevent timeline resurrection (#3919)

Before this patch, the following sequence would lead to the resurrection of a deleted timeline:

- create timeline
- wait for its index part to reach s3
- delete timeline
- wait an arbitrary amount of time, including 0 seconds
- detach tenant
- attach tenant
- the timeline is there and Active again

This happens because we only kept track of the deletion in the tenant dir (by deleting the timeline dir) but not in S3.

The solution is to turn the deleted timeline's IndexPart into a tombstone.
The deletion status of the timeline is expressed in the `deleted_at: Option<NativeDateTime>` field of IndexPart.
It's `None` while the timeline is alive and `Some(deletion time stamp)` if it is deleted.

We change the timeline deletion handler to upload this tombstoned IndexPart.
The handler does not return success if the upload fails.

Coincidentally, this fixes the long-stanging TODO about the `std::fs::remove_dir_all` being not atomic.
It need not be atomic anymore because we set the `deleted_at=Some()` before starting the `remove_dir_all`.

The tombstone is in the IndexPart only, not in the `metadata`.
So, we only have the tombstone and the `remove_dir_all` benefits mentioned above if remote storage is configured.
This was a conscious trade-off because there's no good format evolution story for the current metadata file format.

The introduction of this additional step into `delete_timeline` was painful because delete_timeline needs to be
1. cancel-safe
2. idempotent
3. safe to call concurrently
These are mostly self-inflicted limitations that can be avoided by using request-coalescing.
PR https://github.com/neondatabase/neon/pull/4159 will do that.

fixes https://github.com/neondatabase/neon/issues/3560

refs https://github.com/neondatabase/neon/issues/3889 (part of tenant relocation)


Co-authored-by: Joonas Koivunen <joonas@neon.tech>
Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 libs/remote_storage/src/local_fs.rs           |   9 +
 pageserver/src/tenant.rs                      | 169 ++++++--
 pageserver/src/tenant/metadata.rs             |   3 +-
 .../src/tenant/remote_timeline_client.rs      | 210 +++++++++-
 .../tenant/remote_timeline_client/index.rs    |  11 +-
 .../tenant/remote_timeline_client/upload.rs   |   4 +
 pageserver/src/tenant/upload_queue.rs         |  19 +-
 test_runner/fixtures/pageserver/http.py       |   4 +-
 test_runner/fixtures/pageserver/utils.py      |   4 +-
 test_runner/regress/test_import.py            |   2 +
 test_runner/regress/test_timeline_delete.py   | 385 +++++++++++++++++-
 11 files changed, 746 insertions(+), 74 deletions(-)

diff --git a/libs/remote_storage/src/local_fs.rs b/libs/remote_storage/src/local_fs.rs
index d7b46731cd..c081a6d361 100644
--- a/libs/remote_storage/src/local_fs.rs
+++ b/libs/remote_storage/src/local_fs.rs
@@ -128,6 +128,15 @@ impl RemoteStorage for LocalFs {
         // We need this dance with sort of durable rename (without fsyncs)
         // to prevent partial uploads. This was really hit when pageserver shutdown
         // cancelled the upload and partial file was left on the fs
+        // NOTE: Because temp file suffix always the same this operation is racy.
+        // Two concurrent operations can lead to the following sequence:
+        // T1: write(temp)
+        // T2: write(temp) -> overwrites the content
+        // T1: rename(temp, dst) -> succeeds
+        // T2: rename(temp, dst) -> fails, temp no longet exists
+        // This can be solved by supplying unique temp suffix every time, but this situation
+        // is not normal in the first place, the error can help (and helped at least once)
+        // to discover bugs in upper level synchronization.
         let temp_file_path =
             path_with_suffix_extension(&target_file_path, LOCAL_FS_TEMP_FILE_SUFFIX);
         let mut destination = io::BufWriter::new(
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index d69d5e4b45..1c6006493e 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -58,6 +58,8 @@ use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::metadata::load_metadata;
 use crate::tenant::remote_timeline_client::index::IndexPart;
+use crate::tenant::remote_timeline_client::MaybeDeletedIndexPart;
+use crate::tenant::remote_timeline_client::PersistIndexPartWithDeletedFlagError;
 use crate::tenant::storage_layer::DeltaLayer;
 use crate::tenant::storage_layer::ImageLayer;
 use crate::tenant::storage_layer::Layer;
@@ -698,16 +700,9 @@ impl Tenant {
                         .await
                         .context("download index file")?;
 
-                    let remote_metadata = index_part.parse_metadata().context("parse metadata")?;
-
                     debug!("finished index part download");
 
-                    Result::<_, anyhow::Error>::Ok((
-                        timeline_id,
-                        client,
-                        index_part,
-                        remote_metadata,
-                    ))
+                    Result::<_, anyhow::Error>::Ok((timeline_id, client, index_part))
                 }
                 .map(move |res| {
                     res.with_context(|| format!("download index part for timeline {timeline_id}"))
@@ -716,17 +711,26 @@ impl Tenant {
             );
         }
         // Wait for all the download tasks to complete & collect results.
-        let mut remote_clients = HashMap::new();
-        let mut index_parts = HashMap::new();
+        let mut remote_index_and_client = HashMap::new();
         let mut timeline_ancestors = HashMap::new();
         while let Some(result) = part_downloads.join_next().await {
             // NB: we already added timeline_id as context to the error
             let result: Result<_, anyhow::Error> = result.context("joinset task join")?;
-            let (timeline_id, client, index_part, remote_metadata) = result?;
+            let (timeline_id, client, index_part) = result?;
             debug!("successfully downloaded index part for timeline {timeline_id}");
-            timeline_ancestors.insert(timeline_id, remote_metadata);
-            index_parts.insert(timeline_id, index_part);
-            remote_clients.insert(timeline_id, client);
+            match index_part {
+                MaybeDeletedIndexPart::IndexPart(index_part) => {
+                    timeline_ancestors.insert(
+                        timeline_id,
+                        index_part.parse_metadata().context("parse_metadata")?,
+                    );
+                    remote_index_and_client.insert(timeline_id, (index_part, client));
+                }
+                MaybeDeletedIndexPart::Deleted => {
+                    info!("timeline {} is deleted, skipping", timeline_id);
+                    continue;
+                }
+            }
         }
 
         // For every timeline, download the metadata file, scan the local directory,
@@ -734,12 +738,16 @@ impl Tenant {
         // layer file.
         let sorted_timelines = tree_sort_timelines(timeline_ancestors)?;
         for (timeline_id, remote_metadata) in sorted_timelines {
+            let (index_part, remote_client) = remote_index_and_client
+                .remove(&timeline_id)
+                .expect("just put it in above");
+
             // TODO again handle early failure
             self.load_remote_timeline(
                 timeline_id,
-                index_parts.remove(&timeline_id).unwrap(),
+                index_part,
                 remote_metadata,
-                remote_clients.remove(&timeline_id).unwrap(),
+                remote_client,
                 &ctx,
             )
             .await
@@ -1045,21 +1053,13 @@ impl Tenant {
     /// Subroutine of `load_tenant`, to load an individual timeline
     ///
     /// NB: The parent is assumed to be already loaded!
-    #[instrument(skip(self, local_metadata, ctx), fields(timeline_id=%timeline_id))]
+    #[instrument(skip_all, fields(timeline_id))]
     async fn load_local_timeline(
         &self,
         timeline_id: TimelineId,
         local_metadata: TimelineMetadata,
         ctx: &RequestContext,
     ) -> anyhow::Result<()> {
-        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
-            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
-            .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
-            Some(ancestor_timeline)
-        } else {
-            None
-        };
-
         let remote_client = self.remote_storage.as_ref().map(|remote_storage| {
             RemoteTimelineClient::new(
                 remote_storage.clone(),
@@ -1072,6 +1072,29 @@ impl Tenant {
         let remote_startup_data = match &remote_client {
             Some(remote_client) => match remote_client.download_index_file().await {
                 Ok(index_part) => {
+                    let index_part = match index_part {
+                        MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+                        MaybeDeletedIndexPart::Deleted => {
+                            // TODO: we won't reach here if remote storage gets de-configured after start of the deletion operation.
+                            // Example:
+                            //  start deletion operation
+                            //  finishes upload of index part
+                            //  pageserver crashes
+                            //  remote storage gets de-configured
+                            //  pageserver starts
+                            //
+                            // We don't really anticipate remote storage to be de-configured, so, for now, this is fine.
+                            // Also, maybe we'll remove that option entirely in the future, see https://github.com/neondatabase/neon/issues/4099.
+                            info!("is_deleted is set on remote, resuming removal of local data originally done by timeline deletion handler");
+                            std::fs::remove_dir_all(
+                                self.conf.timeline_path(&timeline_id, &self.tenant_id),
+                            )
+                            .context("remove_dir_all")?;
+
+                            return Ok(());
+                        }
+                    };
+
                     let remote_metadata = index_part.parse_metadata().context("parse_metadata")?;
                     Some(RemoteStartupData {
                         index_part,
@@ -1087,6 +1110,14 @@ impl Tenant {
             None => None,
         };
 
+        let ancestor = if let Some(ancestor_timeline_id) = local_metadata.ancestor_timeline() {
+            let ancestor_timeline = self.get_timeline(ancestor_timeline_id, false)
+            .with_context(|| anyhow::anyhow!("cannot find ancestor timeline {ancestor_timeline_id} for timeline {timeline_id}"))?;
+            Some(ancestor_timeline)
+        } else {
+            None
+        };
+
         self.timeline_init_and_sync(
             timeline_id,
             remote_client,
@@ -1334,6 +1365,8 @@ impl Tenant {
         timeline_id: TimelineId,
         _ctx: &RequestContext,
     ) -> Result<(), DeleteTimelineError> {
+        timeline::debug_assert_current_span_has_tenant_and_timeline_id();
+
         // Transition the timeline into TimelineState::Stopping.
         // This should prevent new operations from starting.
         let timeline = {
@@ -1374,9 +1407,44 @@ impl Tenant {
         timeline.walreceiver.stop().await;
         debug!("wal receiver shutdown confirmed");
 
+        // Prevent new uploads from starting.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            let res = remote_client.stop();
+            match res {
+                Ok(()) => {}
+                Err(e) => match e {
+                    remote_timeline_client::StopError::QueueUninitialized => {
+                        // This case shouldn't happen currently because the
+                        // load and attach code bails out if _any_ of the timeline fails to fetch its IndexPart.
+                        // That is, before we declare the Tenant as Active.
+                        // But we only allow calls to delete_timeline on Active tenants.
+                        return Err(DeleteTimelineError::Other(anyhow::anyhow!("upload queue is uninitialized, likely the timeline was in Broken state prior to this call because it failed to fetch IndexPart during load or attach, check the logs")));
+                    }
+                },
+            }
+        }
+
+        // Stop & wait for the remaining timeline tasks, including upload tasks.
+        // NB: This and other delete_timeline calls do not run as a task_mgr task,
+        //     so, they are not affected by this shutdown_tasks() call.
         info!("waiting for timeline tasks to shutdown");
         task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id)).await;
 
+        // Mark timeline as deleted in S3 so we won't pick it up next time
+        // during attach or pageserver restart.
+        // See comment in persist_index_part_with_deleted_flag.
+        if let Some(remote_client) = timeline.remote_client.as_ref() {
+            match remote_client.persist_index_part_with_deleted_flag().await {
+                // If we (now, or already) marked it successfully as deleted, we can proceed
+                Ok(()) | Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(_)) => (),
+                // Bail out otherwise
+                Err(e @ PersistIndexPartWithDeletedFlagError::AlreadyInProgress(_))
+                | Err(e @ PersistIndexPartWithDeletedFlagError::Other(_)) => {
+                    return Err(DeleteTimelineError::Other(anyhow::anyhow!(e)));
+                }
+            }
+        }
+
         {
             // Grab the layer_removal_cs lock, and actually perform the deletion.
             //
@@ -1400,19 +1468,54 @@ impl Tenant {
             //     by the caller.
 
             let local_timeline_directory = self.conf.timeline_path(&timeline_id, &self.tenant_id);
-            // XXX make this atomic so that, if we crash-mid-way, the timeline won't be picked up
-            // with some layers missing.
-            std::fs::remove_dir_all(&local_timeline_directory).with_context(|| {
-                format!(
-                    "Failed to remove local timeline directory '{}'",
-                    local_timeline_directory.display()
-                )
-            })?;
+
+            fail::fail_point!("timeline-delete-before-rm", |_| {
+                Err(anyhow::anyhow!("failpoint: timeline-delete-before-rm"))?
+            });
+
+            // NB: This need not be atomic because the deleted flag in the IndexPart
+            // will be observed during tenant/timeline load. The deletion will be resumed there.
+            //
+            // For configurations without remote storage, we tolerate that we're not crash-safe here.
+            // The timeline may come up Active but with missing layer files, in such setups.
+            // See https://github.com/neondatabase/neon/pull/3919#issuecomment-1531726720
+            match std::fs::remove_dir_all(&local_timeline_directory) {
+                Err(e) if e.kind() == std::io::ErrorKind::NotFound => {
+                    // This can happen if we're called a second time, e.g.,
+                    // because of a previous failure/cancellation at/after
+                    // failpoint timeline-delete-after-rm.
+                    //
+                    // It can also happen if we race with tenant detach, because,
+                    // it doesn't grab the layer_removal_cs lock.
+                    //
+                    // For now, log and continue.
+                    // warn! level is technically not appropriate for the
+                    // first case because we should expect retries to happen.
+                    // But the error is so rare, it seems better to get attention if it happens.
+                    let tenant_state = self.current_state();
+                    warn!(
+                        timeline_dir=?local_timeline_directory,
+                        ?tenant_state,
+                        "timeline directory not found, proceeding anyway"
+                    );
+                    // continue with the rest of the deletion
+                }
+                res => res.with_context(|| {
+                    format!(
+                        "Failed to remove local timeline directory '{}'",
+                        local_timeline_directory.display()
+                    )
+                })?,
+            }
 
             info!("finished deleting layer files, releasing layer_removal_cs.lock()");
             drop(layer_removal_guard);
         }
 
+        fail::fail_point!("timeline-delete-after-rm", |_| {
+            Err(anyhow::anyhow!("failpoint: timeline-delete-after-rm"))?
+        });
+
         // Remove the timeline from the map.
         let mut timelines = self.timelines.lock().unwrap();
         let children_exist = timelines
diff --git a/pageserver/src/tenant/metadata.rs b/pageserver/src/tenant/metadata.rs
index 297cccbe30..1ea61fa26b 100644
--- a/pageserver/src/tenant/metadata.rs
+++ b/pageserver/src/tenant/metadata.rs
@@ -12,6 +12,7 @@ use std::io::Write;
 use anyhow::{bail, ensure, Context};
 use serde::{Deserialize, Serialize};
 use tracing::info_span;
+use utils::bin_ser::SerializeError;
 use utils::{
     bin_ser::BeSer,
     id::{TenantId, TimelineId},
@@ -182,7 +183,7 @@ impl TimelineMetadata {
         }
     }
 
-    pub fn to_bytes(&self) -> anyhow::Result<Vec<u8>> {
+    pub fn to_bytes(&self) -> Result<Vec<u8>, SerializeError> {
         let body_bytes = self.body.ser()?;
         let metadata_size = METADATA_HDR_SIZE + body_bytes.len();
         let hdr = TimelineMetadataHeader {
diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index c42824a8b5..96aabd7945 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -204,8 +204,11 @@ mod download;
 pub mod index;
 mod upload;
 
+use anyhow::Context;
+use chrono::{NaiveDateTime, Utc};
 // re-export these
 pub use download::{is_temp_download_file, list_remote_timelines};
+use scopeguard::ScopeGuard;
 
 use std::sync::atomic::{AtomicU32, Ordering};
 use std::sync::{Arc, Mutex};
@@ -213,7 +216,7 @@ use std::sync::{Arc, Mutex};
 use remote_storage::{DownloadError, GenericRemoteStorage};
 use std::ops::DerefMut;
 use tokio::runtime::Runtime;
-use tracing::{debug, info, warn};
+use tracing::{debug, error, info, warn};
 use tracing::{info_span, Instrument};
 use utils::lsn::Lsn;
 
@@ -240,6 +243,7 @@ use utils::id::{TenantId, TimelineId};
 use self::index::IndexPart;
 
 use super::storage_layer::LayerFileName;
+use super::upload_queue::SetDeletedFlagProgress;
 
 // Occasional network issues and such can cause remote operations to fail, and
 // that's expected. If a download fails, we log it at info-level, and retry.
@@ -253,6 +257,30 @@ const FAILED_DOWNLOAD_RETRIES: u32 = 10;
 // retries. Uploads and deletions are retried forever, though.
 const FAILED_UPLOAD_WARN_THRESHOLD: u32 = 3;
 
+pub enum MaybeDeletedIndexPart {
+    IndexPart(IndexPart),
+    Deleted,
+}
+
+/// Errors that can arise when calling [`RemoteTimelineClient::stop`].
+#[derive(Debug, thiserror::Error)]
+pub enum StopError {
+    /// Returned if the upload queue was never initialized.
+    /// See [`RemoteTimelineClient::init_upload_queue`] and [`RemoteTimelineClient::init_upload_queue_for_empty_remote`].
+    #[error("queue is not initialized")]
+    QueueUninitialized,
+}
+
+#[derive(Debug, thiserror::Error)]
+pub enum PersistIndexPartWithDeletedFlagError {
+    #[error("another task is already setting the deleted_flag, started at {0:?}")]
+    AlreadyInProgress(NaiveDateTime),
+    #[error("the deleted_flag was already set, value is {0:?}")]
+    AlreadyDeleted(NaiveDateTime),
+    #[error(transparent)]
+    Other(#[from] anyhow::Error),
+}
+
 /// A client for accessing a timeline's data in remote storage.
 ///
 /// This takes care of managing the number of connections, and balancing them
@@ -367,7 +395,7 @@ impl RemoteTimelineClient {
     //
 
     /// Download index file
-    pub async fn download_index_file(&self) -> Result<IndexPart, DownloadError> {
+    pub async fn download_index_file(&self) -> Result<MaybeDeletedIndexPart, DownloadError> {
         let _unfinished_gauge_guard = self.metrics.call_begin(
             &RemoteOpFileKind::Index,
             &RemoteOpKind::Download,
@@ -376,7 +404,7 @@ impl RemoteTimelineClient {
             },
         );
 
-        download::download_index_part(
+        let index_part = download::download_index_part(
             self.conf,
             &self.storage_impl,
             self.tenant_id,
@@ -389,7 +417,13 @@ impl RemoteTimelineClient {
             RemoteOpKind::Download,
             Arc::clone(&self.metrics),
         )
-        .await
+        .await?;
+
+        if index_part.deleted_at.is_some() {
+            Ok(MaybeDeletedIndexPart::Deleted)
+        } else {
+            Ok(MaybeDeletedIndexPart::IndexPart(index_part))
+        }
     }
 
     /// Download a (layer) file from `path`, into local filesystem.
@@ -624,6 +658,116 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
+    /// Set the deleted_at field in the remote index file.
+    ///
+    /// This fails if the upload queue has not been `stop()`ed.
+    ///
+    /// The caller is responsible for calling `stop()` AND for waiting
+    /// for any ongoing upload tasks to finish after `stop()` has succeeded.
+    /// Check method [`RemoteTimelineClient::stop`] for details.
+    pub(crate) async fn persist_index_part_with_deleted_flag(
+        self: &Arc<Self>,
+    ) -> Result<(), PersistIndexPartWithDeletedFlagError> {
+        let index_part_with_deleted_at = {
+            let mut locked = self.upload_queue.lock().unwrap();
+
+            // We must be in stopped state because otherwise
+            // we can have inprogress index part upload that can overwrite the file
+            // with missing is_deleted flag that we going to set below
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized => {
+                    return Err(anyhow::anyhow!("is not Stopped but Uninitialized").into())
+                }
+                UploadQueue::Initialized(_) => {
+                    return Err(anyhow::anyhow!("is not Stopped but Initialized").into())
+                }
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+
+            match stopped.deleted_at {
+                SetDeletedFlagProgress::NotRunning => (), // proceed
+                SetDeletedFlagProgress::InProgress(at) => {
+                    return Err(PersistIndexPartWithDeletedFlagError::AlreadyInProgress(at));
+                }
+                SetDeletedFlagProgress::Successful(at) => {
+                    return Err(PersistIndexPartWithDeletedFlagError::AlreadyDeleted(at));
+                }
+            };
+            let deleted_at = Utc::now().naive_utc();
+            stopped.deleted_at = SetDeletedFlagProgress::InProgress(deleted_at);
+
+            let mut index_part = IndexPart::new(
+                stopped.latest_files.clone(),
+                stopped.last_uploaded_consistent_lsn,
+                stopped
+                    .latest_metadata
+                    .to_bytes()
+                    .context("serialize metadata")?,
+            );
+            index_part.deleted_at = Some(deleted_at);
+            index_part
+        };
+
+        let undo_deleted_at = scopeguard::guard(Arc::clone(self), |self_clone| {
+            let mut locked = self_clone.upload_queue.lock().unwrap();
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                    locked.as_str(),
+                ),
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+            stopped.deleted_at = SetDeletedFlagProgress::NotRunning;
+        });
+
+        // Have a failpoint that can use the `pause` failpoint action.
+        // We don't want to block the executor thread, hence, spawn_blocking + await.
+        #[cfg(feature = "testing")]
+        tokio::task::spawn_blocking({
+            let current = tracing::Span::current();
+            move || {
+                let _entered = current.entered();
+                tracing::info!(
+                    "at failpoint persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+                fail::fail_point!(
+                    "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+                );
+            }
+        })
+        .await
+        .expect("spawn_blocking");
+
+        upload::upload_index_part(
+            self.conf,
+            &self.storage_impl,
+            self.tenant_id,
+            self.timeline_id,
+            &index_part_with_deleted_at,
+        )
+        .await?;
+
+        // all good, disarm the guard and mark as success
+        ScopeGuard::into_inner(undo_deleted_at);
+        {
+            let mut locked = self.upload_queue.lock().unwrap();
+            let stopped = match &mut *locked {
+                UploadQueue::Uninitialized | UploadQueue::Initialized(_) => unreachable!(
+                    "there's no way out of Stopping, and we checked it's Stopping above: {:?}",
+                    locked.as_str(),
+                ),
+                UploadQueue::Stopped(stopped) => stopped,
+            };
+            stopped.deleted_at = SetDeletedFlagProgress::Successful(
+                index_part_with_deleted_at
+                    .deleted_at
+                    .expect("we set it above"),
+            );
+        }
+
+        Ok(())
+    }
+
     ///
     /// Pick next tasks from the queue, and start as many of them as possible without violating
     /// the ordering constraints.
@@ -741,8 +885,13 @@ impl RemoteTimelineClient {
             // upload finishes or times out soon enough.
             if task_mgr::is_shutdown_requested() {
                 info!("upload task cancelled by shutdown request");
+                match self.stop() {
+                    Ok(()) => {}
+                    Err(StopError::QueueUninitialized) => {
+                        unreachable!("we never launch an upload task if the queue is uninitialized, and once it is initialized, we never go back")
+                    }
+                }
                 self.calls_unfinished_metric_end(&task.op);
-                self.stop();
                 return;
             }
 
@@ -946,32 +1095,48 @@ impl RemoteTimelineClient {
         self.metrics.call_end(&file_kind, &op_kind, track_bytes);
     }
 
-    fn stop(&self) {
+    /// Close the upload queue for new operations and cancel queued operations.
+    /// In-progress operations will still be running after this function returns.
+    /// Use `task_mgr::shutdown_tasks(None, Some(self.tenant_id), Some(timeline_id))`
+    /// to wait for them to complete, after calling this function.
+    pub fn stop(&self) -> Result<(), StopError> {
         // Whichever *task* for this RemoteTimelineClient grabs the mutex first will transition the queue
         // into stopped state, thereby dropping all off the queued *ops* which haven't become *tasks* yet.
         // The other *tasks* will come here and observe an already shut down queue and hence simply wrap up their business.
         let mut guard = self.upload_queue.lock().unwrap();
-        match &*guard {
-            UploadQueue::Uninitialized => panic!(
-                "callers are responsible for ensuring this is only called on initialized queue"
-            ),
+        match &mut *guard {
+            UploadQueue::Uninitialized => Err(StopError::QueueUninitialized),
             UploadQueue::Stopped(_) => {
                 // nothing to do
                 info!("another concurrent task already shut down the queue");
+                Ok(())
             }
-            UploadQueue::Initialized(qi) => {
+            UploadQueue::Initialized(UploadQueueInitialized {
+                latest_files,
+                latest_metadata,
+                last_uploaded_consistent_lsn,
+                ..
+            }) => {
                 info!("shutting down upload queue");
 
                 // Replace the queue with the Stopped state, taking ownership of the old
                 // Initialized queue. We will do some checks on it, and then drop it.
                 let qi = {
-                    let last_uploaded_consistent_lsn = qi.last_uploaded_consistent_lsn;
-                    let upload_queue = std::mem::replace(
-                        &mut *guard,
-                        UploadQueue::Stopped(UploadQueueStopped {
-                            last_uploaded_consistent_lsn,
-                        }),
-                    );
+                    // take or clone what we need
+                    let latest_files = std::mem::take(latest_files);
+                    let last_uploaded_consistent_lsn = *last_uploaded_consistent_lsn;
+                    // this could be Copy
+                    let latest_metadata = latest_metadata.clone();
+
+                    let stopped = UploadQueueStopped {
+                        latest_files,
+                        last_uploaded_consistent_lsn,
+                        latest_metadata,
+                        deleted_at: SetDeletedFlagProgress::NotRunning,
+                    };
+
+                    let upload_queue =
+                        std::mem::replace(&mut *guard, UploadQueue::Stopped(stopped));
                     if let UploadQueue::Initialized(qi) = upload_queue {
                         qi
                     } else {
@@ -979,6 +1144,8 @@ impl RemoteTimelineClient {
                     }
                 };
 
+                assert!(qi.latest_files.is_empty(), "do not use this anymore");
+
                 // consistency check
                 assert_eq!(
                     qi.num_inprogress_layer_uploads
@@ -1002,6 +1169,7 @@ impl RemoteTimelineClient {
 
                 // We're done.
                 drop(guard);
+                Ok(())
             }
         }
     }
@@ -1240,7 +1408,11 @@ mod tests {
         }
 
         // Download back the index.json, and check that the list of files is correct
-        let index_part = runtime.block_on(client.download_index_file())?;
+        let index_part = match runtime.block_on(client.download_index_file())? {
+            MaybeDeletedIndexPart::IndexPart(index_part) => index_part,
+            MaybeDeletedIndexPart::Deleted => panic!("unexpectedly got deleted index part"),
+        };
+
         assert_file_list(
             &index_part.timeline_layers,
             &[
diff --git a/pageserver/src/tenant/remote_timeline_client/index.rs b/pageserver/src/tenant/remote_timeline_client/index.rs
index 9c84f8e977..7a06e57a6b 100644
--- a/pageserver/src/tenant/remote_timeline_client/index.rs
+++ b/pageserver/src/tenant/remote_timeline_client/index.rs
@@ -4,6 +4,7 @@
 
 use std::collections::{HashMap, HashSet};
 
+use chrono::NaiveDateTime;
 use serde::{Deserialize, Serialize};
 use serde_with::{serde_as, DisplayFromStr};
 
@@ -55,6 +56,10 @@ pub struct IndexPart {
     #[serde(default)]
     version: usize,
 
+    #[serde(default)]
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub deleted_at: Option<NaiveDateTime>,
+
     /// Layer names, which are stored on the remote storage.
     ///
     /// Additional metadata can might exist in `layer_metadata`.
@@ -78,7 +83,7 @@ impl IndexPart {
     /// used to understand later versions.
     ///
     /// Version is currently informative only.
-    const LATEST_VERSION: usize = 1;
+    const LATEST_VERSION: usize = 2;
     pub const FILE_NAME: &'static str = "index_part.json";
 
     pub fn new(
@@ -101,6 +106,7 @@ impl IndexPart {
             layer_metadata,
             disk_consistent_lsn,
             metadata_bytes,
+            deleted_at: None,
         }
     }
 
@@ -156,6 +162,7 @@ mod tests {
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata_bytes: [113,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: None,
         };
 
         let part = serde_json::from_str::<IndexPart>(example).unwrap();
@@ -192,6 +199,7 @@ mod tests {
             ]),
             disk_consistent_lsn: "0/16960E8".parse::<Lsn>().unwrap(),
             metadata_bytes: [112,11,159,210,0,54,0,4,0,0,0,0,1,105,96,232,1,0,0,0,0,1,105,96,112,0,0,0,0,0,0,0,0,0,0,0,0,0,1,105,96,112,0,0,0,0,1,105,96,112,0,0,0,14,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0].to_vec(),
+            deleted_at: None,
         };
 
         let part = serde_json::from_str::<IndexPart>(example).unwrap();
@@ -236,6 +244,7 @@ mod tests {
                 0, 0,
             ]
             .to_vec(),
+            deleted_at: None,
         };
 
         let empty_layers_parsed = serde_json::from_str::<IndexPart>(empty_layers_json).unwrap();
diff --git a/pageserver/src/tenant/remote_timeline_client/upload.rs b/pageserver/src/tenant/remote_timeline_client/upload.rs
index 699121ccd9..b520bb4b0c 100644
--- a/pageserver/src/tenant/remote_timeline_client/upload.rs
+++ b/pageserver/src/tenant/remote_timeline_client/upload.rs
@@ -19,9 +19,12 @@ pub(super) async fn upload_index_part<'a>(
     timeline_id: TimelineId,
     index_part: &'a IndexPart,
 ) -> anyhow::Result<()> {
+    tracing::trace!("uploading new index part");
+
     fail_point!("before-upload-index", |_| {
         bail!("failpoint before-upload-index")
     });
+
     let index_part_bytes = serde_json::to_vec(&index_part)
         .context("Failed to serialize index part file into bytes")?;
     let index_part_size = index_part_bytes.len();
@@ -31,6 +34,7 @@ pub(super) async fn upload_index_part<'a>(
         .metadata_path(timeline_id, tenant_id)
         .with_file_name(IndexPart::FILE_NAME);
     let storage_path = conf.remote_path(&index_part_path)?;
+
     storage
         .upload_storage_object(Box::new(index_part_bytes), index_part_size, &storage_path)
         .await
diff --git a/pageserver/src/tenant/upload_queue.rs b/pageserver/src/tenant/upload_queue.rs
index 08bc1f219d..8f5faff627 100644
--- a/pageserver/src/tenant/upload_queue.rs
+++ b/pageserver/src/tenant/upload_queue.rs
@@ -7,6 +7,7 @@ use crate::tenant::remote_timeline_client::index::LayerFileMetadata;
 use std::collections::{HashMap, VecDeque};
 use std::fmt::Debug;
 
+use chrono::NaiveDateTime;
 use std::sync::Arc;
 use tracing::info;
 
@@ -18,14 +19,14 @@ use utils::lsn::Lsn;
 // that many upload queues in a running pageserver, and most of them are initialized
 // anyway.
 #[allow(clippy::large_enum_variant)]
-pub(crate) enum UploadQueue {
+pub(super) enum UploadQueue {
     Uninitialized,
     Initialized(UploadQueueInitialized),
     Stopped(UploadQueueStopped),
 }
 
 impl UploadQueue {
-    fn as_str(&self) -> &'static str {
+    pub fn as_str(&self) -> &'static str {
         match self {
             UploadQueue::Uninitialized => "Uninitialized",
             UploadQueue::Initialized(_) => "Initialized",
@@ -75,8 +76,18 @@ pub(crate) struct UploadQueueInitialized {
     pub(crate) queued_operations: VecDeque<UploadOp>,
 }
 
-pub(crate) struct UploadQueueStopped {
-    pub(crate) last_uploaded_consistent_lsn: Lsn,
+#[derive(Clone, Copy)]
+pub(super) enum SetDeletedFlagProgress {
+    NotRunning,
+    InProgress(NaiveDateTime),
+    Successful(NaiveDateTime),
+}
+
+pub(super) struct UploadQueueStopped {
+    pub(super) latest_files: HashMap<LayerFileName, LayerFileMetadata>,
+    pub(super) last_uploaded_consistent_lsn: Lsn,
+    pub(super) latest_metadata: TimelineMetadata,
+    pub(super) deleted_at: SetDeletedFlagProgress,
 }
 
 impl UploadQueue {
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 0c4ed60c8d..388e834b56 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -314,9 +314,9 @@ class PageserverHttpClient(requests.Session):
         assert isinstance(res_json, dict)
         return res_json
 
-    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId):
+    def timeline_delete(self, tenant_id: TenantId, timeline_id: TimelineId, **kwargs):
         res = self.delete(
-            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}"
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}", **kwargs
         )
         self.verbose_error(res)
         res_json = res.json()
diff --git a/test_runner/fixtures/pageserver/utils.py b/test_runner/fixtures/pageserver/utils.py
index 7f8bb40bda..c558387413 100644
--- a/test_runner/fixtures/pageserver/utils.py
+++ b/test_runner/fixtures/pageserver/utils.py
@@ -87,7 +87,9 @@ def wait_until_tenant_state(
 
         time.sleep(period)
 
-    raise Exception(f"Tenant {tenant_id} did not become {expected_state} in {iterations} seconds")
+    raise Exception(
+        f"Tenant {tenant_id} did not become {expected_state} within {iterations * period} seconds"
+    )
 
 
 def wait_until_tenant_active(
diff --git a/test_runner/regress/test_import.py b/test_runner/regress/test_import.py
index 137ce457bc..77030288f0 100644
--- a/test_runner/regress/test_import.py
+++ b/test_runner/regress/test_import.py
@@ -143,6 +143,8 @@ def test_import_from_vanilla(test_output_dir, pg_bin, vanilla_pg, neon_env_build
     assert env.pageserver.log_contains(
         ".*WARN.*ignored .* unexpected bytes after the tar archive.*"
     )
+
+    # NOTE: delete can easily come before upload operations are completed
     client.timeline_delete(tenant, timeline)
 
     # Importing correct backup works
diff --git a/test_runner/regress/test_timeline_delete.py b/test_runner/regress/test_timeline_delete.py
index cf607f4f7b..7135b621cb 100644
--- a/test_runner/regress/test_timeline_delete.py
+++ b/test_runner/regress/test_timeline_delete.py
@@ -1,8 +1,26 @@
+import os
+import queue
+import shutil
+import threading
+from pathlib import Path
+
 import pytest
-from fixtures.neon_fixtures import NeonEnv
+import requests
+from fixtures.log_helper import log
+from fixtures.neon_fixtures import (
+    NeonEnv,
+    NeonEnvBuilder,
+    RemoteStorageKind,
+    available_remote_storages,
+)
 from fixtures.pageserver.http import PageserverApiException
-from fixtures.types import TenantId, TimelineId
-from fixtures.utils import wait_until
+from fixtures.pageserver.utils import (
+    wait_for_last_record_lsn,
+    wait_for_upload,
+    wait_until_tenant_active,
+)
+from fixtures.types import Lsn, TenantId, TimelineId
+from fixtures.utils import query_scalar, wait_until
 
 
 def test_timeline_delete(neon_simple_env: NeonEnv):
@@ -39,23 +57,17 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
         "test_ancestor_branch_delete_branch1", "test_ancestor_branch_delete_parent"
     )
 
-    ps_http = env.pageserver.http_client()
+    timeline_path = (
+        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(parent_timeline_id)
+    )
+
     with pytest.raises(
         PageserverApiException, match="Cannot delete timeline which has child timelines"
     ) as exc:
-        timeline_path = (
-            env.repo_dir
-            / "tenants"
-            / str(env.initial_tenant)
-            / "timelines"
-            / str(parent_timeline_id)
-        )
         assert timeline_path.exists()
 
         ps_http.timeline_delete(env.initial_tenant, parent_timeline_id)
 
-        assert not timeline_path.exists()
-
     assert exc.value.status_code == 400
 
     timeline_path = (
@@ -87,3 +99,350 @@ def test_timeline_delete(neon_simple_env: NeonEnv):
         )
 
     assert exc.value.status_code == 404
+
+    # Check that we didn't pick up the timeline again after restart.
+    # See https://github.com/neondatabase/neon/issues/3560
+    env.pageserver.stop(immediate=True)
+    env.pageserver.start()
+
+    with pytest.raises(
+        PageserverApiException,
+        match=f"Timeline {env.initial_tenant}/{leaf_timeline_id} was not found",
+    ) as exc:
+        ps_http.timeline_detail(env.initial_tenant, leaf_timeline_id)
+
+
+# cover the two cases: remote storage configured vs not configured
+@pytest.mark.parametrize("remote_storage_kind", [None, RemoteStorageKind.LOCAL_FS])
+def test_delete_timeline_post_rm_failure(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    If there is a failure after removing the timeline directory, the delete operation
+    should be retryable.
+    """
+
+    if remote_storage_kind is not None:
+        neon_env_builder.enable_remote_storage(
+            remote_storage_kind, "test_delete_timeline_post_rm_failure"
+        )
+
+    env = neon_env_builder.init_start()
+    assert env.initial_timeline
+
+    ps_http = env.pageserver.http_client()
+
+    failpoint_name = "timeline-delete-after-rm"
+    ps_http.configure_failpoints((failpoint_name, "return"))
+
+    with pytest.raises(PageserverApiException, match=f"failpoint: {failpoint_name}"):
+        ps_http.timeline_delete(env.initial_tenant, env.initial_timeline)
+
+    at_failpoint_log_message = f".*{env.initial_timeline}.*at failpoint {failpoint_name}.*"
+    env.pageserver.allowed_errors.append(at_failpoint_log_message)
+    env.pageserver.allowed_errors.append(
+        f".*DELETE.*{env.initial_timeline}.*InternalServerError.*{failpoint_name}"
+    )
+
+    # retry without failpoint, it should succeed
+    ps_http.configure_failpoints((failpoint_name, "off"))
+
+    # this should succeed
+    ps_http.timeline_delete(env.initial_tenant, env.initial_timeline, timeout=2)
+    # the second call will try to transition the timeline into Stopping state, but it's already in that state
+    env.pageserver.allowed_errors.append(
+        f".*{env.initial_timeline}.*Ignoring new state, equal to the existing one: Stopping"
+    )
+    env.pageserver.allowed_errors.append(
+        f".*{env.initial_timeline}.*timeline directory not found, proceeding anyway.*"
+    )
+
+
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+@pytest.mark.parametrize("fill_branch", [True, False])
+def test_timeline_resurrection_on_attach(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+    fill_branch: bool,
+):
+    """
+    After deleting a timeline it should never appear again.
+    This test ensures that this invariant holds for detach+attach.
+    Original issue: https://github.com/neondatabase/neon/issues/3560
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_timeline_resurrection_on_attach",
+    )
+
+    ##### First start, insert data and upload it to the remote storage
+    env = neon_env_builder.init_start()
+
+    ps_http = env.pageserver.http_client()
+    pg = env.endpoints.create_start("main")
+
+    tenant_id = TenantId(pg.safe_psql("show neon.tenant_id")[0][0])
+    main_timeline_id = TimelineId(pg.safe_psql("show neon.timeline_id")[0][0])
+
+    with pg.cursor() as cur:
+        cur.execute("CREATE TABLE f (i integer);")
+        cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+        # wait until pageserver receives that data
+        wait_for_last_record_lsn(ps_http, tenant_id, main_timeline_id, current_lsn)
+
+        # run checkpoint manually to be sure that data landed in remote storage
+        ps_http.timeline_checkpoint(tenant_id, main_timeline_id)
+
+        # wait until pageserver successfully uploaded a checkpoint to remote storage
+        log.info("waiting for checkpoint upload")
+        wait_for_upload(ps_http, tenant_id, main_timeline_id, current_lsn)
+        log.info("upload of checkpoint is done")
+
+    branch_timeline_id = env.neon_cli.create_branch("new", "main")
+
+    # Two variants of this test:
+    # - In fill_branch=True, the deleted branch has layer files.
+    # - In fill_branch=False, it doesn't, it just has the metadata file.
+    # A broken implementation is conceivable that tries to "optimize" handling of empty branches, e.g.,
+    # by skipping IndexPart uploads if the layer file set doesn't change. That would be wrong, catch those.
+    if fill_branch:
+        with env.endpoints.create_start("new") as new_pg:
+            with new_pg.cursor() as cur:
+                cur.execute("INSERT INTO f VALUES (generate_series(1,1000));")
+                current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+                # wait until pageserver receives that data
+                wait_for_last_record_lsn(ps_http, tenant_id, branch_timeline_id, current_lsn)
+
+                # run checkpoint manually to be sure that data landed in remote storage
+                ps_http.timeline_checkpoint(tenant_id, branch_timeline_id)
+
+                # wait until pageserver successfully uploaded a checkpoint to remote storage
+                log.info("waiting for checkpoint upload")
+                wait_for_upload(ps_http, tenant_id, branch_timeline_id, current_lsn)
+                log.info("upload of checkpoint is done")
+    else:
+        pass
+
+    # delete new timeline
+    ps_http.timeline_delete(tenant_id=tenant_id, timeline_id=branch_timeline_id)
+
+    ##### Stop the pageserver instance, erase all its data
+    env.endpoints.stop_all()
+    env.pageserver.stop()
+
+    dir_to_clear = Path(env.repo_dir) / "tenants"
+    shutil.rmtree(dir_to_clear)
+    os.mkdir(dir_to_clear)
+
+    ##### Second start, restore the data and ensure that we see only timeline that wasnt deleted
+    env.pageserver.start()
+
+    ps_http.tenant_attach(tenant_id=tenant_id)
+
+    wait_until_tenant_active(ps_http, tenant_id=tenant_id, iterations=10, period=0.5)
+
+    timelines = ps_http.timeline_list(tenant_id=tenant_id)
+    assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
+        main_timeline_id
+    }, "the deleted timeline should not have been resurrected"
+    assert all([tl["state"] == "Active" for tl in timelines])
+
+
+def test_timeline_delete_fail_before_local_delete(neon_env_builder: NeonEnvBuilder):
+    """
+    When deleting a timeline, if we succeed in setting the deleted flag remotely
+    but fail to delete the local state, restarting the pageserver should resume
+    the deletion of the local state.
+    (Deletion of the state in S3 is not implemented yet.)
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_timeline_delete_fail_before_local_delete",
+    )
+
+    env = neon_env_builder.init_start()
+
+    env.pageserver.allowed_errors.append(".*failpoint: timeline-delete-before-rm")
+    env.pageserver.allowed_errors.append(
+        ".*Ignoring new state, equal to the existing one: Stopping"
+    )
+    env.pageserver.allowed_errors.append(
+        ".*during shutdown: cannot flush frozen layers when flush_loop is not running, state is Exited"
+    )
+
+    ps_http = env.pageserver.http_client()
+    ps_http.configure_failpoints(("timeline-delete-before-rm", "return"))
+
+    # construct pair of branches
+    intermediate_timeline_id = env.neon_cli.create_branch(
+        "test_timeline_delete_fail_before_local_delete"
+    )
+
+    leaf_timeline_id = env.neon_cli.create_branch(
+        "test_timeline_delete_fail_before_local_delete1",
+        "test_timeline_delete_fail_before_local_delete",
+    )
+
+    leaf_timeline_path = (
+        env.repo_dir / "tenants" / str(env.initial_tenant) / "timelines" / str(leaf_timeline_id)
+    )
+
+    with pytest.raises(
+        PageserverApiException,
+        match="failpoint: timeline-delete-before-rm",
+    ):
+        ps_http.timeline_delete(env.initial_tenant, leaf_timeline_id)
+
+    assert leaf_timeline_path.exists(), "the failpoint didn't work"
+
+    env.pageserver.stop()
+    env.pageserver.start()
+
+    # Wait for tenant to finish loading.
+    wait_until_tenant_active(ps_http, tenant_id=env.initial_tenant, iterations=10, period=0.5)
+
+    assert (
+        not leaf_timeline_path.exists()
+    ), "timeline load procedure should have resumed the deletion interrupted by the failpoint"
+    timelines = ps_http.timeline_list(env.initial_tenant)
+    assert {TimelineId(tl["timeline_id"]) for tl in timelines} == {
+        intermediate_timeline_id,
+        env.initial_timeline,
+    }, "other timelines should not have been affected"
+    assert all([tl["state"] == "Active" for tl in timelines])
+
+
+def test_concurrent_timeline_delete_if_first_stuck_at_index_upload(
+    neon_env_builder: NeonEnvBuilder,
+):
+    """
+    If we're stuck uploading the index file with the is_delete flag,
+    eventually console will hand up and retry.
+    If we're still stuck at the retry time, ensure that the retry
+    fails with status 500, signalling to console that it should retry
+    later.
+    Ideally, timeline_delete should return 202 Accepted and require
+    console to poll for completion, but, that would require changing
+    the API contract.
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_concurrent_timeline_delete_if_first_stuck_at_index_upload",
+    )
+
+    env = neon_env_builder.init_start()
+
+    child_timeline_id = env.neon_cli.create_branch("child", "main")
+
+    ps_http = env.pageserver.http_client()
+
+    # make the first call sleep practically forever
+    failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+    ps_http.configure_failpoints((failpoint_name, "pause"))
+
+    def first_call(result_queue):
+        try:
+            log.info("first call start")
+            ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=10)
+            log.info("first call success")
+            result_queue.put("success")
+        except Exception:
+            log.exception("first call failed")
+            result_queue.put("failure, see log for stack trace")
+
+    first_call_result: queue.Queue[str] = queue.Queue()
+    first_call_thread = threading.Thread(target=first_call, args=(first_call_result,))
+    first_call_thread.start()
+
+    try:
+
+        def first_call_hit_failpoint():
+            assert env.pageserver.log_contains(
+                f".*{child_timeline_id}.*at failpoint {failpoint_name}"
+            )
+
+        wait_until(50, 0.1, first_call_hit_failpoint)
+
+        # make the second call and assert behavior
+        log.info("second call start")
+        error_msg_re = "another task is already setting the deleted_flag, started at"
+        with pytest.raises(PageserverApiException, match=error_msg_re) as second_call_err:
+            ps_http.timeline_delete(env.initial_tenant, child_timeline_id)
+        assert second_call_err.value.status_code == 500
+        env.pageserver.allowed_errors.append(f".*{child_timeline_id}.*{error_msg_re}.*")
+        # the second call will try to transition the timeline into Stopping state as well
+        env.pageserver.allowed_errors.append(
+            f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+        )
+        log.info("second call failed as expected")
+
+        # by now we know that the second call failed, let's ensure the first call will finish
+        ps_http.configure_failpoints((failpoint_name, "off"))
+
+        result = first_call_result.get()
+        assert result == "success"
+
+    finally:
+        log.info("joining first call thread")
+        # in any case, make sure the lifetime of the thread is bounded to this test
+        first_call_thread.join()
+
+
+def test_delete_timeline_client_hangup(neon_env_builder: NeonEnvBuilder):
+    """
+    If the client hangs up before we start the index part upload but after we mark it
+    deleted in local memory, a subsequent delete_timeline call should be able to do
+    another delete timeline operation.
+
+    This tests cancel safety up to the given failpoint.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=RemoteStorageKind.MOCK_S3,
+        test_name="test_delete_timeline_client_hangup",
+    )
+
+    env = neon_env_builder.init_start()
+
+    child_timeline_id = env.neon_cli.create_branch("child", "main")
+
+    ps_http = env.pageserver.http_client()
+
+    failpoint_name = "persist_index_part_with_deleted_flag_after_set_before_upload_pause"
+    ps_http.configure_failpoints((failpoint_name, "pause"))
+
+    with pytest.raises(requests.exceptions.Timeout):
+        ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
+
+    # make sure the timeout was due to the failpoint
+    at_failpoint_log_message = f".*{child_timeline_id}.*at failpoint {failpoint_name}.*"
+
+    def hit_failpoint():
+        assert env.pageserver.log_contains(at_failpoint_log_message)
+
+    wait_until(50, 0.1, hit_failpoint)
+
+    # we log this error if a client hangs up
+    # might as well use it as another indicator that the test works
+    hangup_log_message = f".*DELETE.*{child_timeline_id}.*request was dropped before completing"
+    env.pageserver.allowed_errors.append(hangup_log_message)
+
+    def got_hangup_log_message():
+        assert env.pageserver.log_contains(hangup_log_message)
+
+    wait_until(50, 0.1, got_hangup_log_message)
+
+    # ok, retry without failpoint, it should succeed
+    ps_http.configure_failpoints((failpoint_name, "off"))
+
+    # this should succeed
+    ps_http.timeline_delete(env.initial_tenant, child_timeline_id, timeout=2)
+    # the second call will try to transition the timeline into Stopping state, but it's already in that state
+    env.pageserver.allowed_errors.append(
+        f".*{child_timeline_id}.*Ignoring new state, equal to the existing one: Stopping"
+    )

From 1d490b2311e44b8617c0e050f86665a927c721b7 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Sun, 7 May 2023 17:10:59 +0100
Subject: [PATCH 04/23] Make benchmark_fixture less noisy

---
 test_runner/fixtures/benchmark_fixture.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/test_runner/fixtures/benchmark_fixture.py b/test_runner/fixtures/benchmark_fixture.py
index 67a99aa452..99682caf80 100644
--- a/test_runner/fixtures/benchmark_fixture.py
+++ b/test_runner/fixtures/benchmark_fixture.py
@@ -451,13 +451,17 @@ def pytest_terminal_summary(
     revision = os.getenv("GITHUB_SHA", "local")
     platform = os.getenv("PLATFORM", "local")
 
-    terminalreporter.section("Benchmark results", "-")
+    is_header_printed = False
 
     result = []
     for test_report in terminalreporter.stats.get("passed", []):
         result_entry = []
 
         for _, recorded_property in test_report.user_properties:
+            if not is_header_printed:
+                terminalreporter.section("Benchmark results", "-")
+                is_header_printed = True
+
             terminalreporter.write(
                 "{}.{}: ".format(test_report.head_line, recorded_property["name"])
             )
@@ -485,7 +489,6 @@ def pytest_terminal_summary(
 
     out_dir = config.getoption("out_dir")
     if out_dir is None:
-        warnings.warn("no out dir provided to store performance test results")
         return
 
     if not result:

From 7fc778d251dd682d124c77387147bd4e0d4cf8f1 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 10 May 2023 11:53:02 +0100
Subject: [PATCH 05/23] GitHub Autocomment: fix flaky test notifications

---
 scripts/pr-comment-test-report.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js
index 21386d3898..134df432ae 100644
--- a/scripts/pr-comment-test-report.js
+++ b/scripts/pr-comment-test-report.js
@@ -69,7 +69,7 @@ module.exports = async ({ github, context, fetch, reports }) => {
                     if (test.retriesCount > 0) {
                         retriedTests.push(test);
 
-                        if (test.retriedStatusChangedTests) {
+                        if (test.retriesStatusChange) {
                             retriedStatusChangedTests.push(test);
                         }
                     }

From 59510f644953a43142155e796b620d58c21f9d52 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Wed, 10 May 2023 11:58:24 +0100
Subject: [PATCH 06/23] scripts/flaky_tests.py: use retriesStatusChange from
 Allure

---
 scripts/flaky_tests.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/flaky_tests.py b/scripts/flaky_tests.py
index 262950b61d..a3b29909e5 100755
--- a/scripts/flaky_tests.py
+++ b/scripts/flaky_tests.py
@@ -21,6 +21,7 @@ FLAKY_TESTS_QUERY = """
                 jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'name' as suite,
                 jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'name' as test,
                 jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'status' as status,
+                jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'retriesStatusChange' as retries_status_change,
                 to_timestamp((jsonb_array_elements(jsonb_array_elements(jsonb_array_elements(data -> 'children') -> 'children') -> 'children') -> 'time' -> 'start')::bigint / 1000)::date as timestamp
             FROM
                 regress_test_results
@@ -29,7 +30,7 @@ FLAKY_TESTS_QUERY = """
         ) data
     WHERE
         timestamp > CURRENT_DATE - INTERVAL '%s' day
-        AND status::text IN ('"failed"', '"broken"')
+        AND (status::text IN ('"failed"', '"broken"') OR retries_status_change::boolean)
     ;
 """
 

From ecced13d906afdee72cfbe7e8bb03e3440d26302 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 11 May 2023 16:14:42 +0300
Subject: [PATCH 07/23] try: higher page_service timeouts to isolate an issue
 (#4206)

See #4205.
---
 pageserver/src/page_service.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/page_service.rs b/pageserver/src/page_service.rs
index a7a0d1a22e..bd3ece2dfc 100644
--- a/pageserver/src/page_service.rs
+++ b/pageserver/src/page_service.rs
@@ -256,7 +256,10 @@ async fn page_service_conn_main(
     //
     // no write timeout is used, because the kernel is assumed to error writes after some time.
     let mut socket = tokio_io_timeout::TimeoutReader::new(socket);
-    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 10)));
+
+    // timeout should be lower, but trying out multiple days for
+    // <https://github.com/neondatabase/neon/issues/4205>
+    socket.set_timeout(Some(std::time::Duration::from_secs(60 * 60 * 24 * 3)));
     let socket = std::pin::pin!(socket);
 
     // XXX: pgbackend.run() should take the connection_ctx,

From 80522a1b9df545664bab49df827f57f5ca085acc Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 11 May 2023 15:53:46 +0200
Subject: [PATCH 08/23] replace has_in_progress_downloads with new
 attachment_status field (#4168)

Control Plane currently [^1] polls for `has_in_progress_downloads ==
false` after /attach to determine that an attach operation succeeded.

As pointed out in the OpenAPI spec as of neon#4151, polling for
`has_in_progress_downloads` is incorrect.

This patch changes the situation by
- removing `has_in_progress_downloads`
- adding a new field `attachment_status.`
- changing instructions for `/attach` to poll for `attachment_status ==
attached`.

This makes the instructions in `/attach` actionable for Control Plane.
NB that we don't expose the TenantState in the OpenAPI docs, even though
we expose it in the endpoint. That is with good reason because we don't
want to commit to a fixed set of tenant states forever. Hence, the
separate `attachment_status` field that exposes the bare minimum
required to make /attach + subsequent polling 100% safe wrt split brain.

It would have been nice to report failures explicitly, but the problem
is that we lose that state when we restart. So, we return `attached`
upon attach failure. The tenant is Broken in that case, causing Control
Plane's subsequent health check will fail. Control Plane can roll back
the relocation operation then.
NB: the reliance on the subsequent health check is no change to what we
had before this patch!
NB: we can always add additional TenantAttachmentStatus'es in the future
to communicate failure.

This PR also moves the attach-marker file's creation to the API
handler's synchronous part. That was done to avoid the need to
distinguish
* `Attaching but marker not yet written => AttachmentStatus::Maybe` from
* `Attaching, marker written, but attach failed for other reason =>
AttachmentStatus::Attached`

Coincidentally, this also adds more transactionality to the /attach API
because we only return 202 once we've written the marker file. But, in
the end, it doesn't affect how the control plane interacts with us or
how it needs to do retries. So, we don't mention any of this in the API
docs.

[^1]: The one-click tenant relocation PR cloud#4740, currently WIP, is
      the first real user.
---
 libs/pageserver_api/src/models.rs    | 51 +++++++++++++----
 pageserver/src/http/openapi_spec.yml | 46 +++++++++------
 pageserver/src/http/routes.rs        |  4 +-
 pageserver/src/tenant.rs             | 83 ++++++++++++++++++++--------
 pageserver/src/tenant/mgr.rs         | 17 +++++-
 5 files changed, 145 insertions(+), 56 deletions(-)

diff --git a/libs/pageserver_api/src/models.rs b/libs/pageserver_api/src/models.rs
index 15c37b9453..e4df81c9ad 100644
--- a/libs/pageserver_api/src/models.rs
+++ b/libs/pageserver_api/src/models.rs
@@ -48,13 +48,33 @@ pub enum TenantState {
 }
 
 impl TenantState {
-    pub fn has_in_progress_downloads(&self) -> bool {
+    pub fn attachment_status(&self) -> TenantAttachmentStatus {
+        use TenantAttachmentStatus::*;
         match self {
-            Self::Loading => true,
-            Self::Attaching => true,
-            Self::Active => false,
-            Self::Stopping => false,
-            Self::Broken { .. } => false,
+            // The attach procedure writes the marker file before adding the Attaching tenant to the tenants map.
+            // So, technically, we can return Attached here.
+            // However, as soon as Console observes Attached, it will proceed with the Postgres-level health check.
+            // But, our attach task might still be fetching the remote timelines, etc.
+            // So, return `Maybe` while Attaching, making Console wait for the attach task to finish.
+            Self::Attaching => Maybe,
+            // tenant mgr startup distinguishes attaching from loading via marker file.
+            // If it's loading, there is no attach marker file, i.e., attach had finished in the past.
+            Self::Loading => Attached,
+            // We only reach Active after successful load / attach.
+            // So, call atttachment status Attached.
+            Self::Active => Attached,
+            // If the (initial or resumed) attach procedure fails, the tenant becomes Broken.
+            // However, it also becomes Broken if the regular load fails.
+            // We would need a separate TenantState variant to distinguish these cases.
+            // However, there's no practical difference from Console's perspective.
+            // It will run a Postgres-level health check as soon as it observes Attached.
+            // That will fail on Broken tenants.
+            // Console can then rollback the attach, or, wait for operator to fix the Broken tenant.
+            Self::Broken { .. } => Attached,
+            // Why is Stopping a Maybe case? Because, during pageserver shutdown,
+            // we set the Stopping state irrespective of whether the tenant
+            // has finished attaching or not.
+            Self::Stopping => Maybe,
         }
     }
 
@@ -209,16 +229,25 @@ impl TenantConfigRequest {
     }
 }
 
+/// See [`TenantState::attachment_status`] and the OpenAPI docs for context.
+#[derive(Serialize, Deserialize, Clone)]
+#[serde(rename_all = "snake_case")]
+pub enum TenantAttachmentStatus {
+    Maybe,
+    Attached,
+}
+
 #[serde_as]
 #[derive(Serialize, Deserialize, Clone)]
 pub struct TenantInfo {
     #[serde_as(as = "DisplayFromStr")]
     pub id: TenantId,
+    // NB: intentionally not part of OpenAPI, we don't want to commit to a specific set of TenantState's
     pub state: TenantState,
     /// Sum of the size of all layer files.
     /// If a layer is present in both local FS and S3, it counts only once.
     pub current_physical_size: Option<u64>, // physical size is only included in `tenant_status` endpoint
-    pub has_in_progress_downloads: Option<bool>,
+    pub attachment_status: TenantAttachmentStatus,
 }
 
 /// This represents the output of the "timeline_detail" and "timeline_list" API calls.
@@ -691,7 +720,7 @@ mod tests {
             id: TenantId::generate(),
             state: TenantState::Active,
             current_physical_size: Some(42),
-            has_in_progress_downloads: Some(false),
+            attachment_status: TenantAttachmentStatus::Attached,
         };
         let expected_active = json!({
             "id": original_active.id.to_string(),
@@ -699,7 +728,7 @@ mod tests {
                 "slug": "Active",
             },
             "current_physical_size": 42,
-            "has_in_progress_downloads": false,
+            "attachment_status": "attached",
         });
 
         let original_broken = TenantInfo {
@@ -709,7 +738,7 @@ mod tests {
                 backtrace: "backtrace info".into(),
             },
             current_physical_size: Some(42),
-            has_in_progress_downloads: Some(false),
+            attachment_status: TenantAttachmentStatus::Attached,
         };
         let expected_broken = json!({
             "id": original_broken.id.to_string(),
@@ -721,7 +750,7 @@ mod tests {
                 }
             },
             "current_physical_size": 42,
-            "has_in_progress_downloads": false,
+            "attachment_status": "attached",
         });
 
         assert_eq!(
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index 877b367b40..330587310f 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -346,23 +346,23 @@ paths:
         starts writing to the tenant's S3 state unless it receives one of the
         distinguished errors below that state otherwise.
 
-        The method to identify whether a request has arrived at the pageserver, and
-        whether it has succeeded, is to poll for the tenant status to reach "Active"
-        or "Broken" state. These values are currently not explicitly documented in
-        the API spec.
-        Polling for `has_in_progress_downloads == false` is INCORRECT because that
-        value can turn `false` during shutdown while the Attach operation is still
-        unfinished.
+        If a client receives a not-distinguished response, e.g., a network timeout,
+        it MUST retry the /attach request and poll again for the tenant's
+        attachment status.
+
+        After the client has received a 202, it MUST poll the tenant's
+        attachment status (field `attachment_status`) to reach state `attached`.
+        If the `attachment_status` is missing, the client MUST retry the `/attach`
+        request (goto previous paragraph). This is a robustness measure in case the tenant
+        status endpoint is buggy, but the attach operation is ongoing.
 
         There is no way to cancel an in-flight request.
 
-        If a client receives a not-distinguished response, e.g., a network timeout,
-        it MUST retry the /attach request and poll again for tenant status.
-
-        In any case, it must
-        * NOT ASSUME that the /attach request has been lost in the network,
-        * NOT ASSUME that the request has been lost based on a subsequent
-          tenant status request returning 404. (The request may still be in flight!)
+        In any case, the client
+        * MUST NOT ASSUME that the /attach request has been lost in the network,
+        * MUST NOT ASSUME that the request has been lost, based on the observation
+          that a subsequent tenant status request returns 404. The request may
+          still be in flight. It must be retried.
       responses:
         "202":
           description: Tenant attaching scheduled
@@ -888,13 +888,27 @@ components:
       type: object
       required:
         - id
+        - attachment_status
       properties:
         id:
           type: string
         current_physical_size:
           type: integer
-        has_in_progress_downloads:
-          type: boolean
+        attachment_status:
+          description: |
+            Status of this tenant's attachment to this pageserver.
+
+            - `maybe` means almost nothing, don't read anything into it
+              except for the fact that the pageserver _might_ be already
+              writing to the tenant's S3 state, so, DO NOT ATTACH the
+              tenant to any other pageserver, or we risk split-brain.
+            - `attached` means that the attach operation has completed,
+              maybe successfully, maybe not. Perform a health check at
+              the Postgres level to determine healthiness of the tenant.
+
+            See the tenant `/attach` endpoint for more information.
+          type: string
+          enum: [ "maybe", "attached" ]
     TenantCreateInfo:
       type: object
       properties:
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 58ee7367ca..19d8243b37 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -467,7 +467,7 @@ async fn tenant_list_handler(request: Request<Body>) -> Result<Response<Body>, A
             id: *id,
             state: state.clone(),
             current_physical_size: None,
-            has_in_progress_downloads: Some(state.has_in_progress_downloads()),
+            attachment_status: state.attachment_status(),
         })
         .collect::<Vec<TenantInfo>>();
 
@@ -492,7 +492,7 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
             id: tenant_id,
             state: state.clone(),
             current_physical_size: Some(current_physical_size),
-            has_in_progress_downloads: Some(state.has_in_progress_downloads()),
+            attachment_status: state.attachment_status(),
         })
     }
     .instrument(info_span!("tenant_status_handler", tenant = %tenant_id))
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1c6006493e..f97afe9cc9 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -593,16 +593,19 @@ impl Tenant {
     /// finishes. You can use wait_until_active() to wait for the task to
     /// complete.
     ///
-    pub fn spawn_attach(
+    pub(crate) fn spawn_attach(
         conf: &'static PageServerConf,
         tenant_id: TenantId,
         remote_storage: GenericRemoteStorage,
         ctx: &RequestContext,
-    ) -> Arc<Tenant> {
+    ) -> anyhow::Result<Arc<Tenant>> {
         // XXX: Attach should provide the config, especially during tenant migration.
         //      See https://github.com/neondatabase/neon/issues/1555
         let tenant_conf = TenantConfOpt::default();
 
+        Self::attach_idempotent_create_marker_file(conf, tenant_id)
+            .context("create attach marker file")?;
+
         let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
         let tenant = Arc::new(Tenant::new(
             TenantState::Attaching,
@@ -635,7 +638,46 @@ impl Tenant {
                 Ok(())
             },
         );
-        tenant
+        Ok(tenant)
+    }
+
+    fn attach_idempotent_create_marker_file(
+        conf: &'static PageServerConf,
+        tenant_id: TenantId,
+    ) -> anyhow::Result<()> {
+        // Create directory with marker file to indicate attaching state.
+        // The load_local_tenants() function in tenant::mgr relies on the marker file
+        // to determine whether a tenant has finished attaching.
+        let tenant_dir = conf.tenant_path(&tenant_id);
+        let marker_file = conf.tenant_attaching_mark_file_path(&tenant_id);
+        debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
+        // TODO: should use tokio::fs here, but
+        // 1. caller is not async, for good reason (it holds tenants map lock)
+        // 2. we'd need to think about cancel safety. Turns out dropping a tokio::fs future
+        //    doesn't wait for the activity in the fs thread pool.
+        crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
+        match fs::OpenOptions::new()
+            .write(true)
+            .create_new(true)
+            .open(&marker_file)
+        {
+            Ok(_) => {}
+            Err(e) if e.kind() == std::io::ErrorKind::AlreadyExists => {
+                // Either this is a retry of attach or there is a concurrent task also doing attach for this tenant.
+                // We cannot distinguish this here.
+                // The caller is responsible for ensuring there's no concurrent attach for a tenant.
+                {} // fsync again, we don't know if that already happened
+            }
+            err => {
+                err.context("create tenant attaching marker file")?;
+                unreachable!("we covered the Ok() case above");
+            }
+        }
+        crashsafe::fsync_file_and_parent(&marker_file)
+            .context("fsync tenant attaching marker file and parent")?;
+        debug_assert!(tenant_dir.is_dir());
+        debug_assert!(marker_file.is_file());
+        Ok(())
     }
 
     ///
@@ -643,26 +685,15 @@ impl Tenant {
     ///
     #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
     async fn attach(self: &Arc<Tenant>, ctx: RequestContext) -> anyhow::Result<()> {
-        // Create directory with marker file to indicate attaching state.
-        // The load_local_tenants() function in tenant::mgr relies on the marker file
-        // to determine whether a tenant has finished attaching.
-        let tenant_dir = self.conf.tenant_path(&self.tenant_id);
         let marker_file = self.conf.tenant_attaching_mark_file_path(&self.tenant_id);
-        debug_assert_eq!(marker_file.parent().unwrap(), tenant_dir);
-        if tenant_dir.exists() {
-            if !marker_file.is_file() {
-                anyhow::bail!(
-                    "calling Tenant::attach with a tenant directory that doesn't have the attaching marker file:\ntenant_dir: {}\nmarker_file: {}",
-                    tenant_dir.display(), marker_file.display());
-            }
-        } else {
-            crashsafe::create_dir_all(&tenant_dir).context("create tenant directory")?;
-            fs::File::create(&marker_file).context("create tenant attaching marker file")?;
-            crashsafe::fsync_file_and_parent(&marker_file)
-                .context("fsync tenant attaching marker file and parent")?;
+        if !tokio::fs::try_exists(&marker_file)
+            .await
+            .context("check for existence of marker file")?
+        {
+            anyhow::bail!(
+                "implementation error: marker file should exist at beginning of this function"
+            );
         }
-        debug_assert!(tenant_dir.is_dir());
-        debug_assert!(marker_file.is_file());
 
         // Get list of remote timelines
         // download index files for every tenant timeline
@@ -839,11 +870,15 @@ impl Tenant {
     }
 
     /// Create a placeholder Tenant object for a broken tenant
-    pub fn create_broken_tenant(conf: &'static PageServerConf, tenant_id: TenantId) -> Arc<Tenant> {
+    pub fn create_broken_tenant(
+        conf: &'static PageServerConf,
+        tenant_id: TenantId,
+        reason: String,
+    ) -> Arc<Tenant> {
         let wal_redo_manager = Arc::new(PostgresRedoManager::new(conf, tenant_id));
         Arc::new(Tenant::new(
             TenantState::Broken {
-                reason: "create_broken_tenant".into(),
+                reason,
                 backtrace: String::new(),
             },
             conf,
@@ -876,7 +911,7 @@ impl Tenant {
             Ok(conf) => conf,
             Err(e) => {
                 error!("load tenant config failed: {:?}", e);
-                return Tenant::create_broken_tenant(conf, tenant_id);
+                return Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"));
             }
         };
 
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index 754316b3cd..8191880bb5 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -186,10 +186,20 @@ pub fn schedule_local_tenant_processing(
     let tenant = if conf.tenant_attaching_mark_file_path(&tenant_id).exists() {
         info!("tenant {tenant_id} has attaching mark file, resuming its attach operation");
         if let Some(remote_storage) = remote_storage {
-            Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx)
+            match Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx) {
+                Ok(tenant) => tenant,
+                Err(e) => {
+                    error!("Failed to spawn_attach tenant {tenant_id}, reason: {e:#}");
+                    Tenant::create_broken_tenant(conf, tenant_id, format!("{e:#}"))
+                }
+            }
         } else {
             warn!("tenant {tenant_id} has attaching mark file, but pageserver has no remote storage configured");
-            Tenant::create_broken_tenant(conf, tenant_id)
+            Tenant::create_broken_tenant(
+                conf,
+                tenant_id,
+                "attaching mark file present but no remote storage configured".to_string(),
+            )
         }
     } else {
         info!("tenant {tenant_id} is assumed to be loadable, starting load operation");
@@ -466,7 +476,8 @@ pub async fn attach_tenant(
             "Cannot attach tenant {tenant_id}, local tenant directory already exists"
         );
 
-        let tenant = Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx);
+        let tenant =
+            Tenant::spawn_attach(conf, tenant_id, remote_storage, ctx).context("spawn_attach")?;
         vacant_entry.insert(tenant);
         Ok(())
     })

From 1d266a6365565b30fc2d913bdf00490c8f51fe9e Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 11 May 2023 16:09:29 +0200
Subject: [PATCH 09/23] logical size calculation metrics: differentiate regular
 vs imitated (#4197)

I want this distinction so I can prove my assumption that the disk IO
peaks which we see every 24h on prod are due to eviction's imitate
synthetic size calculations.

refs https://github.com/neondatabase/neon/issues/4154
---
 pageserver/src/metrics.rs                       |  5 +++++
 pageserver/src/tenant/timeline.rs               | 12 +++++++++---
 pageserver/src/tenant/timeline/eviction_task.rs |  7 ++++++-
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index ec2f49c85a..542fd511e1 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -30,6 +30,7 @@ const STORAGE_TIME_OPERATIONS: &[&str] = &[
     "create images",
     "init logical size",
     "logical size",
+    "imitate logical size",
     "load layer map",
     "gc",
 ];
@@ -688,6 +689,7 @@ pub struct TimelineMetrics {
     pub compact_time_histo: StorageTimeMetrics,
     pub create_images_time_histo: StorageTimeMetrics,
     pub logical_size_histo: StorageTimeMetrics,
+    pub imitate_logical_size_histo: StorageTimeMetrics,
     pub load_layer_map_histo: StorageTimeMetrics,
     pub garbage_collect_histo: StorageTimeMetrics,
     pub last_record_gauge: IntGauge,
@@ -720,6 +722,8 @@ impl TimelineMetrics {
         let create_images_time_histo =
             StorageTimeMetrics::new("create images", &tenant_id, &timeline_id);
         let logical_size_histo = StorageTimeMetrics::new("logical size", &tenant_id, &timeline_id);
+        let imitate_logical_size_histo =
+            StorageTimeMetrics::new("imitate logical size", &tenant_id, &timeline_id);
         let load_layer_map_histo =
             StorageTimeMetrics::new("load layer map", &tenant_id, &timeline_id);
         let garbage_collect_histo = StorageTimeMetrics::new("gc", &tenant_id, &timeline_id);
@@ -756,6 +760,7 @@ impl TimelineMetrics {
             compact_time_histo,
             create_images_time_histo,
             logical_size_histo,
+            imitate_logical_size_histo,
             garbage_collect_histo,
             load_layer_map_histo,
             last_record_gauge,
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 90f2951aef..658b5d1289 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -49,7 +49,7 @@ use crate::tenant::{
 
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
+use crate::metrics::{StorageTimeMetrics, TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
@@ -1938,7 +1938,12 @@ impl Timeline {
             let cancel = cancel.child_token();
             let ctx = ctx.attached_child();
             self_calculation
-                .calculate_logical_size(lsn, cancel, &ctx)
+                .calculate_logical_size(
+                    lsn,
+                    &self_calculation.metrics.logical_size_histo,
+                    cancel,
+                    &ctx,
+                )
                 .await
         });
         let timeline_state_cancellation = async {
@@ -1993,6 +1998,7 @@ impl Timeline {
     pub async fn calculate_logical_size(
         &self,
         up_to_lsn: Lsn,
+        storage_time_metrics: &StorageTimeMetrics,
         cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
@@ -2026,7 +2032,7 @@ impl Timeline {
         if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
             return Ok(size);
         }
-        let timer = self.metrics.logical_size_histo.start_timer();
+        let timer = storage_time_metrics.start_timer();
         let logical_size = self
             .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
             .await?;
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 523a5f8fa7..eb04e7e579 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -335,7 +335,12 @@ impl Timeline {
 
         // imitiate on-restart initial logical size
         let size = self
-            .calculate_logical_size(lsn, cancel.clone(), ctx)
+            .calculate_logical_size(
+                lsn,
+                &self.metrics.imitate_logical_size_histo,
+                cancel.clone(),
+                ctx,
+            )
             .instrument(info_span!("calculate_logical_size"))
             .await;
 

From 1988cc55275912f42b65f8823658b420a05fa790 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 11 May 2023 17:53:42 +0300
Subject: [PATCH 10/23] Fix `failpoint_sleep_millis_async` without `use
 std::time::Duration` (#4195)

I tried to use failpoint_sleep_millis_async(...) in a source file that
didn't do `use std::time::Duration`, and got a compiler error:

```
error[E0433]: failed to resolve: use of undeclared type `Duration`
   --> pageserver/src/walingest.rs:316:17
    |
316 |                 utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep");
    |                 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ not found in this scope
    |
    = note: this error originates in the macro `utils::failpoint_sleep_millis_async` (in Nightly builds, run with -Z macro-backtrace for more info)
help: consider importing one of these items
    |
24  | use chrono::Duration;
    |
24  | use core::time::Duration;
    |
24  | use humantime::Duration;
    |
24  | use serde_with::__private__::Duration;
    |
      and 2 other candidates
```
---
 libs/utils/src/lib.rs | 53 +++++++++++++++++++++++++++++--------------
 1 file changed, 36 insertions(+), 17 deletions(-)

diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs
index 0615d50c1a..4e4f79ab6b 100644
--- a/libs/utils/src/lib.rs
+++ b/libs/utils/src/lib.rs
@@ -60,24 +60,43 @@ pub mod tracing_span_assert;
 
 pub mod rate_limit;
 
-/// use with fail::cfg("$name", "return(2000)")
-#[macro_export]
-macro_rules! failpoint_sleep_millis_async {
-    ($name:literal) => {{
-        let should_sleep: Option<std::time::Duration> = (|| {
-            fail::fail_point!($name, |v: Option<_>| {
-                let millis = v.unwrap().parse::<u64>().unwrap();
-                Some(Duration::from_millis(millis))
-            });
-            None
-        })();
-        if let Some(d) = should_sleep {
-            tracing::info!("failpoint {:?}: sleeping for {:?}", $name, d);
-            tokio::time::sleep(d).await;
-            tracing::info!("failpoint {:?}: sleep done", $name);
-        }
-    }};
+mod failpoint_macro_helpers {
+
+    /// use with fail::cfg("$name", "return(2000)")
+    ///
+    /// The effect is similar to a "sleep(2000)" action, i.e. we sleep for the
+    /// specified time (in milliseconds). The main difference is that we use async
+    /// tokio sleep function. Another difference is that we print lines to the log,
+    /// which can be useful in tests to check that the failpoint was hit.
+    #[macro_export]
+    macro_rules! failpoint_sleep_millis_async {
+        ($name:literal) => {{
+            // If the failpoint is used with a "return" action, set should_sleep to the
+            // returned value (as string). Otherwise it's set to None.
+            let should_sleep = (|| {
+                ::fail::fail_point!($name, |x| x);
+                ::std::option::Option::None
+            })();
+
+            // Sleep if the action was a returned value
+            if let ::std::option::Option::Some(duration_str) = should_sleep {
+                $crate::failpoint_sleep_helper($name, duration_str).await
+            }
+        }};
+    }
+
+    // Helper function used by the macro. (A function has nicer scoping so we
+    // don't need to decorate everything with "::")
+    pub async fn failpoint_sleep_helper(name: &'static str, duration_str: String) {
+        let millis = duration_str.parse::<u64>().unwrap();
+        let d = std::time::Duration::from_millis(millis);
+
+        tracing::info!("failpoint {:?}: sleeping for {:?}", name, d);
+        tokio::time::sleep(d).await;
+        tracing::info!("failpoint {:?}: sleep done", name);
+    }
 }
+pub use failpoint_macro_helpers::failpoint_sleep_helper;
 
 /// This is a shortcut to embed git sha into binaries and avoid copying the same build script to all packages
 ///

From 845e2965628a931e9caf8b7233cac3d4b834a6c8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Thu, 11 May 2023 17:02:19 +0200
Subject: [PATCH 11/23] eviction: add global histogram for iteration durations
 (#4212)

I would like to know whether and by how much the eviction iterations
spike in the $period-sized window that happens every $threshold , when
all the timelines do the imitate accesses.

refs https://github.com/neondatabase/neon/issues/4154
---
 pageserver/src/metrics.rs                       | 10 ++++++++++
 pageserver/src/tenant/timeline/eviction_task.rs |  7 +++++++
 2 files changed, 17 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 542fd511e1..c04f6e054b 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -187,6 +187,16 @@ static PERSISTENT_BYTES_WRITTEN: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+pub(crate) static EVICTION_ITERATION_DURATION: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "pageserver_eviction_iteration_duration_seconds_global",
+        "Time spent on a single eviction iteration",
+        &["period_secs", "threshold_secs"],
+        STORAGE_OP_BUCKETS.into(),
+    )
+    .expect("failed to define a metric")
+});
+
 static EVICTIONS: Lazy<IntCounterVec> = Lazy::new(|| {
     register_int_counter_vec!(
         "pageserver_evictions",
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index eb04e7e579..5ea3d5b14d 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -120,6 +120,13 @@ impl Timeline {
                 }
                 let elapsed = start.elapsed();
                 crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
+                crate::metrics::EVICTION_ITERATION_DURATION
+                    .get_metric_with_label_values(&[
+                        &format!("{}", p.period.as_secs()),
+                        &format!("{}", p.threshold.as_secs()),
+                    ])
+                    .unwrap()
+                    .observe(elapsed.as_secs_f64());
                 ControlFlow::Continue(start + p.period)
             }
         }

From ecfe4757d348ca78649271623ef3efa464207123 Mon Sep 17 00:00:00 2001
From: Rahul Modpur <rmodpur2@gmail.com>
Date: Wed, 10 May 2023 10:34:23 +0530
Subject: [PATCH 12/23] fix bogus at character context in log messages

Signed-off-by: Rahul Modpur <rmodpur2@gmail.com>
---
 pgxn/neon/pagestore_client.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/pagestore_client.h b/pgxn/neon/pagestore_client.h
index 22f5cdb73a..8257b90ac3 100644
--- a/pgxn/neon/pagestore_client.h
+++ b/pgxn/neon/pagestore_client.h
@@ -52,7 +52,7 @@ typedef struct
 #define NEON_TAG "[NEON_SMGR] "
 #define neon_log(tag, fmt, ...) ereport(tag,                                  \
 										(errmsg(NEON_TAG fmt, ##__VA_ARGS__), \
-										 errhidestmt(true), errhidecontext(true), internalerrposition(0)))
+										 errhidestmt(true), errhidecontext(true), errposition(0), internalerrposition(0)))
 
 /*
  * supertype of all the Neon*Request structs below

From 586923429065297cdd31b59d15cfb97065f21b4a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 12 May 2023 15:36:30 +0200
Subject: [PATCH 13/23] logical size calculation: spawn with in_current_span
 (#4196)

While investigating https://github.com/neondatabase/neon/issues/4154 I
found that the `Calculating logical size for timeline` tracing events
created from within the logical size computation code are not always
attributable to the background task that caused it.

My goal is to be able to distinguish in the logs whether a `Calculating
logical size for timeline` was logged as part of a real synthetic size
calculation VS an imitation by the eviction task.

I want this distinction so I can prove my assumption that the disk IO
peaks which we see every 24h on prod are due to eviction's imitate
synthetic size calculations.

The alternative here, which I would have preferred, but is more work:
link RequestContext's into a child->parent list and dump this list when
we log `Calculating logical size for timeline`.

I would have preferred that over what we have in this PR because,
technically, the ondemand logical size computation can outlive the
caller that spawned it. This is against the idea of correctly nested
spans.

I guess in OpenTelemetry land, the correct modelling would be a link
between the caller's span and the task_mgr task's span.

Anyways, I think the case where we hang up on the spawned ondemand
logical size calculation is quite rare. So, I'm willing to tolerate
incorrectly nested spans for these edge-cases.

refs https://github.com/neondatabase/neon/issues/4154
---
 pageserver/src/consumption_metrics.rs |  3 +-
 pageserver/src/http/routes.rs         | 41 ++++++++++++++++-----------
 pageserver/src/pgdatadir_mapping.rs   |  2 ++
 pageserver/src/tenant.rs              |  1 +
 pageserver/src/tenant/size.rs         | 17 ++++++-----
 pageserver/src/tenant/timeline.rs     |  9 ++++--
 6 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 8916d4f1c9..23cfc55b65 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -164,7 +164,8 @@ pub async fn collect_metrics_iteration(
                     timeline_written_size,
                 ));
 
-                match timeline.get_current_logical_size(ctx) {
+                let span = info_span!("collect_metrics_iteration", tenant_id = %timeline.tenant_id, timeline_id = %timeline.timeline_id);
+                match span.in_scope(|| timeline.get_current_logical_size(ctx)) {
                     // Only send timeline logical size when it is fully calculated.
                     Ok((size, is_exact)) if is_exact => {
                         current_metrics.push((
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 19d8243b37..3c87e6f79d 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -169,6 +169,8 @@ async fn build_timeline_info(
     include_non_incremental_logical_size: bool,
     ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
+    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
     let mut info = build_timeline_info_common(timeline, ctx)?;
     if include_non_incremental_logical_size {
         // XXX we should be using spawn_ondemand_logical_size_calculation here.
@@ -191,6 +193,7 @@ fn build_timeline_info_common(
     timeline: &Arc<Timeline>,
     ctx: &RequestContext,
 ) -> anyhow::Result<TimelineInfo> {
+    crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
     let last_record_lsn = timeline.get_last_record_lsn();
     let (wal_source_connstr, last_received_msg_lsn, last_received_msg_ts) = {
         let guard = timeline.last_received_wal.lock().unwrap();
@@ -263,25 +266,28 @@ async fn timeline_create_handler(mut request: Request<Body>) -> Result<Response<
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Error);
 
-    let tenant = mgr::get_tenant(tenant_id, true).await?;
-    match tenant.create_timeline(
-        new_timeline_id,
-        request_data.ancestor_timeline_id.map(TimelineId::from),
-        request_data.ancestor_start_lsn,
-        request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
-        &ctx,
-    )
-    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
-    .await {
-        Ok(Some(new_timeline)) => {
-            // Created. Construct a TimelineInfo for it.
-            let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
-                .map_err(ApiError::InternalServerError)?;
-            json_response(StatusCode::CREATED, timeline_info)
+    async {
+        let tenant = mgr::get_tenant(tenant_id, true).await?;
+        match tenant.create_timeline(
+            new_timeline_id,
+            request_data.ancestor_timeline_id.map(TimelineId::from),
+            request_data.ancestor_start_lsn,
+            request_data.pg_version.unwrap_or(crate::DEFAULT_PG_VERSION),
+            &ctx,
+        )
+        .await {
+            Ok(Some(new_timeline)) => {
+                // Created. Construct a TimelineInfo for it.
+                let timeline_info = build_timeline_info_common(&new_timeline, &ctx)
+                    .map_err(ApiError::InternalServerError)?;
+                json_response(StatusCode::CREATED, timeline_info)
+            }
+            Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
+            Err(err) => Err(ApiError::InternalServerError(err)),
         }
-        Ok(None) => json_response(StatusCode::CONFLICT, ()), // timeline already exists
-        Err(err) => Err(ApiError::InternalServerError(err)),
     }
+    .instrument(info_span!("timeline_create", tenant = %tenant_id, new_timeline = ?request_data.new_timeline_id, timeline_id = %new_timeline_id, lsn=?request_data.ancestor_start_lsn, pg_version=?request_data.pg_version))
+    .await
 }
 
 async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
@@ -303,6 +309,7 @@ async fn timeline_list_handler(request: Request<Body>) -> Result<Response<Body>,
                 include_non_incremental_logical_size.unwrap_or(false),
                 &ctx,
             )
+            .instrument(info_span!("build_timeline_info", timeline_id = %timeline.timeline_id))
             .await
             .context("Failed to convert tenant timeline {timeline_id} into the local one: {e:?}")
             .map_err(ApiError::InternalServerError)?;
diff --git a/pageserver/src/pgdatadir_mapping.rs b/pageserver/src/pgdatadir_mapping.rs
index 6f9035305d..67f37ee519 100644
--- a/pageserver/src/pgdatadir_mapping.rs
+++ b/pageserver/src/pgdatadir_mapping.rs
@@ -500,6 +500,8 @@ impl Timeline {
         cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
+        crate::tenant::debug_assert_current_span_has_tenant_and_timeline_id();
+
         // Fetch list of database dirs and iterate them
         let buf = self.get(DBDIR_KEY, lsn, ctx).await.context("read dbdir")?;
         let dbdir = DbDirectory::des(&buf).context("deserialize db directory")?;
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index f97afe9cc9..c9d5a54f0b 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -97,6 +97,7 @@ mod timeline;
 
 pub mod size;
 
+pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
 pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline};
 
 // re-export this function so that page_cache.rs can use it.
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 77275f96bd..9b2e5dc4f1 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -373,13 +373,16 @@ async fn fill_logical_sizes(
                 let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
                 let parallel_size_calcs = Arc::clone(limit);
                 let ctx = ctx.attached_child();
-                joinset.spawn(calculate_logical_size(
-                    parallel_size_calcs,
-                    timeline,
-                    lsn,
-                    ctx,
-                    cancel.child_token(),
-                ));
+                joinset.spawn(
+                    calculate_logical_size(
+                        parallel_size_calcs,
+                        timeline,
+                        lsn,
+                        ctx,
+                        cancel.child_token(),
+                    )
+                    .in_current_span(),
+                );
             }
             e.insert(cached_size);
         }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 658b5d1289..17b1d55c1f 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1886,7 +1886,7 @@ impl Timeline {
                 // so that we prevent future callers from spawning this task
                 permit.forget();
                 Ok(())
-            },
+            }.in_current_span(),
         );
     }
 
@@ -1919,18 +1919,21 @@ impl Timeline {
                     .await;
                 let _ = sender.send(res).ok();
                 Ok(()) // Receiver is responsible for handling errors
-            },
+            }
+            .in_current_span(),
         );
         receiver
     }
 
-    #[instrument(skip_all, fields(tenant = %self.tenant_id, timeline = %self.timeline_id))]
+    #[instrument(skip_all)]
     async fn logical_size_calculation_task(
         self: &Arc<Self>,
         lsn: Lsn,
         ctx: &RequestContext,
         cancel: CancellationToken,
     ) -> Result<u64, CalculateLogicalSizeError> {
+        debug_assert_current_span_has_tenant_and_timeline_id();
+
         let mut timeline_state_updates = self.subscribe_for_state_updates();
         let self_calculation = Arc::clone(self);
 

From bb06d281ea8cdcde69461e5be31e05efe511e345 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 12 May 2023 15:28:51 +0100
Subject: [PATCH 14/23] Run regressions tests on both Postgres 14 and 15
 (#4192)

This PR adds tests runs on Postgres 15 and created unified Allure report
with results for all tests.

- Split `.github/actions/allure-report` into
`.github/actions/allure-report-store` and
`.github/actions/allure-report-generate`
- Add debug or release pytest parameter for all tests (depending on
`BUILD_TYPE` env variable)
- Add Postgres version as a pytest parameter for all tests (depending on
`DEFAULT_PG_VERSION` env variable)
- Fix `test_wal_restore` and `restore_from_wal.sh` to support path with
`[`/`]` in it (fixed by applying spellcheck to the script and fixing all
warnings), `restore_from_wal_archive.sh` is deleted as unused.
- All known failures on Postgres 15 marked with xfail
---
 .../actions/allure-report-generate/action.yml | 184 +++++++++++++
 .../actions/allure-report-store/action.yml    |  72 +++++
 .github/actions/allure-report/action.yml      | 254 ------------------
 .../actions/run-python-test-set/action.yml    |  13 +-
 .github/workflows/benchmarking.yml            |  27 +-
 .github/workflows/build_and_test.yml          |  68 ++---
 libs/utils/scripts/restore_from_wal.sh        |  28 +-
 .../utils/scripts/restore_from_wal_archive.sh |  20 --
 scripts/pr-comment-test-report.js             | 158 +++++++----
 test_runner/conftest.py                       |   1 +
 test_runner/fixtures/allure.py                |  25 ++
 test_runner/fixtures/pg_version.py            |  14 +
 test_runner/regress/test_compatibility.py     |  27 +-
 test_runner/regress/test_hot_standby.py       |   2 +
 test_runner/regress/test_pg_regress.py        |   2 +
 test_runner/regress/test_tenant_size.py       |   2 +
 test_runner/regress/test_wal_restore.py       |  20 +-
 17 files changed, 488 insertions(+), 429 deletions(-)
 create mode 100644 .github/actions/allure-report-generate/action.yml
 create mode 100644 .github/actions/allure-report-store/action.yml
 delete mode 100644 .github/actions/allure-report/action.yml
 delete mode 100755 libs/utils/scripts/restore_from_wal_archive.sh
 create mode 100644 test_runner/fixtures/allure.py

diff --git a/.github/actions/allure-report-generate/action.yml b/.github/actions/allure-report-generate/action.yml
new file mode 100644
index 0000000000..07120c4c8a
--- /dev/null
+++ b/.github/actions/allure-report-generate/action.yml
@@ -0,0 +1,184 @@
+name: 'Create Allure report'
+description: 'Generate Allure report from uploaded by actions/allure-report-store tests results'
+
+outputs:
+  report-url:
+    description: 'Allure report URL'
+    value: ${{ steps.generate-report.outputs.report-url }}
+  report-json-url:
+    description: 'Allure report JSON URL'
+    value: ${{ steps.generate-report.outputs.report-json-url }}
+
+runs:
+  using: "composite"
+
+  steps:
+    # We're using some of env variables quite offen, so let's set them once.
+    #
+    # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
+    #
+    # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
+    # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
+    #
+    - name: Set variables
+      shell: bash -euxo pipefail {0}
+      run: |
+        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
+        if [ "${PR_NUMBER}" != "null" ]; then
+          BRANCH_OR_PR=pr-${PR_NUMBER}
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+          # Shortcut for special branches
+          BRANCH_OR_PR=${GITHUB_REF_NAME}
+        else
+          BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
+        fi
+
+        LOCK_FILE=reports/${BRANCH_OR_PR}/lock.txt
+
+        WORKDIR=/tmp/${BRANCH_OR_PR}-$(date +%s)
+        mkdir -p ${WORKDIR}
+
+        echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
+        echo "LOCK_FILE=${LOCK_FILE}"       >> $GITHUB_ENV
+        echo "WORKDIR=${WORKDIR}"           >> $GITHUB_ENV
+        echo "BUCKET=${BUCKET}"             >> $GITHUB_ENV
+      env:
+        BUCKET: neon-github-public-dev
+
+    # TODO: We can replace with a special docker image with Java and Allure pre-installed
+    - uses: actions/setup-java@v3
+      with:
+        distribution: 'temurin'
+        java-version: '17'
+
+    - name: Install Allure
+      shell: bash -euxo pipefail {0}
+      run: |
+        if ! which allure; then
+          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
+          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
+          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
+          unzip -q ${ALLURE_ZIP}
+          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
+          rm -f ${ALLURE_ZIP}
+        fi
+      env:
+        ALLURE_VERSION: 2.22.0
+        ALLURE_ZIP_MD5: d5c9f0989b896482536956340a7d5ec9
+
+    # Potentially we could have several running build for the same key (for example, for the main branch), so we use improvised lock for this
+    - name: Acquire lock
+      shell: bash -euxo pipefail {0}
+      run: |
+        LOCK_TIMEOUT=300 # seconds
+
+        LOCK_CONTENT="${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}"
+        echo ${LOCK_CONTENT} > ${WORKDIR}/lock.txt
+
+        # Do it up to 5 times to avoid race condition
+        for _ in $(seq 1 5); do
+          for i in $(seq 1 ${LOCK_TIMEOUT}); do
+            LOCK_ACQUIRED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
+            # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
+            if [ -z "${LOCK_ACQUIRED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ACQUIRED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
+              break
+            fi
+            sleep 1
+          done
+
+          aws s3 mv --only-show-errors ${WORKDIR}/lock.txt "s3://${BUCKET}/${LOCK_FILE}"
+
+          # Double-check that exactly THIS run has acquired the lock
+          aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
+          if [ "$(cat lock.txt)" = "${LOCK_CONTENT}" ]; then
+            break
+          fi
+        done
+
+    - name: Generate and publish final Allure report
+      id: generate-report
+      shell: bash -euxo pipefail {0}
+      run: |
+        REPORT_PREFIX=reports/${BRANCH_OR_PR}
+        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
+
+        # Get previously uploaded data for this run
+        ZSTD_NBTHREADS=0
+
+        S3_FILEPATHS=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/ | jq --raw-output  '.Contents[].Key')
+        if [ -z "$S3_FILEPATHS" ]; then
+          # There's no previously uploaded data for this $GITHUB_RUN_ID
+          exit 0
+        fi
+        for S3_FILEPATH in ${S3_FILEPATHS}; do
+          time aws s3 cp --only-show-errors "s3://${BUCKET}/${S3_FILEPATH}" "${WORKDIR}"
+
+          archive=${WORKDIR}/$(basename $S3_FILEPATH)
+          mkdir -p ${archive%.tar.zst}
+          time tar -xf ${archive} -C ${archive%.tar.zst}
+          rm -f ${archive}
+        done
+
+        # Get history trend
+        time aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${WORKDIR}/latest/history" || true
+
+        # Generate report
+        time allure generate --clean --output ${WORKDIR}/report ${WORKDIR}/*
+
+        # Replace a logo link with a redirect to the latest version of the report
+        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html?nocache='"'+Date.now()+'"'" class=|g' ${WORKDIR}/report/app.js
+
+        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
+        time aws s3 mv --recursive --only-show-errors "${WORKDIR}/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
+
+        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
+
+        # Generate redirect
+        cat <<EOF > ${WORKDIR}/index.html
+          <!DOCTYPE html>
+
+          <meta charset="utf-8">
+          <title>Redirecting to ${REPORT_URL}</title>
+          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
+        EOF
+        time aws s3 cp --only-show-errors ${WORKDIR}/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
+
+        echo "report-url=${REPORT_URL}"                                   >> $GITHUB_OUTPUT
+        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
+
+    - name: Release lock
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
+
+        if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}" ]; then
+          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
+        fi
+
+    - name: Cleanup
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        if [ -d "${WORKDIR}" ]; then
+          rm -rf ${WORKDIR}
+        fi
+
+    - uses: actions/github-script@v6
+      if: always()
+      env:
+        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+      with:
+        script: |
+          const { REPORT_URL, COMMIT_SHA } = process.env
+
+          await github.rest.repos.createCommitStatus({
+            owner: context.repo.owner,
+            repo: context.repo.repo,
+            sha: `${COMMIT_SHA}`,
+            state: 'success',
+            target_url: `${REPORT_URL}`,
+            context: 'Allure report',
+          })
diff --git a/.github/actions/allure-report-store/action.yml b/.github/actions/allure-report-store/action.yml
new file mode 100644
index 0000000000..7ae9937d42
--- /dev/null
+++ b/.github/actions/allure-report-store/action.yml
@@ -0,0 +1,72 @@
+name: 'Store Allure results'
+description: 'Upload test results to be used by actions/allure-report-generate'
+
+inputs:
+  report-dir:
+    description: 'directory with test results generated by tests'
+    required: true
+  unique-key:
+    description: 'string to distinguish different results in the same run'
+    required: true
+
+runs:
+  using: "composite"
+
+  steps:
+    - name: Set variables
+      shell: bash -euxo pipefail {0}
+      run: |
+        PR_NUMBER=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
+        if [ "${PR_NUMBER}" != "null" ]; then
+          BRANCH_OR_PR=pr-${PR_NUMBER}
+        elif [ "${GITHUB_REF_NAME}" = "main" ] || [ "${GITHUB_REF_NAME}" = "release" ]; then
+          # Shortcut for special branches
+          BRANCH_OR_PR=${GITHUB_REF_NAME}
+        else
+          BRANCH_OR_PR=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
+        fi
+
+        echo "BRANCH_OR_PR=${BRANCH_OR_PR}" >> $GITHUB_ENV
+        echo "REPORT_DIR=${REPORT_DIR}"     >> $GITHUB_ENV
+      env:
+        REPORT_DIR: ${{ inputs.report-dir }}
+
+    - name: Upload test results
+      shell: bash -euxo pipefail {0}
+      run: |
+        REPORT_PREFIX=reports/${BRANCH_OR_PR}
+        RAW_PREFIX=reports-raw/${BRANCH_OR_PR}/${GITHUB_RUN_ID}
+
+        # Add metadata
+        cat <<EOF > ${REPORT_DIR}/executor.json
+          {
+            "name": "GitHub Actions",
+            "type": "github",
+            "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
+            "buildOrder": ${GITHUB_RUN_ID},
+            "buildName": "GitHub Actions Run #${GITHUB_RUN_NUMBER}/${GITHUB_RUN_ATTEMPT}",
+            "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
+            "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
+            "reportName": "Allure Report"
+          }
+        EOF
+
+        cat <<EOF > ${REPORT_DIR}/environment.properties
+          COMMIT_SHA=${COMMIT_SHA}
+        EOF
+
+        ARCHIVE="${UNIQUE_KEY}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
+        ZSTD_NBTHREADS=0
+
+        time tar -C ${REPORT_DIR} -cf ${ARCHIVE} --zstd .
+        time aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
+      env:
+        UNIQUE_KEY: ${{ inputs.unique-key }}
+        COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+        BUCKET: neon-github-public-dev
+
+    - name: Cleanup
+      if: always()
+      shell: bash -euxo pipefail {0}
+      run: |
+        rm -rf ${REPORT_DIR}
diff --git a/.github/actions/allure-report/action.yml b/.github/actions/allure-report/action.yml
deleted file mode 100644
index 9a1037064a..0000000000
--- a/.github/actions/allure-report/action.yml
+++ /dev/null
@@ -1,254 +0,0 @@
-name: 'Create Allure report'
-description: 'Create and publish Allure report'
-
-inputs:
-  action:
-    desctiption: 'generate or store'
-    required: true
-  build_type:
-    description: '`build_type` from run-python-test-set action'
-    required: true
-  test_selection:
-    description: '`test_selector` from run-python-test-set action'
-    required: false
-outputs:
-  report-url:
-    description: 'Allure report URL'
-    value: ${{ steps.generate-report.outputs.report-url }}
-  report-json-url:
-    description: 'Allure report JSON URL'
-    value: ${{ steps.generate-report.outputs.report-json-url }}
-
-runs:
-  using: "composite"
-
-  steps:
-    # We're using some of env variables quite offen, so let's set them once.
-    #
-    # It would be nice to have them set in common runs.env[0] section, but it doesn't work[1]
-    #
-    # - [0] https://docs.github.com/en/actions/creating-actions/metadata-syntax-for-github-actions#runsenv
-    # - [1] https://github.com/neondatabase/neon/pull/3907#discussion_r1154703456
-    #
-    - name: Set common environment variables
-      shell: bash -euxo pipefail {0}
-      run: |
-        echo "BUILD_TYPE=${BUILD_TYPE}"   >> $GITHUB_ENV
-        echo "BUCKET=${BUCKET}"           >> $GITHUB_ENV
-        echo "TEST_OUTPUT=${TEST_OUTPUT}" >> $GITHUB_ENV
-      env:
-        BUILD_TYPE: ${{ inputs.build_type }}
-        BUCKET: neon-github-public-dev
-        TEST_OUTPUT: /tmp/test_output
-
-    - name: Validate input parameters
-      shell: bash -euxo pipefail {0}
-      run: |
-        if [ "${{ inputs.action }}" != "store" ] && [ "${{ inputs.action }}" != "generate" ]; then
-          echo >&2 "Unknown inputs.action type '${{ inputs.action }}'; allowed 'generate' or 'store' only"
-          exit 1
-        fi
-
-        if [ -z "${{ inputs.test_selection }}" ] && [ "${{ inputs.action }}" == "store" ]; then
-          echo >&2 "inputs.test_selection must be set for 'store' action"
-          exit 2
-        fi
-
-    - name: Calculate variables
-      id: calculate-vars
-      shell: bash -euxo pipefail {0}
-      run: |
-        # TODO: for manually triggered workflows (via workflow_dispatch) we need to have a separate key
-
-        pr_number=$(jq --raw-output .pull_request.number "$GITHUB_EVENT_PATH" || true)
-        if [ "${pr_number}" != "null" ]; then
-          key=pr-${pr_number}
-        elif [ "${GITHUB_REF_NAME}" = "main" ]; then
-          # Shortcut for a special branch
-          key=main
-        elif [ "${GITHUB_REF_NAME}" = "release" ]; then
-          # Shortcut for a special branch
-          key=release
-        else
-          key=branch-$(printf "${GITHUB_REF_NAME}" | tr -c "[:alnum:]._-" "-")
-        fi
-        echo "KEY=${key}" >> $GITHUB_OUTPUT
-
-        # Sanitize test selection to remove `/` and any other special characters
-        # Use printf instead of echo to avoid having `\n` at the end of the string
-        test_selection=$(printf "${{ inputs.test_selection }}" | tr -c "[:alnum:]._-" "-" )
-        echo "TEST_SELECTION=${test_selection}" >> $GITHUB_OUTPUT
-
-    - uses: actions/setup-java@v3
-      if: ${{ inputs.action == 'generate' }}
-      with:
-        distribution: 'temurin'
-        java-version: '17'
-
-    - name: Install Allure
-      if: ${{ inputs.action == 'generate' }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        if ! which allure; then
-          ALLURE_ZIP=allure-${ALLURE_VERSION}.zip
-          wget -q https://github.com/allure-framework/allure2/releases/download/${ALLURE_VERSION}/${ALLURE_ZIP}
-          echo "${ALLURE_ZIP_MD5}  ${ALLURE_ZIP}" | md5sum -c
-          unzip -q ${ALLURE_ZIP}
-          echo "$(pwd)/allure-${ALLURE_VERSION}/bin" >> $GITHUB_PATH
-          rm -f ${ALLURE_ZIP}
-        fi
-      env:
-        ALLURE_VERSION: 2.21.0
-        ALLURE_ZIP_MD5: c8db4dd8e2a7882583d569ed2c82879c
-
-    - name: Upload Allure results
-      if: ${{ inputs.action == 'store' }}
-      env:
-        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        # Add metadata
-        cat <<EOF > $TEST_OUTPUT/allure/results/executor.json
-          {
-            "name": "GitHub Actions",
-            "type": "github",
-            "url": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/latest/index.html",
-            "buildOrder": ${GITHUB_RUN_ID},
-            "buildName": "GitHub Actions Run #${{ github.run_number }}/${GITHUB_RUN_ATTEMPT}",
-            "buildUrl": "${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/actions/runs/${GITHUB_RUN_ID}/attempts/${GITHUB_RUN_ATTEMPT}",
-            "reportUrl": "https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html",
-            "reportName": "Allure Report"
-          }
-        EOF
-        cat <<EOF > $TEST_OUTPUT/allure/results/environment.properties
-          TEST_SELECTION=${{ inputs.test_selection }}
-          BUILD_TYPE=${BUILD_TYPE}
-        EOF
-
-        ARCHIVE="${GITHUB_RUN_ID}-${TEST_SELECTION}-${GITHUB_RUN_ATTEMPT}-$(date +%s).tar.zst"
-        ZSTD_NBTHREADS=0
-
-        tar -C ${TEST_OUTPUT}/allure/results -cf ${ARCHIVE} --zstd .
-        aws s3 mv --only-show-errors ${ARCHIVE} "s3://${BUCKET}/${RAW_PREFIX}/${ARCHIVE}"
-
-    # Potentially we could have several running build for the same key (for example for the main branch), so we use improvised lock for this
-    - name: Acquire Allure lock
-      if: ${{ inputs.action == 'generate' }}
-      shell: bash -euxo pipefail {0}
-      env:
-        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      run: |
-        LOCK_TIMEOUT=300 # seconds
-
-        for _ in $(seq 1 5); do
-          for i in $(seq 1 ${LOCK_TIMEOUT}); do
-            LOCK_ADDED=$(aws s3api head-object --bucket neon-github-public-dev --key ${LOCK_FILE} | jq --raw-output '.LastModified' || true)
-            # `date --date="..."` is supported only by gnu date (i.e. it doesn't work on BSD/macOS)
-            if [ -z "${LOCK_ADDED}" ] || [ "$(( $(date +%s) - $(date --date="${LOCK_ADDED}" +%s) ))" -gt "${LOCK_TIMEOUT}" ]; then
-              break
-            fi
-            sleep 1
-          done
-          echo "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" > lock.txt
-          aws s3 mv --only-show-errors lock.txt "s3://${BUCKET}/${LOCK_FILE}"
-
-          # A double-check that exactly WE have acquired the lock
-          aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt
-          if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
-            break
-          fi
-        done
-
-    - name: Generate and publish final Allure report
-      if: ${{ inputs.action == 'generate' }}
-      id: generate-report
-      env:
-        REPORT_PREFIX: reports/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-        RAW_PREFIX: reports-raw/${{ steps.calculate-vars.outputs.KEY }}/${{ inputs.build_type }}
-      shell: bash -euxo pipefail {0}
-      run: |
-        # Get previously uploaded data for this run
-        ZSTD_NBTHREADS=0
-
-        s3_filepaths=$(aws s3api list-objects-v2 --bucket ${BUCKET} --prefix ${RAW_PREFIX}/${GITHUB_RUN_ID}- | jq --raw-output  '.Contents[].Key')
-        if [ -z "$s3_filepaths" ]; then
-          # There's no previously uploaded data for this run
-          exit 0
-        fi
-        for s3_filepath in ${s3_filepaths}; do
-          aws s3 cp --only-show-errors "s3://${BUCKET}/${s3_filepath}" "${TEST_OUTPUT}/allure/"
-
-          archive=${TEST_OUTPUT}/allure/$(basename $s3_filepath)
-          mkdir -p ${archive%.tar.zst}
-          tar -xf ${archive} -C ${archive%.tar.zst}
-          rm -f ${archive}
-        done
-
-        # Get history trend
-        aws s3 cp --recursive --only-show-errors "s3://${BUCKET}/${REPORT_PREFIX}/latest/history" "${TEST_OUTPUT}/allure/latest/history" || true
-
-        # Generate report
-        allure generate --clean --output $TEST_OUTPUT/allure/report $TEST_OUTPUT/allure/*
-
-        # Replace a logo link with a redirect to the latest version of the report
-        sed -i 's|<a href="." class=|<a href="https://'${BUCKET}'.s3.amazonaws.com/'${REPORT_PREFIX}'/latest/index.html" class=|g' $TEST_OUTPUT/allure/report/app.js
-
-        # Upload a history and the final report (in this particular order to not to have duplicated history in 2 places)
-        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report/history" "s3://${BUCKET}/${REPORT_PREFIX}/latest/history"
-        aws s3 mv --recursive --only-show-errors "${TEST_OUTPUT}/allure/report" "s3://${BUCKET}/${REPORT_PREFIX}/${GITHUB_RUN_ID}"
-
-        REPORT_URL=https://${BUCKET}.s3.amazonaws.com/${REPORT_PREFIX}/${GITHUB_RUN_ID}/index.html
-
-        # Generate redirect
-        cat <<EOF > ${TEST_OUTPUT}/allure/index.html
-          <!DOCTYPE html>
-
-          <meta charset="utf-8">
-          <title>Redirecting to ${REPORT_URL}</title>
-          <meta http-equiv="refresh" content="0; URL=${REPORT_URL}">
-        EOF
-        aws s3 cp --only-show-errors ${TEST_OUTPUT}/allure/index.html "s3://${BUCKET}/${REPORT_PREFIX}/latest/index.html"
-
-        echo "[Allure Report](${REPORT_URL})" >> ${GITHUB_STEP_SUMMARY}
-        echo "report-url=${REPORT_URL}" >> $GITHUB_OUTPUT
-        echo "report-json-url=${REPORT_URL%/index.html}/data/suites.json" >> $GITHUB_OUTPUT
-
-    - name: Release Allure lock
-      if: ${{ inputs.action == 'generate' && always() }}
-      shell: bash -euxo pipefail {0}
-      env:
-        LOCK_FILE: reports/${{ steps.calculate-vars.outputs.KEY }}/lock.txt
-        TEST_SELECTION: ${{ steps.calculate-vars.outputs.TEST_SELECTION }}
-      run: |
-        aws s3 cp --only-show-errors "s3://${BUCKET}/${LOCK_FILE}" ./lock.txt || exit 0
-
-        if [ "$(cat lock.txt)" = "${GITHUB_RUN_ID}-${GITHUB_RUN_ATTEMPT}-${TEST_SELECTION}" ]; then
-          aws s3 rm "s3://${BUCKET}/${LOCK_FILE}"
-        fi
-
-    - name: Cleanup
-      if: always()
-      shell: bash -euxo pipefail {0}
-      run: |
-        rm -rf ${TEST_OUTPUT}/allure
-
-    - uses: actions/github-script@v6
-      if: ${{ inputs.action == 'generate' && always() }}
-      env:
-        REPORT_URL: ${{ steps.generate-report.outputs.report-url }}
-        SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-      with:
-        script: |
-          const { REPORT_URL, BUILD_TYPE, SHA } = process.env
-
-          await github.rest.repos.createCommitStatus({
-            owner: context.repo.owner,
-            repo: context.repo.repo,
-            sha: `${SHA}`,
-            state: 'success',
-            target_url: `${REPORT_URL}`,
-            context: `Allure report / ${BUILD_TYPE}`,
-          })
diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 115f555913..1dd294d17a 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -197,14 +197,13 @@ runs:
       uses: ./.github/actions/upload
       with:
         name: compatibility-snapshot-${{ inputs.build_type }}-pg14-${{ github.run_id }}
-        # The path includes a test name (test_create_snapshot) and directory that the test creates (compatibility_snapshot_pg14), keep the path in sync with the test
-        path: /tmp/test_output/test_create_snapshot/compatibility_snapshot_pg14/
+        # Directory is created by test_compatibility.py::test_create_snapshot, keep the path in sync with the test
+        path: /tmp/test_output/compatibility_snapshot_pg14/
         prefix: latest
 
-    - name: Create Allure report
+    - name: Upload test results
       if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
+      uses: ./.github/actions/allure-report-store
       with:
-        action: store
-        build_type: ${{ inputs.build_type }}
-        test_selection: ${{ inputs.test_selection }}
+        report-dir: /tmp/test_output/allure/results
+        unique-key: ${{ inputs.test_selection }}-${{ inputs.build_type }}
diff --git a/.github/workflows/benchmarking.yml b/.github/workflows/benchmarking.yml
index a5a27e59a8..11363b2407 100644
--- a/.github/workflows/benchmarking.yml
+++ b/.github/workflows/benchmarking.yml
@@ -93,10 +93,7 @@ jobs:
 
     - name: Create Allure report
       if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -283,10 +280,7 @@ jobs:
 
     - name: Create Allure report
       if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -380,10 +374,7 @@ jobs:
 
     - name: Create Allure report
       if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -476,10 +467,7 @@ jobs:
 
     - name: Create Allure report
       if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
@@ -566,16 +554,13 @@ jobs:
 
     - name: Create Allure report
       if: ${{ !cancelled() }}
-      uses: ./.github/actions/allure-report
-      with:
-        action: generate
-        build_type: ${{ env.BUILD_TYPE }}
+      uses: ./.github/actions/allure-report-generate
 
     - name: Post to a Slack channel
       if: ${{ github.event.schedule && failure() }}
       uses: slackapi/slack-github-action@v1
       with:
         channel-id: "C033QLM5P7D" # dev-staging-stream
-        slack-message: "Periodic TPC-H perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
+        slack-message: "Periodic User example perf testing ${{ matrix.platform }}: ${{ job.status }}\n${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}"
       env:
         SLACK_BOT_TOKEN: ${{ secrets.SLACK_BOT_TOKEN }}
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index fd150f642b..5a09f0b4aa 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -330,6 +330,7 @@ jobs:
       fail-fast: false
       matrix:
         build_type: [ debug, release ]
+        pg_version: [ v14, v15 ]
     steps:
       - name: Checkout
         uses: actions/checkout@v3
@@ -350,11 +351,12 @@ jobs:
           real_s3_secret_access_key: "${{ secrets.AWS_SECRET_ACCESS_KEY_CI_TESTS_S3 }}"
           rerun_flaky: true
         env:
+          DEFAULT_PG_VERSION: ${{ matrix.pg_version }}
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
           CHECK_ONDISK_DATA_COMPATIBILITY: nonempty
 
       - name: Merge and upload coverage data
-        if: matrix.build_type == 'debug'
+        if: matrix.build_type == 'debug' && matrix.pg_version == 'v14'
         uses: ./.github/actions/save-coverage-data
 
   benchmarks:
@@ -399,21 +401,10 @@ jobs:
     steps:
       - uses: actions/checkout@v3
 
-      - name: Create Allure report (debug)
+      - name: Create Allure report
         if: ${{ !cancelled() }}
-        id: create-allure-report-debug
-        uses: ./.github/actions/allure-report
-        with:
-          action: generate
-          build_type: debug
-
-      - name: Create Allure report (release)
-        if: ${{ !cancelled() }}
-        id: create-allure-report-release
-        uses: ./.github/actions/allure-report
-        with:
-          action: generate
-          build_type: release
+        id: create-allure-report
+        uses: ./.github/actions/allure-report-generate
 
       - uses: actions/github-script@v6
         if: >
@@ -423,52 +414,37 @@ jobs:
           # Retry script for 5XX server errors: https://github.com/actions/github-script#retries
           retries: 5
           script: |
-            const reports = [{
-              buildType: "debug",
-              reportUrl: "${{ steps.create-allure-report-debug.outputs.report-url }}",
-              jsonUrl:   "${{ steps.create-allure-report-debug.outputs.report-json-url }}",
-            }, {
-              buildType: "release",
-              reportUrl: "${{ steps.create-allure-report-release.outputs.report-url }}",
-              jsonUrl:   "${{ steps.create-allure-report-release.outputs.report-json-url }}",
-            }]
+            const report = {
+              reportUrl:     "${{ steps.create-allure-report.outputs.report-url }}",
+              reportJsonUrl: "${{ steps.create-allure-report.outputs.report-json-url }}",
+            }
 
             const script = require("./scripts/pr-comment-test-report.js")
             await script({
               github,
               context,
               fetch,
-              reports,
+              report,
             })
 
       - name: Store Allure test stat in the DB
-        if: >
-          !cancelled() && (
-            steps.create-allure-report-debug.outputs.report-url ||
-            steps.create-allure-report-release.outputs.report-url
-          )
+        if: ${{ !cancelled() && steps.create-allure-report.outputs.report-json-url }}
         env:
-          SHA: ${{ github.event.pull_request.head.sha || github.sha }}
-          REPORT_JSON_URL_DEBUG: ${{ steps.create-allure-report-debug.outputs.report-json-url }}
-          REPORT_JSON_URL_RELEASE: ${{ steps.create-allure-report-release.outputs.report-json-url }}
+          COMMIT_SHA: ${{ github.event.pull_request.head.sha || github.sha }}
+          REPORT_JSON_URL: ${{ steps.create-allure-report.outputs.report-json-url }}
           TEST_RESULT_CONNSTR: ${{ secrets.REGRESS_TEST_RESULT_CONNSTR }}
         run: |
           ./scripts/pysync
 
-          for report_url in $REPORT_JSON_URL_DEBUG $REPORT_JSON_URL_RELEASE; do
-            if [ -z "$report_url" ]; then
-              continue
-            fi
+          curl --fail --output suites.json "${REPORT_JSON_URL}"
+          export BUILD_TYPE=unified
+          export DATABASE_URL="$TEST_RESULT_CONNSTR"
 
-            if [[ "$report_url" == "$REPORT_JSON_URL_DEBUG" ]]; then
-              BUILD_TYPE=debug
-            else
-              BUILD_TYPE=release
-            fi
-
-            curl --fail --output suites.json "${report_url}"
-            DATABASE_URL="$TEST_RESULT_CONNSTR" poetry run python3 scripts/ingest_regress_test_result.py --revision ${SHA} --reference ${GITHUB_REF} --build-type ${BUILD_TYPE} --ingest suites.json
-          done
+          poetry run python3 scripts/ingest_regress_test_result.py \
+            --revision ${COMMIT_SHA} \
+            --reference ${GITHUB_REF} \
+            --build-type ${BUILD_TYPE} \
+            --ingest suites.json
 
   coverage-report:
     runs-on: [ self-hosted, gen3, small ]
diff --git a/libs/utils/scripts/restore_from_wal.sh b/libs/utils/scripts/restore_from_wal.sh
index 9bd860affb..92cd164b7d 100755
--- a/libs/utils/scripts/restore_from_wal.sh
+++ b/libs/utils/scripts/restore_from_wal.sh
@@ -1,21 +1,21 @@
 #!/bin/bash
+
+set -euxo pipefail
+
 PG_BIN=$1
 WAL_PATH=$2
 DATA_DIR=$3
 PORT=$4
-SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
-rm -fr $DATA_DIR
-env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
-echo port=$PORT >> $DATA_DIR/postgresql.conf
-REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
+SYSID=$(od -A n -j 24 -N 8 -t d8 "$WAL_PATH"/000000010000000000000002* | cut -c 3-)
+rm -fr "$DATA_DIR"
+env -i LD_LIBRARY_PATH="$PG_BIN"/../lib "$PG_BIN"/initdb -E utf8 -U cloud_admin -D "$DATA_DIR" --sysid="$SYSID"
+echo port="$PORT" >> "$DATA_DIR"/postgresql.conf
+REDO_POS=0x$("$PG_BIN"/pg_controldata -D "$DATA_DIR" | grep -F "REDO location"| cut -c 42-)
 declare -i WAL_SIZE=$REDO_POS+114
-$PG_BIN/pg_ctl -D $DATA_DIR -l logfile start
-$PG_BIN/pg_ctl -D $DATA_DIR -l logfile stop -m immediate
-cp $DATA_DIR/pg_wal/000000010000000000000001 .
-cp $WAL_PATH/* $DATA_DIR/pg_wal/
-if [ -f $DATA_DIR/pg_wal/*.partial ]
-then
-	(cd $DATA_DIR/pg_wal ; for partial in \*.partial ; do mv $partial `basename $partial .partial` ; done)
-fi
-dd if=000000010000000000000001 of=$DATA_DIR/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile start
+"$PG_BIN"/pg_ctl -D "$DATA_DIR" -l logfile stop -m immediate
+cp "$DATA_DIR"/pg_wal/000000010000000000000001 .
+cp "$WAL_PATH"/* "$DATA_DIR"/pg_wal/
+for partial in "$DATA_DIR"/pg_wal/*.partial ; do mv "$partial" "${partial%.partial}" ; done
+dd if=000000010000000000000001 of="$DATA_DIR"/pg_wal/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
 rm -f 000000010000000000000001
diff --git a/libs/utils/scripts/restore_from_wal_archive.sh b/libs/utils/scripts/restore_from_wal_archive.sh
deleted file mode 100755
index ce58b349fc..0000000000
--- a/libs/utils/scripts/restore_from_wal_archive.sh
+++ /dev/null
@@ -1,20 +0,0 @@
-PG_BIN=$1
-WAL_PATH=$2
-DATA_DIR=$3
-PORT=$4
-SYSID=`od -A n -j 24 -N 8 -t d8 $WAL_PATH/000000010000000000000002* | cut -c 3-`
-rm -fr $DATA_DIR /tmp/pg_wals
-mkdir /tmp/pg_wals
-env -i LD_LIBRARY_PATH=$PG_BIN/../lib $PG_BIN/initdb -E utf8 -U cloud_admin -D $DATA_DIR --sysid=$SYSID
-echo port=$PORT >> $DATA_DIR/postgresql.conf
-REDO_POS=0x`$PG_BIN/pg_controldata -D $DATA_DIR | fgrep "REDO location"| cut -c 42-`
-declare -i WAL_SIZE=$REDO_POS+114
-cp $WAL_PATH/* /tmp/pg_wals
-if [ -f $DATA_DIR/pg_wal/*.partial ]
-then
-	(cd /tmp/pg_wals ; for partial in \*.partial ; do  mv $partial `basename $partial .partial` ; done)
-fi
-dd if=$DATA_DIR/pg_wal/000000010000000000000001 of=/tmp/pg_wals/000000010000000000000001 bs=$WAL_SIZE count=1 conv=notrunc
-echo > $DATA_DIR/recovery.signal
-rm -f $DATA_DIR/pg_wal/*
-echo "restore_command = 'cp /tmp/pg_wals/%f %p'" >> $DATA_DIR/postgresql.conf
diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js
index 134df432ae..7cb2a5494f 100644
--- a/scripts/pr-comment-test-report.js
+++ b/scripts/pr-comment-test-report.js
@@ -1,6 +1,5 @@
 //
 // The script parses Allure reports and posts a comment with a summary of the test results to the PR.
-// It accepts an array of items and creates a comment with a summary for each one (for "release" and "debug", together or separately if any of them failed to be generated).
 //
 // The comment is updated on each run with the latest results.
 //
@@ -13,19 +12,37 @@
 //         github,
 //         context,
 //         fetch,
-//         reports: [{...}, ...], // each report is expected to have "buildType", "reportUrl", and "jsonUrl" properties
+//         report: {
+//           reportUrl: "...",
+//           reportJsonUrl: "...",
+//         },
 //       })
 //
 
-module.exports = async ({ github, context, fetch, reports }) => {
+// Analog of Python's defaultdict.
+//
+// const dm = new DefaultMap(() => new DefaultMap(() => []))
+// dm["firstKey"]["secondKey"].push("value")
+//
+class DefaultMap extends Map {
+    constructor(getDefaultValue) {
+        return new Proxy({}, {
+            get: (target, name) => name in target ? target[name] : (target[name] = getDefaultValue(name))
+        })
+    }
+}
+
+module.exports = async ({ github, context, fetch, report }) => {
     // Marker to find the comment in the subsequent runs
     const startMarker = `<!--AUTOMATIC COMMENT START #${context.payload.number}-->`
+    // Let users know that the comment is updated automatically
+    const autoupdateNotice = `<div align="right"><sub>The comment gets automatically updated with the latest test results :recycle:</sub></div>`
     // GitHub bot id taken from (https://api.github.com/users/github-actions[bot])
     const githubActionsBotId = 41898282
     // The latest commit in the PR URL
     const commitUrl = `${context.serverUrl}/${context.repo.owner}/${context.repo.repo}/pull/${context.payload.number}/commits/${context.payload.pull_request.head.sha}`
     // Commend body itself
-    let commentBody = `${startMarker}\n### Test results for ${commitUrl}:\n___\n`
+    let commentBody = `${startMarker}\n`
 
     // Common parameters for GitHub API requests
     const ownerRepoParams = {
@@ -33,76 +50,109 @@ module.exports = async ({ github, context, fetch, reports }) => {
         repo: context.repo.repo,
     }
 
-    for (const report of reports) {
-        const {buildType, reportUrl, jsonUrl} = report
+    const {reportUrl, reportJsonUrl} = report
 
-        if (!reportUrl || !jsonUrl) {
-            commentBody += `#### ${buildType} build: no tests were run or test report is not available\n`
-            continue
-        }
+    if (!reportUrl || !reportJsonUrl) {
+        commentBody += `#### No tests were run or test report is not available\n`
+        commentBody += autoupdateNotice
+        return
+    }
 
-        const suites = await (await fetch(jsonUrl)).json()
+    const suites = await (await fetch(reportJsonUrl)).json()
 
-        // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
-        // For this report it's ok to treat them in the same way (as failed).
-        failedTests = []
-        passedTests = []
-        skippedTests = []
+    // Allure distinguishes "failed" (with an assertion error) and "broken" (with any other error) tests.
+    // For this report it's ok to treat them in the same way (as failed).
+    const failedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const passedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const skippedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const retriedTests = new DefaultMap(() => new DefaultMap(() => []))
+    const flakyTests = new DefaultMap(() => new DefaultMap(() => []))
 
-        retriedTests = []
-        retriedStatusChangedTests = []
+    let failedTestsCount = 0
+    let passedTestsCount = 0
+    let skippedTestsCount = 0
+    let flakyTestsCount = 0
 
-        for (const parentSuite of suites.children) {
-            for (const suite of parentSuite.children) {
-                for (const test of suite.children) {
-                    pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${test.name}`
-                    test.pytestName = pytestName
+    const pgVersions = new Set()
+    const buildTypes = new Set()
 
-                    if (test.status === "passed") {
-                        passedTests.push(test);
-                    } else if (test.status === "failed" || test.status === "broken") {
-                        failedTests.push(test);
-                    } else if (test.status === "skipped") {
-                        skippedTests.push(test);
-                    }
+    for (const parentSuite of suites.children) {
+        for (const suite of parentSuite.children) {
+            for (const test of suite.children) {
+                const {groups: {buildType, pgVersion}} = test.name.match(/[\[-](?<buildType>debug|release)-pg(?<pgVersion>\d+)[-\]]/)
 
-                    if (test.retriesCount > 0) {
-                        retriedTests.push(test);
+                pgVersions.add(pgVersion)
+                buildTypes.add(buildType)
 
-                        if (test.retriesStatusChange) {
-                            retriedStatusChangedTests.push(test);
-                        }
+                // Removing build type and PostgreSQL version from the test name to make it shorter
+                const testName = test.name.replace(new RegExp(`${buildType}-pg${pgVersion}-?`), "").replace("[]", "")
+                test.pytestName = `${parentSuite.name.replace(".", "/")}/${suite.name}.py::${testName}`
+
+                if (test.status === "passed") {
+                    passedTests[pgVersion][buildType].push(test)
+                    passedTestsCount += 1
+                } else if (test.status === "failed" || test.status === "broken") {
+                    failedTests[pgVersion][buildType].push(test)
+                    failedTestsCount += 1
+                } else if (test.status === "skipped") {
+                    skippedTests[pgVersion][buildType].push(test)
+                    skippedTestsCount += 1
+                }
+
+                if (test.retriesCount > 0) {
+                    retriedTests[pgVersion][buildType].push(test)
+
+                    if (test.retriesStatusChange) {
+                        flakyTests[pgVersion][buildType].push(test)
+                        flakyTestsCount += 1
                     }
                 }
             }
         }
+    }
 
-        const totalTestsCount = failedTests.length + passedTests.length + skippedTests.length
-        commentBody += `#### ${buildType} build: ${totalTestsCount} tests run: ${passedTests.length} passed, ${failedTests.length} failed, ${skippedTests.length} skipped ([full report](${reportUrl}))\n`
-        if (failedTests.length > 0) {
-            commentBody += `Failed tests:\n`
-            for (const test of failedTests) {
-                const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
+    const totalTestsCount = failedTestsCount + passedTestsCount + skippedTestsCount
+    commentBody += `### ${totalTestsCount} tests run: ${passedTestsCount} passed, ${failedTestsCount} failed, ${skippedTestsCount} skipped ([full report](${reportUrl}) for ${commitUrl})\n___\n`
 
-                commentBody += `- [\`${test.pytestName}\`](${allureLink})`
-                if (test.retriesCount > 0) {
-                    commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
+    // Print test resuls from the newest to the oldest PostgreSQL version for release and debug builds.
+    for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
+        for (const buildType of Array.from(buildTypes).sort().reverse()) {
+            if (failedTests[pgVersion][buildType].length > 0) {
+                commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n`
+                commentBody += `Failed tests:\n`
+                for (const test of failedTests[pgVersion][buildType]) {
+                    const allureLink = `${reportUrl}#suites/${test.parentUid}/${test.uid}`
+
+                    commentBody += `- [\`${test.pytestName}\`](${allureLink})`
+                    if (test.retriesCount > 0) {
+                        commentBody += ` (ran [${test.retriesCount + 1} times](${allureLink}/retries))`
+                    }
+                    commentBody += "\n"
                 }
                 commentBody += "\n"
             }
-            commentBody += "\n"
         }
-        if (retriedStatusChangedTests > 0) {
-            commentBody += `Flaky tests:\n`
-            for (const test of retriedStatusChangedTests) {
-                const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
-                commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
-            }
-            commentBody += "\n"
-        }
-        commentBody += "___\n"
     }
 
+    if (flakyTestsCount > 0) {
+        commentBody += "<details>\n<summary>Flaky tests</summary>\n\n"
+        for (const pgVersion of Array.from(pgVersions).sort().reverse()) {
+            for (const buildType of Array.from(buildTypes).sort().reverse()) {
+                if (flakyTests[pgVersion][buildType].length > 0) {
+                    commentBody += `#### PostgreSQL ${pgVersion} (${buildType} build)\n\n`
+                    for (const test of flakyTests[pgVersion][buildType]) {
+                        const status = test.status === "passed" ? ":white_check_mark:" : ":x:"
+                        commentBody += `- ${status} [\`${test.pytestName}\`](${reportUrl}#suites/${test.parentUid}/${test.uid}/retries)\n`
+                    }
+                    commentBody += "\n"
+                }
+            }
+        }
+        commentBody += "\n</details>\n"
+    }
+
+    commentBody += autoupdateNotice
+
     const { data: comments } = await github.rest.issues.listComments({
         issue_number: context.payload.number,
         ...ownerRepoParams,
diff --git a/test_runner/conftest.py b/test_runner/conftest.py
index 4640861936..4e649e111a 100644
--- a/test_runner/conftest.py
+++ b/test_runner/conftest.py
@@ -1,5 +1,6 @@
 pytest_plugins = (
     "fixtures.pg_version",
+    "fixtures.allure",
     "fixtures.neon_fixtures",
     "fixtures.benchmark_fixture",
     "fixtures.pg_stats",
diff --git a/test_runner/fixtures/allure.py b/test_runner/fixtures/allure.py
new file mode 100644
index 0000000000..6f40bd2aa2
--- /dev/null
+++ b/test_runner/fixtures/allure.py
@@ -0,0 +1,25 @@
+import os
+
+import pytest
+
+from fixtures.pg_version import DEFAULT_VERSION, PgVersion
+
+"""
+Set of utilities to make Allure report more informative.
+
+- It adds BUILD_TYPE and DEFAULT_PG_VERSION to the test names (only in test_runner/regress)
+to make tests distinguishable in Allure report.
+"""
+
+
+@pytest.fixture(scope="function", autouse=True)
+def allure_noop():
+    pass
+
+
+def pytest_generate_tests(metafunc):
+    if "test_runner/regress" in metafunc.definition._nodeid:
+        build_type = os.environ.get("BUILD_TYPE", "DEBUG").lower()
+        pg_version = PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION))
+
+        metafunc.parametrize("allure_noop", [f"{build_type}-pg{pg_version}"])
diff --git a/test_runner/fixtures/pg_version.py b/test_runner/fixtures/pg_version.py
index 904a274197..554f841d14 100644
--- a/test_runner/fixtures/pg_version.py
+++ b/test_runner/fixtures/pg_version.py
@@ -46,6 +46,20 @@ class PgVersion(str, enum.Enum):
 DEFAULT_VERSION: PgVersion = PgVersion.V14
 
 
+def skip_on_postgres(version: PgVersion, reason: str):
+    return pytest.mark.skipif(
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
+        reason=reason,
+    )
+
+
+def xfail_on_postgres(version: PgVersion, reason: str):
+    return pytest.mark.xfail(
+        PgVersion(os.environ.get("DEFAULT_PG_VERSION", DEFAULT_VERSION)) is version,
+        reason=reason,
+    )
+
+
 def pytest_addoption(parser: Parser):
     parser.addoption(
         "--pg-version",
diff --git a/test_runner/regress/test_compatibility.py b/test_runner/regress/test_compatibility.py
index ff61af8fd3..7bc12847b7 100644
--- a/test_runner/regress/test_compatibility.py
+++ b/test_runner/regress/test_compatibility.py
@@ -16,7 +16,7 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import wait_for_last_record_lsn, wait_for_upload
-from fixtures.pg_version import PgVersion
+from fixtures.pg_version import PgVersion, skip_on_postgres
 from fixtures.types import Lsn
 from pytest import FixtureRequest
 
@@ -41,12 +41,15 @@ check_ondisk_data_compatibility_if_enabled = pytest.mark.skipif(
 )
 
 
-# Note: if renaming this test, don't forget to update a reference to it in a workflow file:
-# "Upload compatibility snapshot" step in .github/actions/run-python-test-set/action.yml
-@check_ondisk_data_compatibility_if_enabled
+@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(before="test_forward_compatibility")
-def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_output_dir: Path):
+def test_create_snapshot(
+    neon_env_builder: NeonEnvBuilder,
+    pg_bin: PgBin,
+    top_output_dir: Path,
+    test_output_dir: Path,
+):
     # The test doesn't really test anything
     # it creates a new snapshot for releases after we tested the current version against the previous snapshot in `test_backward_compatibility`.
     #
@@ -86,10 +89,14 @@ def test_create_snapshot(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, test_o
         sk.stop()
     env.pageserver.stop()
 
-    shutil.copytree(test_output_dir, test_output_dir / "compatibility_snapshot_pg14")
-    # Directory `test_output_dir / "compatibility_snapshot_pg14"` is uploaded to S3 in a workflow, keep the name in sync with it
+    # Directory `compatibility_snapshot_dir` is uploaded to S3 in a workflow, keep the name in sync with it
+    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"
+    if compatibility_snapshot_dir.exists():
+        shutil.rmtree(compatibility_snapshot_dir)
+    shutil.copytree(test_output_dir, compatibility_snapshot_dir)
 
 
+@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
@@ -148,11 +155,13 @@ def test_backward_compatibility(
     ), "Breaking changes are allowed by ALLOW_BACKWARD_COMPATIBILITY_BREAKAGE, but the test has passed without any breakage"
 
 
+@skip_on_postgres(PgVersion.V15, "Compatibility tests doesn't support Postgres 15 yet")
 @check_ondisk_data_compatibility_if_enabled
 @pytest.mark.xdist_group("compatibility")
 @pytest.mark.order(after="test_create_snapshot")
 def test_forward_compatibility(
     test_output_dir: Path,
+    top_output_dir: Path,
     port_distributor: PortDistributor,
     pg_version: PgVersion,
     request: FixtureRequest,
@@ -174,9 +183,7 @@ def test_forward_compatibility(
     ), "COMPATIBILITY_POSTGRES_DISTRIB_DIR is not set. It should be set to a pg_install directrory (ideally generated by the previous version of Neon)"
     compatibility_postgres_distrib_dir = Path(compatibility_postgres_distrib_dir_env).resolve()
 
-    compatibility_snapshot_dir = (
-        test_output_dir.parent / "test_create_snapshot" / "compatibility_snapshot_pg14"
-    )
+    compatibility_snapshot_dir = top_output_dir / "compatibility_snapshot_pg14"
 
     breaking_changes_allowed = (
         os.environ.get("ALLOW_FORWARD_COMPATIBILITY_BREAKAGE", "false").lower() == "true"
diff --git a/test_runner/regress/test_hot_standby.py b/test_runner/regress/test_hot_standby.py
index 12e034cea2..582ac1b17e 100644
--- a/test_runner/regress/test_hot_standby.py
+++ b/test_runner/regress/test_hot_standby.py
@@ -1,7 +1,9 @@
 import pytest
 from fixtures.neon_fixtures import NeonEnv
+from fixtures.pg_version import PgVersion, xfail_on_postgres
 
 
+@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4182")
 @pytest.mark.timeout(1800)
 def test_hot_standby(neon_simple_env: NeonEnv):
     env = neon_simple_env
diff --git a/test_runner/regress/test_pg_regress.py b/test_runner/regress/test_pg_regress.py
index 64625ea4ee..a1d2a56d8a 100644
--- a/test_runner/regress/test_pg_regress.py
+++ b/test_runner/regress/test_pg_regress.py
@@ -5,6 +5,7 @@ from pathlib import Path
 
 import pytest
 from fixtures.neon_fixtures import NeonEnv, check_restored_datadir_content
+from fixtures.pg_version import PgVersion, xfail_on_postgres
 
 
 # Run the main PostgreSQL regression tests, in src/test/regress.
@@ -71,6 +72,7 @@ def test_pg_regress(
 #
 # This runs for a long time, especially in debug mode, so use a larger-than-default
 # timeout.
+@xfail_on_postgres(PgVersion.V15, reason="https://github.com/neondatabase/neon/pull/4213")
 @pytest.mark.timeout(1800)
 def test_isolation(
     neon_simple_env: NeonEnv,
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index 2d905910f8..60ab268882 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -11,6 +11,7 @@ from fixtures.neon_fixtures import (
     wait_for_wal_insert_lsn,
 )
 from fixtures.pageserver.http import PageserverHttpClient
+from fixtures.pg_version import PgVersion, xfail_on_postgres
 from fixtures.types import Lsn, TenantId, TimelineId
 
 
@@ -512,6 +513,7 @@ def test_single_branch_get_tenant_size_grows(
     assert size_after == prev, "size after restarting pageserver should not have changed"
 
 
+@xfail_on_postgres(PgVersion.V15, reason="Test significantly more flaky on Postgres 15")
 def test_get_tenant_size_with_multiple_branches(
     neon_env_builder: NeonEnvBuilder, test_output_dir: Path
 ):
diff --git a/test_runner/regress/test_wal_restore.py b/test_runner/regress/test_wal_restore.py
index dd944af7eb..f3d3a84c20 100644
--- a/test_runner/regress/test_wal_restore.py
+++ b/test_runner/regress/test_wal_restore.py
@@ -1,14 +1,20 @@
+import sys
 from pathlib import Path
 
+import pytest
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
     PgBin,
     PortDistributor,
     VanillaPostgres,
 )
-from fixtures.types import TenantId
+from fixtures.types import TenantId, TimelineId
 
 
+@pytest.mark.skipif(
+    sys.platform != "linux",
+    reason="restore_from_wal.sh supports only Linux",
+)
 def test_wal_restore(
     neon_env_builder: NeonEnvBuilder,
     pg_bin: PgBin,
@@ -22,6 +28,7 @@ def test_wal_restore(
     endpoint = env.endpoints.create_start("test_wal_restore")
     endpoint.safe_psql("create table t as select generate_series(1,300000)")
     tenant_id = TenantId(endpoint.safe_psql("show neon.tenant_id")[0][0])
+    timeline_id = TimelineId(endpoint.safe_psql("show neon.timeline_id")[0][0])
     env.neon_cli.pageserver_stop()
     port = port_distributor.get_port()
     data_dir = test_output_dir / "pgsql.restored"
@@ -30,9 +37,16 @@ def test_wal_restore(
     ) as restored:
         pg_bin.run_capture(
             [
-                str(base_dir / "libs/utils/scripts/restore_from_wal.sh"),
+                str(base_dir / "libs" / "utils" / "scripts" / "restore_from_wal.sh"),
                 str(pg_distrib_dir / f"v{env.pg_version}/bin"),
-                str(test_output_dir / "repo" / "safekeepers" / "sk1" / str(tenant_id) / "*"),
+                str(
+                    test_output_dir
+                    / "repo"
+                    / "safekeepers"
+                    / "sk1"
+                    / str(tenant_id)
+                    / str(timeline_id)
+                ),
                 str(data_dir),
                 str(port),
             ]

From a2a9c598beff888c3b097b28bbe06b87c947ff26 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 12 May 2023 18:00:06 +0200
Subject: [PATCH 15/23] add counter metric that increases whenever a background
 loop overruns its period (#4223)

We already have the warn!() log line for this condition. This PR adds a
corresponding metric on which we can have a dedicated alert. Cheaper and
more reliable than alerting on the logs, because, we run into log rate
limits from time to time these days.

refs https://github.com/neondatabase/neon/issues/4222
---
 pageserver/src/metrics.rs      | 12 ++++++++++++
 pageserver/src/tenant/tasks.rs |  4 ++++
 2 files changed, 16 insertions(+)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index c04f6e054b..bbd59c13bd 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -489,6 +489,15 @@ pub static TENANT_TASK_EVENTS: Lazy<IntCounterVec> = Lazy::new(|| {
     .expect("Failed to register tenant_task_events metric")
 });
 
+pub static BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "pageserver_background_loop_period_overrun_count",
+        "Incremented whenever warn_when_period_overrun() logs a warning.",
+        &["task", "period"],
+    )
+    .expect("failed to define a metric")
+});
+
 // walreceiver metrics
 
 pub static WALRECEIVER_STARTED_CONNECTIONS: Lazy<IntCounter> = Lazy::new(|| {
@@ -1231,4 +1240,7 @@ pub fn preinitialize_metrics() {
     // Initialize it eagerly, so that our alert rule can distinguish absence of the metric from metric value 0.
     assert_eq!(UNEXPECTED_ONDEMAND_DOWNLOADS.get(), 0);
     UNEXPECTED_ONDEMAND_DOWNLOADS.reset();
+
+    // Same as above for this metric, but, it's a Vec-type metric for which we don't know all the labels.
+    BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT.reset();
 }
diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index 7e7dbd3c5c..6bf26f1da1 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -259,6 +259,7 @@ pub(crate) async fn random_init_delay(
     }
 }
 
+/// Attention: the `task` and `period` beocme labels of a pageserver-wide prometheus metric.
 pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
     // Duration::ZERO will happen because it's the "disable [bgtask]" value.
     if elapsed >= period && period != Duration::ZERO {
@@ -271,5 +272,8 @@ pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task
             task,
             "task iteration took longer than the configured period"
         );
+        crate::metrics::BACKGROUND_LOOP_PERIOD_OVERRUN_COUNT
+            .with_label_values(&[task, &format!("{}", period.as_secs())])
+            .inc();
     }
 }

From edcf4d61a473ca9b0399607c71006dc1c351e57b Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 12 May 2023 19:57:33 +0200
Subject: [PATCH 16/23] distinguish imitated from real size::gather_input calls
 in metrics (#4224)

Before this PR, the gather_inputs() calls made to imitate synthetic size
calculation accesses were accounted towards the real logical size
calculation metric.

This PR forces all callers to declare the cause for making logical size
calculations, making the decision which cause counts towards which
metric explicit.

This is follow-up to

```
commit 1d266a6365565b30fc2d913bdf00490c8f51fe9e
Author: Christian Schwarz <christian@neon.tech>
Date:   Thu May 11 16:09:29 2023 +0200

    logical size calculation metrics: differentiate regular vs imitated (#4197)
```

After merging this patch, I hope to be able to explain why we have ca
30x more "logical size" ops in prod than "imitate logical size" for any
given observation interval.

refs https://github.com/neondatabase/neon/issues/4154
---
 pageserver/src/consumption_metrics.rs         |  6 ++--
 pageserver/src/http/routes.rs                 |  8 +++--
 pageserver/src/tenant.rs                      | 14 ++++++--
 pageserver/src/tenant/size.rs                 | 18 ++++++++--
 pageserver/src/tenant/timeline.rs             | 33 +++++++++++++------
 .../src/tenant/timeline/eviction_task.rs      | 16 ++++++---
 6 files changed, 70 insertions(+), 25 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 23cfc55b65..ca7b9650e8 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -5,7 +5,7 @@
 //!
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::task_mgr::{self, TaskKind, BACKGROUND_RUNTIME};
-use crate::tenant::mgr;
+use crate::tenant::{mgr, LogicalSizeCalculationCause};
 use anyhow;
 use chrono::Utc;
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
@@ -335,7 +335,9 @@ pub async fn calculate_synthetic_size_worker(
 
                     if let Ok(tenant) = mgr::get_tenant(tenant_id, true).await
                     {
-                        if let Err(e) = tenant.calculate_synthetic_size(ctx).await {
+                        if let Err(e) = tenant.calculate_synthetic_size(
+                            LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize,
+                            ctx).await {
                             error!("failed to calculate synthetic size for tenant {}: {}", tenant_id, e);
                         }
                     }
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 3c87e6f79d..699014b574 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -25,7 +25,7 @@ use crate::tenant::config::TenantConfOpt;
 use crate::tenant::mgr::{TenantMapInsertError, TenantStateError};
 use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
-use crate::tenant::{PageReconstructError, Timeline};
+use crate::tenant::{LogicalSizeCalculationCause, PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
 use utils::{
     auth::JwtAuth,
@@ -534,7 +534,11 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
 
     // this can be long operation
     let inputs = tenant
-        .gather_size_inputs(retention_period, &ctx)
+        .gather_size_inputs(
+            retention_period,
+            LogicalSizeCalculationCause::TenantSizeHandler,
+            &ctx,
+        )
         .await
         .map_err(ApiError::InternalServerError)?;
 
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index c9d5a54f0b..1aabe20c57 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -98,7 +98,9 @@ mod timeline;
 pub mod size;
 
 pub(crate) use timeline::debug_assert_current_span_has_tenant_and_timeline_id;
-pub use timeline::{LocalLayerInfoForDiskUsageEviction, PageReconstructError, Timeline};
+pub use timeline::{
+    LocalLayerInfoForDiskUsageEviction, LogicalSizeCalculationCause, PageReconstructError, Timeline,
+};
 
 // re-export this function so that page_cache.rs can use it.
 pub use crate::tenant::ephemeral_file::writeback as writeback_ephemeral_file;
@@ -2642,6 +2644,7 @@ impl Tenant {
         // `max_retention_period` overrides the cutoff that is used to calculate the size
         // (only if it is shorter than the real cutoff).
         max_retention_period: Option<u64>,
+        cause: LogicalSizeCalculationCause,
         ctx: &RequestContext,
     ) -> anyhow::Result<size::ModelInputs> {
         let logical_sizes_at_once = self
@@ -2663,6 +2666,7 @@ impl Tenant {
             logical_sizes_at_once,
             max_retention_period,
             &mut shared_cache,
+            cause,
             ctx,
         )
         .await
@@ -2672,8 +2676,12 @@ impl Tenant {
     /// This is periodically called by background worker.
     /// result is cached in tenant struct
     #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
-    pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
-        let inputs = self.gather_size_inputs(None, ctx).await?;
+    pub async fn calculate_synthetic_size(
+        &self,
+        cause: LogicalSizeCalculationCause,
+        ctx: &RequestContext,
+    ) -> anyhow::Result<u64> {
+        let inputs = self.gather_size_inputs(None, cause, ctx).await?;
 
         let size = inputs.calculate()?;
 
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 9b2e5dc4f1..ffcbdc1f1d 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -11,7 +11,7 @@ use tokio_util::sync::CancellationToken;
 use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 
-use super::Tenant;
+use super::{LogicalSizeCalculationCause, Tenant};
 use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
@@ -126,6 +126,7 @@ pub(super) async fn gather_inputs(
     limit: &Arc<Semaphore>,
     max_retention_period: Option<u64>,
     logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
     ctx: &RequestContext,
 ) -> anyhow::Result<ModelInputs> {
     // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
@@ -318,7 +319,15 @@ pub(super) async fn gather_inputs(
 
     // We left the 'size' field empty in all of the Segments so far.
     // Now find logical sizes for all of the points that might need or benefit from them.
-    fill_logical_sizes(&timelines, &mut segments, limit, logical_size_cache, ctx).await?;
+    fill_logical_sizes(
+        &timelines,
+        &mut segments,
+        limit,
+        logical_size_cache,
+        cause,
+        ctx,
+    )
+    .await?;
 
     Ok(ModelInputs {
         segments,
@@ -336,6 +345,7 @@ async fn fill_logical_sizes(
     segments: &mut [SegmentMeta],
     limit: &Arc<Semaphore>,
     logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    cause: LogicalSizeCalculationCause,
     ctx: &RequestContext,
 ) -> anyhow::Result<()> {
     let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
@@ -378,6 +388,7 @@ async fn fill_logical_sizes(
                         parallel_size_calcs,
                         timeline,
                         lsn,
+                        cause,
                         ctx,
                         cancel.child_token(),
                     )
@@ -485,6 +496,7 @@ async fn calculate_logical_size(
     limit: Arc<tokio::sync::Semaphore>,
     timeline: Arc<crate::tenant::Timeline>,
     lsn: utils::lsn::Lsn,
+    cause: LogicalSizeCalculationCause,
     ctx: RequestContext,
     cancel: CancellationToken,
 ) -> Result<TimelineAtLsnSizeResult, RecvError> {
@@ -493,7 +505,7 @@ async fn calculate_logical_size(
         .expect("global semaphore should not had been closed");
 
     let size_res = timeline
-        .spawn_ondemand_logical_size_calculation(lsn, ctx, cancel)
+        .spawn_ondemand_logical_size_calculation(lsn, cause, ctx, cancel)
         .instrument(info_span!("spawn_ondemand_logical_size_calculation"))
         .await?;
     Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 17b1d55c1f..db5a4b97b8 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -49,7 +49,7 @@ use crate::tenant::{
 
 use crate::config::PageServerConf;
 use crate::keyspace::{KeyPartitioning, KeySpace};
-use crate::metrics::{StorageTimeMetrics, TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
+use crate::metrics::{TimelineMetrics, UNEXPECTED_ONDEMAND_DOWNLOADS};
 use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::pgdatadir_mapping::{is_rel_fsm_block_key, is_rel_vm_block_key};
 use crate::pgdatadir_mapping::{BlockNumber, CalculateLogicalSizeError};
@@ -437,6 +437,14 @@ impl std::fmt::Display for PageReconstructError {
     }
 }
 
+#[derive(Clone, Copy)]
+pub enum LogicalSizeCalculationCause {
+    Initial,
+    ConsumptionMetricsSyntheticSize,
+    EvictionTaskImitation,
+    TenantSizeHandler,
+}
+
 /// Public interface functions
 impl Timeline {
     /// Get the LSN where this branch was created
@@ -1839,7 +1847,7 @@ impl Timeline {
                 // to spawn_ondemand_logical_size_calculation.
                 let cancel = CancellationToken::new();
                 let calculated_size = match self_clone
-                    .logical_size_calculation_task(lsn, &background_ctx, cancel)
+                    .logical_size_calculation_task(lsn, LogicalSizeCalculationCause::Initial, &background_ctx, cancel)
                     .await
                 {
                     Ok(s) => s,
@@ -1893,6 +1901,7 @@ impl Timeline {
     pub fn spawn_ondemand_logical_size_calculation(
         self: &Arc<Self>,
         lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
         ctx: RequestContext,
         cancel: CancellationToken,
     ) -> oneshot::Receiver<Result<u64, CalculateLogicalSizeError>> {
@@ -1915,7 +1924,7 @@ impl Timeline {
             false,
             async move {
                 let res = self_clone
-                    .logical_size_calculation_task(lsn, &ctx, cancel)
+                    .logical_size_calculation_task(lsn, cause, &ctx, cancel)
                     .await;
                 let _ = sender.send(res).ok();
                 Ok(()) // Receiver is responsible for handling errors
@@ -1929,6 +1938,7 @@ impl Timeline {
     async fn logical_size_calculation_task(
         self: &Arc<Self>,
         lsn: Lsn,
+        cause: LogicalSizeCalculationCause,
         ctx: &RequestContext,
         cancel: CancellationToken,
     ) -> Result<u64, CalculateLogicalSizeError> {
@@ -1941,12 +1951,7 @@ impl Timeline {
             let cancel = cancel.child_token();
             let ctx = ctx.attached_child();
             self_calculation
-                .calculate_logical_size(
-                    lsn,
-                    &self_calculation.metrics.logical_size_histo,
-                    cancel,
-                    &ctx,
-                )
+                .calculate_logical_size(lsn, cause, cancel, &ctx)
                 .await
         });
         let timeline_state_cancellation = async {
@@ -2001,7 +2006,7 @@ impl Timeline {
     pub async fn calculate_logical_size(
         &self,
         up_to_lsn: Lsn,
-        storage_time_metrics: &StorageTimeMetrics,
+        cause: LogicalSizeCalculationCause,
         cancel: CancellationToken,
         ctx: &RequestContext,
     ) -> Result<u64, CalculateLogicalSizeError> {
@@ -2035,6 +2040,14 @@ impl Timeline {
         if let Some(size) = self.current_logical_size.initialized_size(up_to_lsn) {
             return Ok(size);
         }
+        let storage_time_metrics = match cause {
+            LogicalSizeCalculationCause::Initial
+            | LogicalSizeCalculationCause::ConsumptionMetricsSyntheticSize
+            | LogicalSizeCalculationCause::TenantSizeHandler => &self.metrics.logical_size_histo,
+            LogicalSizeCalculationCause::EvictionTaskImitation => {
+                &self.metrics.imitate_logical_size_histo
+            }
+        };
         let timer = storage_time_metrics.start_timer();
         let logical_size = self
             .get_current_logical_size_non_incremental(up_to_lsn, cancel, ctx)
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 5ea3d5b14d..558600692e 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -30,7 +30,7 @@ use crate::{
     tenant::{
         config::{EvictionPolicy, EvictionPolicyLayerAccessThreshold},
         storage_layer::PersistentLayer,
-        Tenant,
+        LogicalSizeCalculationCause, Tenant,
     },
 };
 
@@ -344,7 +344,7 @@ impl Timeline {
         let size = self
             .calculate_logical_size(
                 lsn,
-                &self.metrics.imitate_logical_size_histo,
+                LogicalSizeCalculationCause::EvictionTaskImitation,
                 cancel.clone(),
                 ctx,
             )
@@ -419,9 +419,15 @@ impl Timeline {
             .inner();
 
         let mut throwaway_cache = HashMap::new();
-        let gather =
-            crate::tenant::size::gather_inputs(tenant, limit, None, &mut throwaway_cache, ctx)
-                .instrument(info_span!("gather_inputs"));
+        let gather = crate::tenant::size::gather_inputs(
+            tenant,
+            limit,
+            None,
+            &mut throwaway_cache,
+            LogicalSizeCalculationCause::EvictionTaskImitation,
+            ctx,
+        )
+        .instrument(info_span!("gather_inputs"));
 
         tokio::select! {
             _ = cancel.cancelled() => {}

From 2855c73990faa5a7f611c6726f906c538f9f5a95 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Sat, 13 May 2023 10:44:11 +0300
Subject: [PATCH 17/23] Fix race condition after attaching tenant with
 branches. (#4170)

After tenant attach, there is a window where the child timeline is
loaded and accepts GetPage requests, but its parent is not. If a
GetPage request needs to traverse to the parent, it needs to wait for
the parent timeline to become active, or it might miss some records on
the parent timeline.

It's also possible that the parent timeline is active, but it hasn't
yet received all the WAL up to the branch point from the safekeeper.
This happens if a pageserver crashes soon after creating a timeline,
so that the WAL leading to the branch point has not yet been uploaded
to remote storage. After restart, the WAL will be re-streamed and
ingested from the safekeeper, but that takes a while. Because of that,
it's not enough to check that the parent timeline is active, we also
need to wait for the WAL to arrive on the parent timeline, just like
at the beginning of GetPage handling. We probably should change the
behavior at create_timeline so that a timeline can only be created
after all the WAL up to the branch point has been uploaded to remote
storage, but that's not currently the case and out of scope for this
PR (see github issue #4218).

@NanoBjorn encountered this while working on tenant migration. After
migrating a tenant with a parent and child branch, connecting to the
child branch failed with an error like:

```
FATAL:  "base/16385" is not a valid data directory
DETAIL:  File "base/16385/PG_VERSION" is missing.
```

This commit adds two tests that reproduce the bug, with slightly
different symptoms.
---
 libs/postgres_ffi/src/pg_constants.rs         |   5 +
 pageserver/src/http/routes.rs                 |   3 +
 pageserver/src/tenant.rs                      |  22 +-
 pageserver/src/tenant/timeline.rs             |  91 ++++++-
 pageserver/src/walingest.rs                   |   9 +
 test_runner/regress/test_tenant_relocation.py | 241 ++++++++++++++++++
 6 files changed, 364 insertions(+), 7 deletions(-)

diff --git a/libs/postgres_ffi/src/pg_constants.rs b/libs/postgres_ffi/src/pg_constants.rs
index 6bc89ed37e..9c39b46cc1 100644
--- a/libs/postgres_ffi/src/pg_constants.rs
+++ b/libs/postgres_ffi/src/pg_constants.rs
@@ -146,6 +146,10 @@ pub const XLH_UPDATE_OLD_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 pub const XLH_UPDATE_NEW_ALL_VISIBLE_CLEARED: u8 = (1 << 1) as u8;
 pub const XLH_DELETE_ALL_VISIBLE_CLEARED: u8 = (1 << 0) as u8;
 
+// From replication/message.h
+pub const XLOG_LOGICAL_MESSAGE: u8 = 0x00;
+
+// From rmgrlist.h
 pub const RM_XLOG_ID: u8 = 0;
 pub const RM_XACT_ID: u8 = 1;
 pub const RM_SMGR_ID: u8 = 2;
@@ -157,6 +161,7 @@ pub const RM_RELMAP_ID: u8 = 7;
 pub const RM_STANDBY_ID: u8 = 8;
 pub const RM_HEAP2_ID: u8 = 9;
 pub const RM_HEAP_ID: u8 = 10;
+pub const RM_LOGICALMSG_ID: u8 = 21;
 
 // from xlogreader.h
 pub const XLR_INFO_MASK: u8 = 0x0F;
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 699014b574..91f4fda5eb 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -105,6 +105,9 @@ impl From<PageReconstructError> for ApiError {
             PageReconstructError::Cancelled => {
                 ApiError::InternalServerError(anyhow::anyhow!("request was cancelled"))
             }
+            PageReconstructError::AncestorStopping(_) => {
+                ApiError::InternalServerError(anyhow::Error::new(pre))
+            }
             PageReconstructError::WalRedo(pre) => {
                 ApiError::InternalServerError(anyhow::Error::new(pre))
             }
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1aabe20c57..6851f3ca2c 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -3441,14 +3441,26 @@ mod tests {
             .gc_iteration(Some(TIMELINE_ID), 0x10, Duration::ZERO, &ctx)
             .await?;
 
+        // The branchpoints should contain all timelines, even ones marked
+        // as Broken.
+        {
+            let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
+            assert_eq!(branchpoints.len(), 1);
+            assert_eq!(branchpoints[0], Lsn(0x40));
+        }
+
+        // You can read the key from the child branch even though the parent is
+        // Broken, as long as you don't need to access data from the parent.
         assert_eq!(
-            newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await?,
-            TEST_IMG(&format!("foo at {}", Lsn(0x40)))
+            newtline.get(*TEST_KEY, Lsn(0x70), &ctx).await?,
+            TEST_IMG(&format!("foo at {}", Lsn(0x70)))
         );
 
-        let branchpoints = &tline.gc_info.read().unwrap().retain_lsns;
-        assert_eq!(branchpoints.len(), 1);
-        assert_eq!(branchpoints[0], Lsn(0x40));
+        // This needs to traverse to the parent, and fails.
+        let err = newtline.get(*TEST_KEY, Lsn(0x50), &ctx).await.unwrap_err();
+        assert!(err
+            .to_string()
+            .contains("will not become active. Current state: Broken"));
 
         Ok(())
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index db5a4b97b8..2543764eca 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -396,6 +396,9 @@ pub enum PageReconstructError {
     /// The operation was cancelled
     Cancelled,
 
+    /// The ancestor of this is being stopped
+    AncestorStopping(TimelineId),
+
     /// An error happened replaying WAL records
     #[error(transparent)]
     WalRedo(#[from] crate::walredo::WalRedoError),
@@ -414,6 +417,9 @@ impl std::fmt::Debug for PageReconstructError {
                 )
             }
             Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
             Self::WalRedo(err) => err.fmt(f),
         }
     }
@@ -432,6 +438,9 @@ impl std::fmt::Display for PageReconstructError {
                 )
             }
             Self::Cancelled => write!(f, "cancelled"),
+            Self::AncestorStopping(timeline_id) => {
+                write!(f, "ancestor timeline {timeline_id} is being stopped")
+            }
             Self::WalRedo(err) => err.fmt(f),
         }
     }
@@ -934,6 +943,31 @@ impl Timeline {
         self.state.subscribe()
     }
 
+    pub async fn wait_to_become_active(
+        &self,
+        _ctx: &RequestContext, /* Prepare for use by cancellation */
+    ) -> Result<(), TimelineState> {
+        let mut receiver = self.state.subscribe();
+        loop {
+            let current_state = *receiver.borrow_and_update();
+            match current_state {
+                TimelineState::Loading => {
+                    receiver
+                        .changed()
+                        .await
+                        .expect("holding a reference to self");
+                }
+                TimelineState::Active { .. } => {
+                    return Ok(());
+                }
+                TimelineState::Broken { .. } | TimelineState::Stopping => {
+                    // There's no chance the timeline can transition back into ::Active
+                    return Err(current_state);
+                }
+            }
+        }
+    }
+
     pub fn layer_map_info(&self, reset: LayerAccessStatsReset) -> LayerMapInfo {
         let layer_map = self.layers.read().unwrap();
         let mut in_memory_layers = Vec::with_capacity(layer_map.frozen_layers.len() + 1);
@@ -1853,12 +1887,25 @@ impl Timeline {
                     Ok(s) => s,
                     Err(CalculateLogicalSizeError::Cancelled) => {
                         // Don't make noise, this is a common task.
-                        // In the unlikely case that there ihs another call to this function, we'll retry
+                        // In the unlikely case that there is another call to this function, we'll retry
                         // because initial_logical_size is still None.
                         info!("initial size calculation cancelled, likely timeline delete / tenant detach");
                         return Ok(());
                     }
-                    x @ Err(_) => x.context("Failed to calculate logical size")?,
+                    Err(CalculateLogicalSizeError::Other(err)) => {
+                        if let Some(e @ PageReconstructError::AncestorStopping(_)) =
+                            err.root_cause().downcast_ref()
+                        {
+                            // This can happen if the timeline parent timeline switches to
+                            // Stopping state while we're still calculating the initial
+                            // timeline size for the child, for example if the tenant is
+                            // being detached or the pageserver is shut down. Like with
+                            // CalculateLogicalSizeError::Cancelled, don't make noise.
+                            info!("initial size calculation failed because the timeline or its ancestor is Stopping, likely because the tenant is being detached: {e:#}");
+                            return Ok(());
+                        }
+                        return Err(err.context("Failed to calculate logical size"));
+                    }
                 };
 
                 // we cannot query current_logical_size.current_size() to know the current
@@ -2240,6 +2287,46 @@ impl Timeline {
                     Ok(timeline) => timeline,
                     Err(e) => return Err(PageReconstructError::from(e)),
                 };
+
+                // It's possible that the ancestor timeline isn't active yet, or
+                // is active but hasn't yet caught up to the branch point. Wait
+                // for it.
+                //
+                // This cannot happen while the pageserver is running normally,
+                // because you cannot create a branch from a point that isn't
+                // present in the pageserver yet. However, we don't wait for the
+                // branch point to be uploaded to cloud storage before creating
+                // a branch. I.e., the branch LSN need not be remote consistent
+                // for the branching operation to succeed.
+                //
+                // Hence, if we try to load a tenant in such a state where
+                // 1. the existence of the branch was persisted (in IndexPart and/or locally)
+                // 2. but the ancestor state is behind branch_lsn because it was not yet persisted
+                // then we will need to wait for the ancestor timeline to
+                // re-stream WAL up to branch_lsn before we access it.
+                //
+                // How can a tenant get in such a state?
+                // - ungraceful pageserver process exit
+                // - detach+attach => this is a bug, https://github.com/neondatabase/neon/issues/4219
+                //
+                // NB: this could be avoided by requiring
+                //   branch_lsn >= remote_consistent_lsn
+                // during branch creation.
+                match ancestor.wait_to_become_active(ctx).await {
+                    Ok(()) => {}
+                    Err(state) if state == TimelineState::Stopping => {
+                        return Err(PageReconstructError::AncestorStopping(ancestor.timeline_id));
+                    }
+                    Err(state) => {
+                        return Err(PageReconstructError::Other(anyhow::anyhow!(
+                            "Timeline {} will not become active. Current state: {:?}",
+                            ancestor.timeline_id,
+                            &state,
+                        )));
+                    }
+                }
+                ancestor.wait_lsn(timeline.ancestor_lsn, ctx).await?;
+
                 timeline_owned = ancestor;
                 timeline = &*timeline_owned;
                 prev_lsn = Lsn(u64::MAX);
diff --git a/pageserver/src/walingest.rs b/pageserver/src/walingest.rs
index 63d568a342..4b8e6aa515 100644
--- a/pageserver/src/walingest.rs
+++ b/pageserver/src/walingest.rs
@@ -305,6 +305,15 @@ impl<'a> WalIngest<'a> {
                     self.checkpoint_modified = true;
                 }
             }
+        } else if decoded.xl_rmid == pg_constants::RM_LOGICALMSG_ID {
+            let info = decoded.xl_info & pg_constants::XLR_RMGR_INFO_MASK;
+            if info == pg_constants::XLOG_LOGICAL_MESSAGE {
+                // This is a convenient way to make the WAL ingestion pause at
+                // particular point in the WAL. For more fine-grained control,
+                // we could peek into the message and only pause if it contains
+                // a particular string, for example, but this is enough for now.
+                utils::failpoint_sleep_millis_async!("wal-ingest-logical-message-sleep");
+            }
         }
 
         // Iterate through all the blocks that the record modifies, and
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 180afd88cd..81ad256f4b 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -1,5 +1,7 @@
 import os
+import shutil
 import threading
+import time
 from contextlib import closing, contextmanager
 from pathlib import Path
 from typing import Any, Dict, Optional, Tuple
@@ -12,6 +14,8 @@ from fixtures.neon_fixtures import (
     NeonEnv,
     NeonEnvBuilder,
     PortDistributor,
+    RemoteStorageKind,
+    available_remote_storages,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
@@ -19,6 +23,7 @@ from fixtures.pageserver.utils import (
     tenant_exists,
     wait_for_last_record_lsn,
     wait_for_upload,
+    wait_for_upload_queue_empty,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
@@ -512,3 +517,239 @@ def test_tenant_relocation(
             if line.startswith("listen_pg_addr"):
                 lines[i] = f"listen_pg_addr = 'localhost:{env.pageserver.service_port.pg}'"
         (env.repo_dir / "config").write_text("\n".join(lines))
+
+
+# Simulate hard crash of pageserver and re-attach a tenant with a branch
+#
+# This exercises a race condition after tenant attach, where the
+# branch point on the ancestor timeline is greater than the ancestor's
+# last-record LSN. We had a bug where GetPage incorrectly followed the
+# timeline to the ancestor without waiting for the missing WAL to
+# arrive.
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_emergency_relocate_with_branches_slow_replay(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_emergency_relocate_with_branches_slow_replay",
+    )
+
+    env = neon_env_builder.init_start()
+    env.pageserver.is_testing_enabled_or_skip()
+    pageserver_http = env.pageserver.http_client()
+
+    # Prepare for the test:
+    #
+    # - Main branch, with a table and two inserts to it.
+    # - A logical replication message between the inserts, so that we can conveniently
+    #   pause the WAL ingestion between the two inserts.
+    # - Child branch, created after the inserts
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with main_endpoint.cursor() as cur:
+        cur.execute("CREATE TABLE test_reattach (t text)")
+        cur.execute("INSERT INTO test_reattach VALUES ('before pause')")
+
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+
+        cur.execute("INSERT INTO test_reattach VALUES ('after pause')")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+
+    main_endpoint.stop()
+    child_timeline_id = env.neon_cli.create_branch(
+        "child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn
+    )
+
+    # Wait for the index_part.json file of both branches to be uploaded to remote storage.
+    # This is a work around for issue https://github.com/neondatabase/neon/issues/3865.
+    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, tenant_id, child_timeline_id)
+
+    # Now kill the pageserver, remove the tenant directory, and restart. This simulates
+    # the scenario that a pageserver dies unexpectedly and cannot be recovered, so we relocate
+    # the tenant to a different pageserver. We reuse the same pageserver because it's
+    # simpler than initializing a new one from scratch, but the effect on the single tenant
+    # is the same.
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
+    env.pageserver.start()
+
+    # This fail point will pause the WAL ingestion on the main branch, after the
+    # the first insert
+    pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
+
+    # Attach and wait a few seconds to give it time to load the tenants, attach to the
+    # safekeepers, and to stream and ingest the WAL up to the pause-point.
+    before_attach_time = time.time()
+    pageserver_http.tenant_attach(tenant_id)
+    time.sleep(3)
+
+    # The wal ingestion on the main timeline should now be paused at the fail point.
+    # Run a query on the child branch. The GetPage requests for this should recurse to the
+    # parent timeline, and wait for the WAL to be ingested there. Otherwise it won't see
+    # the second insert.
+    child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
+    with child_endpoint.cursor() as cur:
+        cur.execute("SELECT * FROM test_reattach")
+        assert cur.fetchall() == [("before pause",), ("after pause",)]
+
+    # Sanity check that the failpoint was reached
+    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    assert time.time() - before_attach_time > 5
+
+    # Clean up
+    pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))
+
+
+# Simulate hard crash of pageserver and re-attach a tenant with a branch
+#
+# This exercises the same race condition after as
+# 'test_emergency_relocate_with_branches_slow_replay', but this test case
+# is closer to the original scenario where we originally found the
+# issue.
+#
+# In this scenario, the incorrect result to get-request leads to
+# *permanent damage* in the child timeline, because ingesting the WAL
+# on the child timeline depended on incorrect view of the parent. This
+# test reproduced one such case; the symptom was an error on the child, when
+# trying to connect to the child endpoint after re-attaching the tenant:
+#
+# FATAL: database "neondb" does not exist
+#
+# In the original case where we bumped into this, the error was slightly
+# different:
+#
+# FATAL:  "base/16385" is not a valid data directory
+# DETAIL:  File "base/16385/PG_VERSION" is missing.
+#
+### Detailed explanation of the original bug and why it lead to that error:
+#
+# The WAL on the main and the child branches look like this:
+#
+#    Main                                  Child
+# 1. CREATE DATABASE
+#           <child branch is created>
+# 2. CREATE TABLE AS SELECT ...            3. CREATE TABLE AS SELECT ...
+#
+# None of these WAL records have been flushed to disk or uploaded to remote
+# storage in the pageserver yet, when the tenant is detached.
+#
+# After detach and re-attach, a walreceiver is spawned on both timelines.
+# They will connect to the safekeepers and start ingesting the WAL
+# from their respective IndexParts' `disk_consistent_lsn` onward.
+#
+# The bug occurs if the child branch's walreceiver runs before the
+# main's.  It receives the SMGR_CREATE WAL record emitted by the
+# CREATE TABLE statement (3.), and applies it, without seeing the
+# effect of the CREATE DATABASE statement.
+#
+# To understand why that leads to a 'File "base/16385/PG_VERSION" is
+# missing' error, let's look at what the handlers for the WAL records
+# do:
+#
+# CREATE DATABASE WAL record is handled by ingest_xlog_dbase_create:
+#
+#    ingest_xlog_dbase_create:
+#     put_relmap_file()
+#       // NOTE 'true': It means that there is a relmapper and PG_VERSION file
+# 1:    let r = dbdir.dbdirs.insert((spcnode, dbnode), true);
+#
+#
+# CREATE TABLE emits an SMGR_CREATE record, which is handled by:
+#
+#    ingest_xlog_smgr_create:
+#      put_rel_creation:
+#        ...
+#        let mut rel_dir = if dbdir.dbdirs.get(&(rel.spcnode, rel.dbnode)).is_none() {
+# 2:         // Didn't exist. Update dbdir
+#            dbdir.dbdirs.insert((rel.spcnode, rel.dbnode), false);
+#            let buf = DbDirectory::ser(&dbdir)?;
+#            self.put(DBDIR_KEY, Value::Image(buf.into()));
+#
+#            // and create the RelDirectory
+#            RelDirectory::default()
+#        } else {
+# 3:         // reldir already exists, fetch it
+#            RelDirectory::des(&self.get(rel_dir_key, ctx).await?)?
+#        };
+#
+#
+# In the correct ordering, the SMGR_CREATE record is applied after the
+# CREATE DATABASE record. The CREATE DATABASE creates the entry in the
+# 'dbdir', with the 'true' flag that indicates that PG_VERSION exists
+# (1). The SMGR_CREATE handler calls put_rel_creation, which finds the
+# dbdir entry that the CREATE DATABASE record created, and takes the
+# "reldir already exists, fetch it" else-branch at the if statement (3).
+#
+# In the incorrect ordering, the child walreceiver applies the
+# SMGR_CREATE record without seeing the effects of the CREATE
+# DATABASE. In that case, put_rel_creation takes the "Didn't
+# exist. Update dbir" path (2), and inserts an entry in the
+# DbDirectory with 'false' to indicate there is no PG_VERSION file.
+#
+@pytest.mark.parametrize("remote_storage_kind", available_remote_storages())
+def test_emergency_relocate_with_branches_createdb(
+    neon_env_builder: NeonEnvBuilder,
+    remote_storage_kind: RemoteStorageKind,
+):
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_emergency_relocate_with_branches_createdb",
+    )
+
+    env = neon_env_builder.init_start()
+    pageserver_http = env.pageserver.http_client()
+
+    # create new nenant
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
+    with main_endpoint.cursor() as cur:
+        cur.execute("SELECT pg_logical_emit_message(false, 'neon-test', 'between inserts')")
+
+        cur.execute("CREATE DATABASE neondb")
+        current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
+    child_timeline_id = env.neon_cli.create_branch(
+        "child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn
+    )
+
+    with main_endpoint.cursor(dbname="neondb") as cur:
+        cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,100)")
+    main_endpoint.stop()
+
+    child_endpoint = env.endpoints.create_start("child", tenant_id=tenant_id)
+    with child_endpoint.cursor(dbname="neondb") as cur:
+        cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,200)")
+    child_endpoint.stop()
+
+    # Wait for the index_part.json file of both branches to be uploaded to remote storage.
+    # This is a work around for issue https://github.com/neondatabase/neon/issues/3865.
+    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
+    wait_for_upload_queue_empty(pageserver_http, tenant_id, child_timeline_id)
+
+    # Kill the pageserver, remove the tenant directory, and restart
+    env.pageserver.stop(immediate=True)
+    shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))
+    env.pageserver.start()
+
+    # Wait before ingesting the WAL for CREATE DATABASE on the main branch. The original
+    # bug reproduced easily even without this, as there is always some delay between
+    # loading the timeline and establishing the connection to the safekeeper to stream and
+    # ingest the WAL, but let's make this less dependent on accidental timing.
+    pageserver_http.configure_failpoints([("wal-ingest-logical-message-sleep", "return(5000)")])
+    before_attach_time = time.time()
+    pageserver_http.tenant_attach(tenant_id)
+
+    child_endpoint.start()
+    with child_endpoint.cursor(dbname="neondb") as cur:
+        assert query_scalar(cur, "SELECT count(*) FROM test_migrate_one") == 200
+
+    # Sanity check that the failpoint was reached
+    assert env.pageserver.log_contains('failpoint "wal-ingest-logical-message-sleep": sleep done')
+    assert time.time() - before_attach_time > 5
+
+    # Clean up
+    pageserver_http.configure_failpoints(("wal-ingest-logical-message-sleep", "off"))

From 9cd6f2ceebf6352699cbb03ac02a8fe8d04b6258 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Mon, 15 May 2023 10:08:44 +0300
Subject: [PATCH 18/23] Remove duplicated logic in creating TenantConfOpt
 (#4230)

## Describe your changes

Remove duplicated logic in creating TenantConfOpt in both TryFrom of
TenantConfigRequest and TenantCreateRequest
---
 pageserver/src/tenant/config.rs | 170 +++++++++++++++-----------------
 1 file changed, 79 insertions(+), 91 deletions(-)

diff --git a/pageserver/src/tenant/config.rs b/pageserver/src/tenant/config.rs
index 50ce942a09..04983e9d1a 100644
--- a/pageserver/src/tenant/config.rs
+++ b/pageserver/src/tenant/config.rs
@@ -292,77 +292,93 @@ fn bad_duration<'a>(field_name: &'static str, value: &'a str) -> impl 'a + Fn()
     move || format!("Cannot parse `{field_name}` duration {value:?}")
 }
 
-impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
-    type Error = anyhow::Error;
-
-    fn try_from(request_data: &TenantCreateRequest) -> Result<Self, Self::Error> {
+impl TenantConfOpt {
+    #[allow(clippy::too_many_arguments)]
+    fn from_request(
+        checkpoint_distance: Option<u64>,
+        checkpoint_timeout: &Option<String>,
+        compaction_target_size: Option<u64>,
+        compaction_period: &Option<String>,
+        compaction_threshold: Option<usize>,
+        gc_horizon: Option<u64>,
+        gc_period: &Option<String>,
+        image_creation_threshold: Option<usize>,
+        pitr_interval: &Option<String>,
+        walreceiver_connect_timeout: &Option<String>,
+        lagging_wal_timeout: &Option<String>,
+        max_lsn_wal_lag: Option<NonZeroU64>,
+        trace_read_requests: Option<bool>,
+        eviction_policy: &Option<serde_json::Value>,
+        min_resident_size_override: Option<u64>,
+        evictions_low_residence_duration_metric_threshold: &Option<String>,
+    ) -> Result<Self, anyhow::Error> {
         let mut tenant_conf = TenantConfOpt::default();
 
-        if let Some(gc_period) = &request_data.gc_period {
+        if let Some(gc_period) = &gc_period {
             tenant_conf.gc_period = Some(
                 humantime::parse_duration(gc_period)
                     .with_context(bad_duration("gc_period", gc_period))?,
             );
         }
-        tenant_conf.gc_horizon = request_data.gc_horizon;
-        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
+        tenant_conf.gc_horizon = gc_horizon;
+        tenant_conf.image_creation_threshold = image_creation_threshold;
 
-        if let Some(pitr_interval) = &request_data.pitr_interval {
+        if let Some(pitr_interval) = &pitr_interval {
             tenant_conf.pitr_interval = Some(
                 humantime::parse_duration(pitr_interval)
                     .with_context(bad_duration("pitr_interval", pitr_interval))?,
             );
         }
 
-        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
+        if let Some(walreceiver_connect_timeout) = &walreceiver_connect_timeout {
             tenant_conf.walreceiver_connect_timeout = Some(
                 humantime::parse_duration(walreceiver_connect_timeout).with_context(
                     bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
                 )?,
             );
         }
-        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
+        if let Some(lagging_wal_timeout) = &lagging_wal_timeout {
             tenant_conf.lagging_wal_timeout = Some(
                 humantime::parse_duration(lagging_wal_timeout)
                     .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
             );
         }
-        if let Some(max_lsn_wal_lag) = request_data.max_lsn_wal_lag {
+        if let Some(max_lsn_wal_lag) = max_lsn_wal_lag {
             tenant_conf.max_lsn_wal_lag = Some(max_lsn_wal_lag);
         }
-        if let Some(trace_read_requests) = request_data.trace_read_requests {
+        if let Some(trace_read_requests) = trace_read_requests {
             tenant_conf.trace_read_requests = Some(trace_read_requests);
         }
 
-        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
+        tenant_conf.checkpoint_distance = checkpoint_distance;
+        if let Some(checkpoint_timeout) = &checkpoint_timeout {
             tenant_conf.checkpoint_timeout = Some(
                 humantime::parse_duration(checkpoint_timeout)
                     .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
             );
         }
 
-        tenant_conf.compaction_target_size = request_data.compaction_target_size;
-        tenant_conf.compaction_threshold = request_data.compaction_threshold;
+        tenant_conf.compaction_target_size = compaction_target_size;
+        tenant_conf.compaction_threshold = compaction_threshold;
 
-        if let Some(compaction_period) = &request_data.compaction_period {
+        if let Some(compaction_period) = &compaction_period {
             tenant_conf.compaction_period = Some(
                 humantime::parse_duration(compaction_period)
                     .with_context(bad_duration("compaction_period", compaction_period))?,
             );
         }
 
-        if let Some(eviction_policy) = &request_data.eviction_policy {
+        if let Some(eviction_policy) = &eviction_policy {
             tenant_conf.eviction_policy = Some(
                 serde::Deserialize::deserialize(eviction_policy)
                     .context("parse field `eviction_policy`")?,
             );
         }
 
-        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
+        tenant_conf.min_resident_size_override = min_resident_size_override;
 
         if let Some(evictions_low_residence_duration_metric_threshold) =
-            &request_data.evictions_low_residence_duration_metric_threshold
+            &evictions_low_residence_duration_metric_threshold
         {
             tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
                 humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
@@ -377,81 +393,53 @@ impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
     }
 }
 
+impl TryFrom<&'_ TenantCreateRequest> for TenantConfOpt {
+    type Error = anyhow::Error;
+
+    fn try_from(request_data: &TenantCreateRequest) -> Result<Self, Self::Error> {
+        Self::from_request(
+            request_data.checkpoint_distance,
+            &request_data.checkpoint_timeout,
+            request_data.compaction_target_size,
+            &request_data.compaction_period,
+            request_data.compaction_threshold,
+            request_data.gc_horizon,
+            &request_data.gc_period,
+            request_data.image_creation_threshold,
+            &request_data.pitr_interval,
+            &request_data.walreceiver_connect_timeout,
+            &request_data.lagging_wal_timeout,
+            request_data.max_lsn_wal_lag,
+            request_data.trace_read_requests,
+            &request_data.eviction_policy,
+            request_data.min_resident_size_override,
+            &request_data.evictions_low_residence_duration_metric_threshold,
+        )
+    }
+}
+
 impl TryFrom<&'_ TenantConfigRequest> for TenantConfOpt {
     type Error = anyhow::Error;
 
     fn try_from(request_data: &TenantConfigRequest) -> Result<Self, Self::Error> {
-        let mut tenant_conf = TenantConfOpt::default();
-        if let Some(gc_period) = &request_data.gc_period {
-            tenant_conf.gc_period = Some(
-                humantime::parse_duration(gc_period)
-                    .with_context(bad_duration("gc_period", gc_period))?,
-            );
-        }
-        tenant_conf.gc_horizon = request_data.gc_horizon;
-        tenant_conf.image_creation_threshold = request_data.image_creation_threshold;
-
-        if let Some(pitr_interval) = &request_data.pitr_interval {
-            tenant_conf.pitr_interval = Some(
-                humantime::parse_duration(pitr_interval)
-                    .with_context(bad_duration("pitr_interval", pitr_interval))?,
-            );
-        }
-        if let Some(walreceiver_connect_timeout) = &request_data.walreceiver_connect_timeout {
-            tenant_conf.walreceiver_connect_timeout = Some(
-                humantime::parse_duration(walreceiver_connect_timeout).with_context(
-                    bad_duration("walreceiver_connect_timeout", walreceiver_connect_timeout),
-                )?,
-            );
-        }
-        if let Some(lagging_wal_timeout) = &request_data.lagging_wal_timeout {
-            tenant_conf.lagging_wal_timeout = Some(
-                humantime::parse_duration(lagging_wal_timeout)
-                    .with_context(bad_duration("lagging_wal_timeout", lagging_wal_timeout))?,
-            );
-        }
-        tenant_conf.max_lsn_wal_lag = request_data.max_lsn_wal_lag;
-        tenant_conf.trace_read_requests = request_data.trace_read_requests;
-
-        tenant_conf.checkpoint_distance = request_data.checkpoint_distance;
-        if let Some(checkpoint_timeout) = &request_data.checkpoint_timeout {
-            tenant_conf.checkpoint_timeout = Some(
-                humantime::parse_duration(checkpoint_timeout)
-                    .with_context(bad_duration("checkpoint_timeout", checkpoint_timeout))?,
-            );
-        }
-        tenant_conf.compaction_target_size = request_data.compaction_target_size;
-        tenant_conf.compaction_threshold = request_data.compaction_threshold;
-
-        if let Some(compaction_period) = &request_data.compaction_period {
-            tenant_conf.compaction_period = Some(
-                humantime::parse_duration(compaction_period)
-                    .with_context(bad_duration("compaction_period", compaction_period))?,
-            );
-        }
-
-        if let Some(eviction_policy) = &request_data.eviction_policy {
-            tenant_conf.eviction_policy = Some(
-                serde::Deserialize::deserialize(eviction_policy)
-                    .context("parse field `eviction_policy`")?,
-            );
-        }
-
-        tenant_conf.min_resident_size_override = request_data.min_resident_size_override;
-
-        if let Some(evictions_low_residence_duration_metric_threshold) =
-            &request_data.evictions_low_residence_duration_metric_threshold
-        {
-            tenant_conf.evictions_low_residence_duration_metric_threshold = Some(
-                humantime::parse_duration(evictions_low_residence_duration_metric_threshold)
-                    .with_context(bad_duration(
-                        "evictions_low_residence_duration_metric_threshold",
-                        evictions_low_residence_duration_metric_threshold,
-                    ))?,
-            );
-        }
-
-        Ok(tenant_conf)
+        Self::from_request(
+            request_data.checkpoint_distance,
+            &request_data.checkpoint_timeout,
+            request_data.compaction_target_size,
+            &request_data.compaction_period,
+            request_data.compaction_threshold,
+            request_data.gc_horizon,
+            &request_data.gc_period,
+            request_data.image_creation_threshold,
+            &request_data.pitr_interval,
+            &request_data.walreceiver_connect_timeout,
+            &request_data.lagging_wal_timeout,
+            request_data.max_lsn_wal_lag,
+            request_data.trace_read_requests,
+            &request_data.eviction_policy,
+            request_data.min_resident_size_override,
+            &request_data.evictions_low_residence_duration_metric_threshold,
+        )
     }
 }
 

From 4a76f2b8d6ec77b789345cf3e5d905462d7f5763 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Mon, 15 May 2023 14:16:43 +0300
Subject: [PATCH 19/23] upload new timeline index part json before 201 or on
 retry (#4204)

Await for upload to complete before returning 201 Created on
`branch_timeline` or when `bootstrap_timeline` happens. Should either of
those waits fail, then on the retried request await for uploads again.
This should work as expected assuming control-plane does not start to
use timeline creation as a wait_for_upload mechanism.

Fixes #3865, started from
https://github.com/neondatabase/neon/pull/3857/files#r1144468177

Co-authored-by: Heikki Linnakangas <heikki@neon.tech>
---
 pageserver/src/tenant.rs                      |  40 +++++-
 test_runner/fixtures/pageserver/http.py       |   5 +-
 .../regress/test_disk_usage_eviction.py       |   2 -
 test_runner/regress/test_remote_storage.py    | 126 ++++++++++++++----
 test_runner/regress/test_tenant_relocation.py |  23 +---
 5 files changed, 140 insertions(+), 56 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 6851f3ca2c..9ab0262407 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -1264,8 +1264,24 @@ impl Tenant {
             "Cannot create timelines on inactive tenant"
         );
 
-        if self.get_timeline(new_timeline_id, false).is_ok() {
+        if let Ok(existing) = self.get_timeline(new_timeline_id, false) {
             debug!("timeline {new_timeline_id} already exists");
+
+            if let Some(remote_client) = existing.remote_client.as_ref() {
+                // Wait for uploads to complete, so that when we return Ok, the timeline
+                // is known to be durable on remote storage. Just like we do at the end of
+                // this function, after we have created the timeline ourselves.
+                //
+                // We only really care that the initial version of `index_part.json` has
+                // been uploaded. That's enough to remember that the timeline
+                // exists. However, there is no function to wait specifically for that so
+                // we just wait for all in-progress uploads to finish.
+                remote_client
+                    .wait_completion()
+                    .await
+                    .context("wait for timeline uploads to complete")?;
+            }
+
             return Ok(None);
         }
 
@@ -1307,6 +1323,17 @@ impl Tenant {
             }
         };
 
+        if let Some(remote_client) = loaded_timeline.remote_client.as_ref() {
+            // Wait for the upload of the 'index_part.json` file to finish, so that when we return
+            // Ok, the timeline is durable in remote storage.
+            let kind = ancestor_timeline_id
+                .map(|_| "branched")
+                .unwrap_or("bootstrapped");
+            remote_client.wait_completion().await.with_context(|| {
+                format!("wait for {} timeline initial uploads to complete", kind)
+            })?;
+        }
+
         Ok(Some(loaded_timeline))
     }
 
@@ -2376,17 +2403,18 @@ impl Tenant {
             src_timeline.initdb_lsn,
             src_timeline.pg_version,
         );
-        let mut timelines = self.timelines.lock().unwrap();
-        let new_timeline = self
-            .prepare_timeline(
+
+        let new_timeline = {
+            let mut timelines = self.timelines.lock().unwrap();
+            self.prepare_timeline(
                 dst_id,
                 &metadata,
                 timeline_uninit_mark,
                 false,
                 Some(Arc::clone(src_timeline)),
             )?
-            .initialize_with_lock(ctx, &mut timelines, true, true)?;
-        drop(timelines);
+            .initialize_with_lock(ctx, &mut timelines, true, true)?
+        };
 
         // Root timeline gets its layers during creation and uploads them along with the metadata.
         // A branch timeline though, when created, can get no writes for some time, hence won't get any layers created.
diff --git a/test_runner/fixtures/pageserver/http.py b/test_runner/fixtures/pageserver/http.py
index 388e834b56..1ff057fae2 100644
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -272,6 +272,7 @@ class PageserverHttpClient(requests.Session):
         new_timeline_id: Optional[TimelineId] = None,
         ancestor_timeline_id: Optional[TimelineId] = None,
         ancestor_start_lsn: Optional[Lsn] = None,
+        **kwargs,
     ) -> Dict[Any, Any]:
         body: Dict[str, Any] = {
             "new_timeline_id": str(new_timeline_id) if new_timeline_id else None,
@@ -281,7 +282,9 @@ class PageserverHttpClient(requests.Session):
         if pg_version != PgVersion.NOT_SET:
             body["pg_version"] = int(pg_version)
 
-        res = self.post(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body)
+        res = self.post(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline", json=body, **kwargs
+        )
         self.verbose_error(res)
         if res.status_code == 409:
             raise Exception(f"could not create timeline: already exists for id {new_timeline_id}")
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 31c7ef2b17..e8ec657683 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -136,9 +136,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Ev
     env.pageserver.allowed_errors.append(r".* running disk usage based eviction due to pressure.*")
 
     # remove the initial tenant
-    ## why wait for upload queue? => https://github.com/neondatabase/neon/issues/3865
     assert env.initial_timeline
-    wait_for_upload_queue_empty(pageserver_http, env.initial_tenant, env.initial_timeline)
     pageserver_http.tenant_detach(env.initial_tenant)
     assert isinstance(env.remote_storage, LocalFsStorage)
     tenant_remote_storage = env.remote_storage.root / "tenants" / str(env.initial_tenant)
diff --git a/test_runner/regress/test_remote_storage.py b/test_runner/regress/test_remote_storage.py
index 6de5f7db04..cce9cdc175 100644
--- a/test_runner/regress/test_remote_storage.py
+++ b/test_runner/regress/test_remote_storage.py
@@ -2,11 +2,12 @@
 # env NEON_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
 
 import os
+import queue
 import shutil
 import threading
 import time
 from pathlib import Path
-from typing import Dict, List, Tuple
+from typing import Dict, List, Optional, Tuple
 
 import pytest
 from fixtures.log_helper import log
@@ -26,6 +27,7 @@ from fixtures.pageserver.utils import (
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import print_gc_result, query_scalar, wait_until
+from requests import ReadTimeout
 
 
 #
@@ -626,10 +628,7 @@ def test_empty_branch_remote_storage_upload(
 
     new_branch_name = "new_branch"
     new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant)
-
-    with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint:
-        wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id)
-    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)
 
     timelines_before_detach = set(
         map(
@@ -658,13 +657,19 @@ def test_empty_branch_remote_storage_upload(
     ), f"Expected to have same timelines after reattach, but got {timelines_after_detach}"
 
 
-# Branches off a root branch, but does not write anything to the new branch, so it has a metadata file only.
-# Ensures the branch is not on the remote storage and restarts the pageserver — the branch should be uploaded after the restart.
 @pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
 def test_empty_branch_remote_storage_upload_on_restart(
     neon_env_builder: NeonEnvBuilder,
     remote_storage_kind: RemoteStorageKind,
 ):
+    """
+    Branches off a root branch, but does not write anything to the new branch, so
+    it has a metadata file only.
+
+    Ensures the branch is not on the remote storage and restarts the pageserver
+    — the upload should be scheduled by load, and create_timeline should await
+    for it even though it gets 409 Conflict.
+    """
     neon_env_builder.enable_remote_storage(
         remote_storage_kind=remote_storage_kind,
         test_name="test_empty_branch_remote_storage_upload_on_restart",
@@ -673,35 +678,87 @@ def test_empty_branch_remote_storage_upload_on_restart(
     env = neon_env_builder.init_start()
     client = env.pageserver.http_client()
 
-    new_branch_name = "new_branch"
-    new_branch_timeline_id = env.neon_cli.create_branch(new_branch_name, "main", env.initial_tenant)
+    client.configure_failpoints(("before-upload-index", "return"))
 
-    with env.endpoints.create_start(new_branch_name, tenant_id=env.initial_tenant) as endpoint:
-        wait_for_last_flush_lsn(env, endpoint, env.initial_tenant, new_branch_timeline_id)
-    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    new_branch_timeline_id = TimelineId.generate()
 
+    with pytest.raises(ReadTimeout):
+        client.timeline_create(
+            tenant_id=env.initial_tenant,
+            ancestor_timeline_id=env.initial_timeline,
+            new_timeline_id=new_branch_timeline_id,
+            pg_version=env.pg_version,
+            timeout=4,
+        )
+
+    env.pageserver.allowed_errors.append(
+        f".*POST.* path=/v1/tenant/{env.initial_tenant}/timeline.* request was dropped before completing"
+    )
+
+    # index upload is now hitting the failpoint, should not block the shutdown
     env.pageserver.stop()
 
-    # Remove new branch from the remote storage
-    assert isinstance(env.remote_storage, LocalFsStorage)
-    new_branch_on_remote_storage = (
-        env.remote_storage.root
-        / "tenants"
-        / str(env.initial_tenant)
-        / "timelines"
-        / str(new_branch_timeline_id)
+    timeline_path = (
+        Path("tenants") / str(env.initial_tenant) / "timelines" / str(new_branch_timeline_id)
     )
-    assert (
-        new_branch_on_remote_storage.is_dir()
-    ), f"'{new_branch_on_remote_storage}' path does not exist on the remote storage"
-    shutil.rmtree(new_branch_on_remote_storage)
 
-    env.pageserver.start()
+    local_metadata = env.repo_dir / timeline_path / "metadata"
+    assert local_metadata.is_file(), "timeout cancelled timeline branching, not the upload"
 
-    wait_upload_queue_empty(client, env.initial_tenant, new_branch_timeline_id)
+    assert isinstance(env.remote_storage, LocalFsStorage)
+    new_branch_on_remote_storage = env.remote_storage.root / timeline_path
     assert (
-        new_branch_on_remote_storage.is_dir()
-    ), f"New branch should have been reuploaded on pageserver restart to the remote storage path '{new_branch_on_remote_storage}'"
+        not new_branch_on_remote_storage.exists()
+    ), "failpoint should had prohibited index_part.json upload"
+
+    # during reconciliation we should had scheduled the uploads and on the
+    # retried create_timeline, we will await for those to complete on next
+    # client.timeline_create
+    env.pageserver.start(extra_env_vars={"FAILPOINTS": "before-upload-index=return"})
+
+    # sleep a bit to force the upload task go into exponential backoff
+    time.sleep(1)
+
+    q: queue.Queue[Optional[PageserverApiException]] = queue.Queue()
+    barrier = threading.Barrier(2)
+
+    def create_in_background():
+        barrier.wait()
+        try:
+            client.timeline_create(
+                tenant_id=env.initial_tenant,
+                ancestor_timeline_id=env.initial_timeline,
+                new_timeline_id=new_branch_timeline_id,
+                pg_version=env.pg_version,
+            )
+            q.put(None)
+        except PageserverApiException as e:
+            q.put(e)
+
+    create_thread = threading.Thread(target=create_in_background)
+    create_thread.start()
+
+    try:
+        # maximize chances of actually waiting for the uploads by create_timeline
+        barrier.wait()
+
+        assert not new_branch_on_remote_storage.exists(), "failpoint should had stopped uploading"
+
+        client.configure_failpoints(("before-upload-index", "off"))
+        conflict = q.get()
+
+        assert conflict, "create_timeline should not have succeeded"
+        assert (
+            conflict.status_code == 409
+        ), "timeline was created before restart, and uploads scheduled during initial load, so we expect 409 conflict"
+
+        assert_nothing_to_upload(client, env.initial_tenant, new_branch_timeline_id)
+
+        assert (
+            new_branch_on_remote_storage / "index_part.json"
+        ).is_file(), "uploads scheduled during initial load should had been awaited for"
+    finally:
+        create_thread.join()
 
 
 def wait_upload_queue_empty(
@@ -752,4 +809,17 @@ def get_queued_count(
     return int(val)
 
 
+def assert_nothing_to_upload(
+    client: PageserverHttpClient,
+    tenant_id: TenantId,
+    timeline_id: TimelineId,
+):
+    """
+    Check last_record_lsn == remote_consistent_lsn. Assert works only for empty timelines, which
+    do not have anything to compact or gc.
+    """
+    detail = client.timeline_detail(tenant_id, timeline_id)
+    assert Lsn(detail["last_record_lsn"]) == Lsn(detail["remote_consistent_lsn"])
+
+
 # TODO Test that we correctly handle GC of files that are stuck in upload queue.
diff --git a/test_runner/regress/test_tenant_relocation.py b/test_runner/regress/test_tenant_relocation.py
index 81ad256f4b..2a5b30803b 100644
--- a/test_runner/regress/test_tenant_relocation.py
+++ b/test_runner/regress/test_tenant_relocation.py
@@ -23,7 +23,6 @@ from fixtures.pageserver.utils import (
     tenant_exists,
     wait_for_last_record_lsn,
     wait_for_upload,
-    wait_for_upload_queue_empty,
 )
 from fixtures.types import Lsn, TenantId, TimelineId
 from fixtures.utils import (
@@ -546,7 +545,7 @@ def test_emergency_relocate_with_branches_slow_replay(
     # - A logical replication message between the inserts, so that we can conveniently
     #   pause the WAL ingestion between the two inserts.
     # - Child branch, created after the inserts
-    tenant_id, timeline_id = env.neon_cli.create_tenant()
+    tenant_id, _ = env.neon_cli.create_tenant()
 
     main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     with main_endpoint.cursor() as cur:
@@ -559,14 +558,7 @@ def test_emergency_relocate_with_branches_slow_replay(
         current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
 
     main_endpoint.stop()
-    child_timeline_id = env.neon_cli.create_branch(
-        "child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn
-    )
-
-    # Wait for the index_part.json file of both branches to be uploaded to remote storage.
-    # This is a work around for issue https://github.com/neondatabase/neon/issues/3865.
-    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
-    wait_for_upload_queue_empty(pageserver_http, tenant_id, child_timeline_id)
+    env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
 
     # Now kill the pageserver, remove the tenant directory, and restart. This simulates
     # the scenario that a pageserver dies unexpectedly and cannot be recovered, so we relocate
@@ -704,7 +696,7 @@ def test_emergency_relocate_with_branches_createdb(
     pageserver_http = env.pageserver.http_client()
 
     # create new nenant
-    tenant_id, timeline_id = env.neon_cli.create_tenant()
+    tenant_id, _ = env.neon_cli.create_tenant()
 
     main_endpoint = env.endpoints.create_start("main", tenant_id=tenant_id)
     with main_endpoint.cursor() as cur:
@@ -712,9 +704,7 @@ def test_emergency_relocate_with_branches_createdb(
 
         cur.execute("CREATE DATABASE neondb")
         current_lsn = Lsn(query_scalar(cur, "SELECT pg_current_wal_flush_lsn()"))
-    child_timeline_id = env.neon_cli.create_branch(
-        "child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn
-    )
+    env.neon_cli.create_branch("child", tenant_id=tenant_id, ancestor_start_lsn=current_lsn)
 
     with main_endpoint.cursor(dbname="neondb") as cur:
         cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,100)")
@@ -725,11 +715,6 @@ def test_emergency_relocate_with_branches_createdb(
         cur.execute("CREATE TABLE test_migrate_one AS SELECT generate_series(1,200)")
     child_endpoint.stop()
 
-    # Wait for the index_part.json file of both branches to be uploaded to remote storage.
-    # This is a work around for issue https://github.com/neondatabase/neon/issues/3865.
-    wait_for_upload_queue_empty(pageserver_http, tenant_id, timeline_id)
-    wait_for_upload_queue_empty(pageserver_http, tenant_id, child_timeline_id)
-
     # Kill the pageserver, remove the tenant directory, and restart
     env.pageserver.stop(immediate=True)
     shutil.rmtree(Path(env.repo_dir) / "tenants" / str(tenant_id))

From a5615bd8ea202b24d91d506cae230de5e25bdeb9 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Mon, 15 May 2023 13:04:03 +0100
Subject: [PATCH 20/23] Fix Allure reports for different benchmark jobs (#4229)

- Fix Allure report generation failure for Nightly Benchmarks
- Fix GitHub Autocomment for `run-benchmarks` label
(`build_and_test.yml::benchmarks` job)
---
 .github/actions/run-python-test-set/action.yml |  2 +-
 scripts/pr-comment-test-report.js              | 12 +++++++++++-
 2 files changed, 12 insertions(+), 2 deletions(-)

diff --git a/.github/actions/run-python-test-set/action.yml b/.github/actions/run-python-test-set/action.yml
index 1dd294d17a..d6c960bfda 100644
--- a/.github/actions/run-python-test-set/action.yml
+++ b/.github/actions/run-python-test-set/action.yml
@@ -206,4 +206,4 @@ runs:
       uses: ./.github/actions/allure-report-store
       with:
         report-dir: /tmp/test_output/allure/results
-        unique-key: ${{ inputs.test_selection }}-${{ inputs.build_type }}
+        unique-key: ${{ inputs.build_type }}
diff --git a/scripts/pr-comment-test-report.js b/scripts/pr-comment-test-report.js
index 7cb2a5494f..287a1ea8e5 100644
--- a/scripts/pr-comment-test-report.js
+++ b/scripts/pr-comment-test-report.js
@@ -79,7 +79,17 @@ module.exports = async ({ github, context, fetch, report }) => {
     for (const parentSuite of suites.children) {
         for (const suite of parentSuite.children) {
             for (const test of suite.children) {
-                const {groups: {buildType, pgVersion}} = test.name.match(/[\[-](?<buildType>debug|release)-pg(?<pgVersion>\d+)[-\]]/)
+                let buildType, pgVersion
+                const match = test.name.match(/[\[-](?<buildType>debug|release)-pg(?<pgVersion>\d+)[-\]]/)?.groups
+                if (match) {
+                    ({buildType, pgVersion} = match)
+                } else {
+                    // It's ok, we embed BUILD_TYPE and Postgres Version into the test name only for regress suite and do not for other suites (like performance).
+                    console.info(`Cannot get BUILD_TYPE and Postgres Version from test name: "${test.name}", defaulting to "release" and "14"`)
+
+                    buildType = "release"
+                    pgVersion = "14"
+                }
 
                 pgVersions.add(pgVersion)
                 buildTypes.add(buildType)

From 6415dc791ce003f4564ad502d7cedb1b1aabfff2 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 15 May 2023 13:38:18 -0500
Subject: [PATCH 21/23] Fix use-after-free issue in libpagestore (#4239)

## Describe your changes

`pageserver_disconnect()` calls `PQfinish()` which deallocates resources
on the connection structure. `PQerrorMessage()` hands back a pointer to
an allocated resource. Duplicate the error message prior to calling
`pageserver_disconnect()`.

## Issue ticket number and link

Fixes https://github.com/neondatabase/neon/issues/4214

## Checklist before requesting a review

- [x] I have performed a self-review of my code.
- [x] If it is a core feature, I have added thorough tests.
- [x] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [x] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.

## Checklist before merging

- [x] Do not forget to reformat commit message to not include the above
checklist
---
 pgxn/neon/libpagestore.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 21330c018f..677b9e3259 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -367,7 +367,7 @@ pageserver_flush(void)
 	}
 	else if (PQflush(pageserver_conn))
 	{
-		char	   *msg = PQerrorMessage(pageserver_conn);
+		char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
 
 		pageserver_disconnect();
 		neon_log(ERROR, "failed to flush page requests: %s", msg);

From e7514cc15e7835434a485f1bec2f15f908339a75 Mon Sep 17 00:00:00 2001
From: Tristan Partin <tristan@neon.tech>
Date: Mon, 15 May 2023 15:36:53 -0500
Subject: [PATCH 22/23] Wrap naked PQerrorMessage calls in libpagestore with
 pchomp (#4242)

---
 pgxn/neon/libpagestore.c | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/pgxn/neon/libpagestore.c b/pgxn/neon/libpagestore.c
index 677b9e3259..606af9741f 100644
--- a/pgxn/neon/libpagestore.c
+++ b/pgxn/neon/libpagestore.c
@@ -192,8 +192,9 @@ retry:
 		{
 			if (!PQconsumeInput(pageserver_conn))
 			{
-				neon_log(LOG, "could not get response from pageserver: %s",
-						 PQerrorMessage(pageserver_conn));
+				char	   *msg = pchomp(PQerrorMessage(pageserver_conn));
+				neon_log(LOG, "could not get response from pageserver: %s", msg);
+				pfree(msg);
 				return -1;
 			}
 		}
@@ -343,7 +344,7 @@ pageserver_receive(void)
 			resp = NULL;
 		}
 		else if (rc == -2)
-			neon_log(ERROR, "could not read COPY data: %s", PQerrorMessage(pageserver_conn));
+			neon_log(ERROR, "could not read COPY data: %s", pchomp(PQerrorMessage(pageserver_conn)));
 		else
 			neon_log(ERROR, "unexpected PQgetCopyData return value: %d", rc);
 	}

From 4f64be4a98ab601c5b6c349de02f59e18f65db63 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Tue, 9 May 2023 15:46:34 +0200
Subject: [PATCH 23/23] Add endpoint to connection string

---
 proxy/src/auth.rs                 |  7 ++--
 proxy/src/auth/backend/hacks.rs   |  4 +-
 proxy/src/auth/credentials.rs     | 58 +++++++++++++++++++++++++++-
 proxy/src/auth/flow.rs            |  2 +-
 proxy/src/auth/password_hack.rs   | 64 +++++++++++++++++++++++++++----
 proxy/src/compute.rs              |  4 +-
 test_runner/regress/test_proxy.py | 53 +++++++++++++++----------
 7 files changed, 153 insertions(+), 39 deletions(-)

diff --git a/proxy/src/auth.rs b/proxy/src/auth.rs
index dfea84953b..58dceb3bb6 100644
--- a/proxy/src/auth.rs
+++ b/proxy/src/auth.rs
@@ -7,6 +7,7 @@ mod credentials;
 pub use credentials::ClientCredentials;
 
 mod password_hack;
+pub use password_hack::parse_endpoint_param;
 use password_hack::PasswordHackPayload;
 
 mod flow;
@@ -44,10 +45,10 @@ pub enum AuthErrorImpl {
     #[error(
         "Endpoint ID is not specified. \
         Either please upgrade the postgres client library (libpq) for SNI support \
-        or pass the endpoint ID (first part of the domain name) as a parameter: '?options=project%3D<endpoint-id>'. \
+        or pass the endpoint ID (first part of the domain name) as a parameter: '?options=endpoint%3D<endpoint-id>'. \
         See more at https://neon.tech/sni"
     )]
-    MissingProjectName,
+    MissingEndpointName,
 
     #[error("password authentication failed for user '{0}'")]
     AuthFailed(Box<str>),
@@ -88,7 +89,7 @@ impl UserFacingError for AuthError {
             AuthFailed(_) => self.to_string(),
             BadAuthMethod(_) => self.to_string(),
             MalformedPassword(_) => self.to_string(),
-            MissingProjectName => self.to_string(),
+            MissingEndpointName => self.to_string(),
             Io(_) => "Internal error".to_string(),
         }
     }
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index d45806461e..dcc93ec04c 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -52,8 +52,8 @@ pub async fn password_hack(
         .authenticate()
         .await?;
 
-    info!(project = &payload.project, "received missing parameter");
-    creds.project = Some(payload.project);
+    info!(project = &payload.endpoint, "received missing parameter");
+    creds.project = Some(payload.endpoint);
 
     let mut node = api.wake_compute(extra, creds).await?;
     node.config.password(payload.password);
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index b21cd79ddf..6787d82b71 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,6 +1,7 @@
 //! User credentials used in authentication.
 
-use crate::error::UserFacingError;
+use crate::{auth::password_hack::parse_endpoint_param, error::UserFacingError};
+use itertools::Itertools;
 use pq_proto::StartupMessageParams;
 use std::collections::HashSet;
 use thiserror::Error;
@@ -61,7 +62,15 @@ impl<'a> ClientCredentials<'a> {
         // Project name might be passed via PG's command-line options.
         let project_option = params
             .options_raw()
-            .and_then(|mut options| options.find_map(|opt| opt.strip_prefix("project=")))
+            .and_then(|options| {
+                // We support both `project` (deprecated) and `endpoint` options for backward compatibility.
+                // However, if both are present, we don't exactly know which one to use.
+                // Therefore we require that only one of them is present.
+                options
+                    .filter_map(parse_endpoint_param)
+                    .at_most_one()
+                    .ok()?
+            })
             .map(|name| name.to_string());
 
         let project_from_domain = if let Some(sni_str) = sni {
@@ -177,6 +186,51 @@ mod tests {
         Ok(())
     }
 
+    #[test]
+    fn parse_endpoint_from_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "-ckey=1 endpoint=bar -c geqo=off"),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert_eq!(creds.project.as_deref(), Some("bar"));
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_three_endpoints_from_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            (
+                "options",
+                "-ckey=1 endpoint=one endpoint=two endpoint=three -c geqo=off",
+            ),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());
+
+        Ok(())
+    }
+
+    #[test]
+    fn parse_when_endpoint_and_project_are_in_options() -> anyhow::Result<()> {
+        let options = StartupMessageParams::new([
+            ("user", "john_doe"),
+            ("options", "-ckey=1 endpoint=bar project=foo -c geqo=off"),
+        ]);
+
+        let creds = ClientCredentials::parse(&options, None, None)?;
+        assert_eq!(creds.user, "john_doe");
+        assert!(creds.project.is_none());
+
+        Ok(())
+    }
+
     #[test]
     fn parse_projects_identical() -> anyhow::Result<()> {
         let options = StartupMessageParams::new([("user", "john_doe"), ("options", "project=baz")]);
diff --git a/proxy/src/auth/flow.rs b/proxy/src/auth/flow.rs
index 4b982c0c5e..190abc9b2e 100644
--- a/proxy/src/auth/flow.rs
+++ b/proxy/src/auth/flow.rs
@@ -91,7 +91,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> AuthFlow<'_, S, PasswordHack> {
             // the user neither enabled SNI nor resorted to any other method
             // for passing the project name we rely on. We should show them
             // the most helpful error message and point to the documentation.
-            .ok_or(AuthErrorImpl::MissingProjectName)?;
+            .ok_or(AuthErrorImpl::MissingEndpointName)?;
 
         Ok(payload)
     }
diff --git a/proxy/src/auth/password_hack.rs b/proxy/src/auth/password_hack.rs
index 639809e18a..33441e8c88 100644
--- a/proxy/src/auth/password_hack.rs
+++ b/proxy/src/auth/password_hack.rs
@@ -6,27 +6,55 @@
 use bstr::ByteSlice;
 
 pub struct PasswordHackPayload {
-    pub project: String,
+    pub endpoint: String,
     pub password: Vec<u8>,
 }
 
 impl PasswordHackPayload {
     pub fn parse(bytes: &[u8]) -> Option<Self> {
         // The format is `project=<utf-8>;<password-bytes>`.
-        let mut iter = bytes.strip_prefix(b"project=")?.splitn_str(2, ";");
-        let project = iter.next()?.to_str().ok()?.to_owned();
+        let mut iter = bytes.splitn_str(2, ";");
+        let endpoint = iter.next()?.to_str().ok()?;
+        let endpoint = parse_endpoint_param(endpoint)?.to_owned();
         let password = iter.next()?.to_owned();
 
-        Some(Self { project, password })
+        Some(Self { endpoint, password })
     }
 }
 
+pub fn parse_endpoint_param(bytes: &str) -> Option<&str> {
+    bytes
+        .strip_prefix("project=")
+        .or_else(|| bytes.strip_prefix("endpoint="))
+}
+
 #[cfg(test)]
 mod tests {
     use super::*;
 
     #[test]
-    fn parse_password_hack_payload() {
+    fn parse_endpoint_param_fn() {
+        let input = "";
+        assert!(parse_endpoint_param(input).is_none());
+
+        let input = "project=";
+        assert_eq!(parse_endpoint_param(input), Some(""));
+
+        let input = "project=foobar";
+        assert_eq!(parse_endpoint_param(input), Some("foobar"));
+
+        let input = "endpoint=";
+        assert_eq!(parse_endpoint_param(input), Some(""));
+
+        let input = "endpoint=foobar";
+        assert_eq!(parse_endpoint_param(input), Some("foobar"));
+
+        let input = "other_option=foobar";
+        assert!(parse_endpoint_param(input).is_none());
+    }
+
+    #[test]
+    fn parse_password_hack_payload_project() {
         let bytes = b"";
         assert!(PasswordHackPayload::parse(bytes).is_none());
 
@@ -34,13 +62,33 @@ mod tests {
         assert!(PasswordHackPayload::parse(bytes).is_none());
 
         let bytes = b"project=;";
-        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
-        assert_eq!(payload.project, "");
+        let payload: PasswordHackPayload =
+            PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "");
         assert_eq!(payload.password, b"");
 
         let bytes = b"project=foobar;pass;word";
         let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
-        assert_eq!(payload.project, "foobar");
+        assert_eq!(payload.endpoint, "foobar");
+        assert_eq!(payload.password, b"pass;word");
+    }
+
+    #[test]
+    fn parse_password_hack_payload_endpoint() {
+        let bytes = b"";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=";
+        assert!(PasswordHackPayload::parse(bytes).is_none());
+
+        let bytes = b"endpoint=;";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "");
+        assert_eq!(payload.password, b"");
+
+        let bytes = b"endpoint=foobar;pass;word";
+        let payload = PasswordHackPayload::parse(bytes).expect("parsing failed");
+        assert_eq!(payload.endpoint, "foobar");
         assert_eq!(payload.password, b"pass;word");
     }
 }
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index d277940a12..480acb88d9 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,4 +1,4 @@
-use crate::{cancellation::CancelClosure, error::UserFacingError};
+use crate::{auth::parse_endpoint_param, cancellation::CancelClosure, error::UserFacingError};
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
@@ -279,7 +279,7 @@ fn filtered_options(params: &StartupMessageParams) -> Option<String> {
     #[allow(unstable_name_collisions)]
     let options: String = params
         .options_raw()?
-        .filter(|opt| !opt.starts_with("project="))
+        .filter(|opt| parse_endpoint_param(opt).is_none())
         .intersperse(" ") // TODO: use impl from std once it's stabilized
         .collect();
 
diff --git a/test_runner/regress/test_proxy.py b/test_runner/regress/test_proxy.py
index ee6349436b..ae914e384e 100644
--- a/test_runner/regress/test_proxy.py
+++ b/test_runner/regress/test_proxy.py
@@ -5,16 +5,18 @@ import pytest
 from fixtures.neon_fixtures import PSQL, NeonProxy, VanillaPostgres
 
 
-def test_proxy_select_1(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_proxy_select_1(static_proxy: NeonProxy, option_name: str):
     """
     A simplest smoke test: check proxy against a local postgres instance.
     """
 
-    out = static_proxy.safe_psql("select 1", options="project=generic-project-name")
+    out = static_proxy.safe_psql("select 1", options=f"{option_name}=generic-project-name")
     assert out[0][0] == 1
 
 
-def test_password_hack(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_password_hack(static_proxy: NeonProxy, option_name: str):
     """
     Check the PasswordHack auth flow: an alternative to SCRAM auth for
     clients which can't provide the project/endpoint name via SNI or `options`.
@@ -23,11 +25,12 @@ def test_password_hack(static_proxy: NeonProxy):
     user = "borat"
     password = "password"
     static_proxy.safe_psql(
-        f"create role {user} with login password '{password}'", options="project=irrelevant"
+        f"create role {user} with login password '{password}'",
+        options=f"{option_name}=irrelevant",
     )
 
     # Note the format of `magic`!
-    magic = f"project=irrelevant;{password}"
+    magic = f"{option_name}=irrelevant;{password}"
     static_proxy.safe_psql("select 1", sslsni=0, user=user, password=magic)
 
     # Must also check that invalid magic won't be accepted.
@@ -56,55 +59,62 @@ async def test_link_auth(vanilla_pg: VanillaPostgres, link_proxy: NeonProxy):
     assert out == "42"
 
 
-def test_proxy_options(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_proxy_options(static_proxy: NeonProxy, option_name: str):
     """
     Check that we pass extra `options` to the PostgreSQL server:
-    * `project=...` shouldn't be passed at all (otherwise postgres will raise an error).
+    * `project=...` and `endpoint=...` shouldn't be passed at all
+    * (otherwise postgres will raise an error).
     * everything else should be passed as-is.
     """
 
-    options = "project=irrelevant -cproxytest.option=value"
+    options = f"{option_name}=irrelevant -cproxytest.option=value"
     out = static_proxy.safe_psql("show proxytest.option", options=options)
     assert out[0][0] == "value"
 
-    options = "-c proxytest.foo=\\ str project=irrelevant"
+    options = f"-c proxytest.foo=\\ str {option_name}=irrelevant"
     out = static_proxy.safe_psql("show proxytest.foo", options=options)
     assert out[0][0] == " str"
 
 
-def test_auth_errors(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_auth_errors(static_proxy: NeonProxy, option_name: str):
     """
     Check that we throw very specific errors in some unsuccessful auth scenarios.
     """
 
     # User does not exist
     with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", options=f"{option_name}=irrelevant")
     text = str(exprinfo.value).strip()
     assert text.endswith("password authentication failed for user 'pinocchio'")
 
     static_proxy.safe_psql(
-        "create role pinocchio with login password 'magic'", options="project=irrelevant"
+        "create role pinocchio with login password 'magic'",
+        options=f"{option_name}=irrelevant",
     )
 
     # User exists, but password is missing
     with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password=None, options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", password=None, options=f"{option_name}=irrelevant")
     text = str(exprinfo.value).strip()
     assert text.endswith("password authentication failed for user 'pinocchio'")
 
     # User exists, but password is wrong
     with pytest.raises(psycopg2.Error) as exprinfo:
-        static_proxy.connect(user="pinocchio", password="bad", options="project=irrelevant")
+        static_proxy.connect(user="pinocchio", password="bad", options=f"{option_name}=irrelevant")
     text = str(exprinfo.value).strip()
     assert text.endswith("password authentication failed for user 'pinocchio'")
 
     # Finally, check that the user can connect
-    with static_proxy.connect(user="pinocchio", password="magic", options="project=irrelevant"):
+    with static_proxy.connect(
+        user="pinocchio", password="magic", options=f"{option_name}=irrelevant"
+    ):
         pass
 
 
-def test_forward_params_to_client(static_proxy: NeonProxy):
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
+def test_forward_params_to_client(static_proxy: NeonProxy, option_name: str):
     """
     Check that we forward all necessary PostgreSQL server params to client.
     """
@@ -130,7 +140,7 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
         where name = any(%s)
     """
 
-    with static_proxy.connect(options="project=irrelevant") as conn:
+    with static_proxy.connect(options=f"{option_name}=irrelevant") as conn:
         with conn.cursor() as cur:
             cur.execute(query, (reported_params_subset,))
             for name, value in cur.fetchall():
@@ -138,17 +148,18 @@ def test_forward_params_to_client(static_proxy: NeonProxy):
                 assert conn.get_parameter_status(name) == value
 
 
+@pytest.mark.parametrize("option_name", ["project", "endpoint"])
 @pytest.mark.timeout(5)
-def test_close_on_connections_exit(static_proxy: NeonProxy):
+def test_close_on_connections_exit(static_proxy: NeonProxy, option_name: str):
     # Open two connections, send SIGTERM, then ensure that proxy doesn't exit
     # until after connections close.
-    with static_proxy.connect(options="project=irrelevant"), static_proxy.connect(
-        options="project=irrelevant"
+    with static_proxy.connect(options=f"{option_name}=irrelevant"), static_proxy.connect(
+        options=f"{option_name}=irrelevant"
     ):
         static_proxy.terminate()
         with pytest.raises(subprocess.TimeoutExpired):
             static_proxy.wait_for_exit(timeout=2)
         # Ensure we don't accept any more connections
         with pytest.raises(psycopg2.OperationalError):
-            static_proxy.connect(options="project=irrelevant")
+            static_proxy.connect(options=f"{option_name}=irrelevant")
     static_proxy.wait_for_exit()