From a5ce2b5330233927169152253548f822cf6d1643 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Tue, 14 Feb 2023 16:31:06 +0200
Subject: [PATCH 01/42] Add debug messages around
 timeline.get_current_logical_size

---
 pageserver/src/consumption_metrics.rs | 23 +++++++++++++----------
 pageserver/src/tenant/timeline.rs     |  7 ++++++-
 2 files changed, 19 insertions(+), 11 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index a730d39339..e4b7b6d809 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -166,17 +166,20 @@ pub async fn collect_metrics_iteration(
 
                 match timeline.get_current_logical_size(ctx) {
                     // Only send timeline logical size when it is fully calculated.
-                    Ok((size, is_exact)) if is_exact => {
-                        current_metrics.push((
-                            PageserverConsumptionMetricsKey {
-                                tenant_id,
-                                timeline_id: Some(timeline.timeline_id),
-                                metric: TIMELINE_LOGICAL_SIZE,
-                            },
-                            size,
-                        ));
+                    Ok((size, is_exact)) => {
+                        if is_exact {
+                            current_metrics.push((
+                                PageserverConsumptionMetricsKey {
+                                    tenant_id,
+                                    timeline_id: Some(timeline.timeline_id),
+                                    metric: TIMELINE_LOGICAL_SIZE,
+                                },
+                                size,
+                            ));
+                        } else {
+                            info!("logical_size is not fully calculated for timeline {}, skipping sending value {} ", timeline.timeline_id, size);
+                        }
                     }
-                    Ok((_, _)) => {}
                     Err(err) => {
                         error!(
                             "failed to get current logical size for timeline {}: {err:?}",
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 5a829e42e5..7a5a9de2f4 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -740,10 +740,15 @@ impl Timeline {
 
         let mut is_exact = true;
         let size = current_size.size();
-        if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
+        if let (CurrentLogicalSize::Approximate(approx_size), Some(init_lsn)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
             is_exact = false;
+            info!(
+                "Current size for timeline {} is approximate {}, initial_part_end lsn: {:?}",
+                self.timeline_id, approx_size, init_lsn
+            );
+
             self.try_spawn_size_init_task(init_lsn, ctx);
         }
 

From a839860c2ea9a0bbce39a43e9849daef698025ee Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Tue, 14 Feb 2023 16:49:42 +0200
Subject: [PATCH 02/42] Add debug messages around sending cached metrics

---
 pageserver/src/consumption_metrics.rs | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index e4b7b6d809..b078782a86 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -242,6 +242,18 @@ pub async fn collect_metrics_iteration(
             Some(val) => val != curr_val,
             None => true,
         });
+
+        info!(
+            "sending only changed metrics, {} values at {}",
+            current_metrics.len(),
+            Utc::now()
+        );
+    } else {
+        info!(
+            "sending all metrics, including cached ones. {} values at {}",
+            current_metrics.len(),
+            Utc::now()
+        );
     }
 
     if current_metrics.is_empty() {

From a974602f9f4311d6b1e778d31e5c2ea17aacebf8 Mon Sep 17 00:00:00 2001
From: Anna Stepanyan <anna.stepanyan@neon.tech>
Date: Wed, 15 Feb 2023 15:11:06 +0100
Subject: [PATCH 03/42] fix the logical size term definition (#3609)

a size of a *database* cannot be a sum of the sizes of *all databases*
indicating that a logical size is calculated for a branch

## Describe your changes

## Issue ticket number and link

## Checklist before requesting a review
- [x] i checked the suggested changes
- [x] this is not a core feature
- [x] this is just a docs update, does not require analytics
- [x] this PR does not require a public announcement
---
 docs/synthetic-size.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/synthetic-size.md b/docs/synthetic-size.md
index 8378efc842..407d7b525a 100644
--- a/docs/synthetic-size.md
+++ b/docs/synthetic-size.md
@@ -37,7 +37,7 @@ The synthetic size is designed to:
 
 ## Terms & assumptions
 
-- logical size is the size of a database *at a given point in
+- logical size is the size of a branch *at a given point in
   time*. It's the total size of all tables in all databases, as you
   see with "\l+" in psql for example, plus the Postgres SLRUs and some
   small amount of metadata. NOTE that currently, Neon does not include

From 1d9d7c02dbc2b404b555d56353e339de44160229 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ivadmi5@gmail.com>
Date: Wed, 15 Feb 2023 16:25:07 +0300
Subject: [PATCH 04/42] [proxy] Don't forward empty `options` to compute nodes

Clients may specify endpoint/project name via `options=project=...`,
so we should not only remove `project=` from `options` but also
drop `options` entirely, because connection pools don't support it.

Discussion: https://neondb.slack.com/archives/C033A2WE6BZ/p1676464382670119
---
 libs/pq_proto/src/lib.rs | 44 ++++++++++++++++++++------------
 proxy/src/compute.rs     | 54 ++++++++++++++++++++++++++++++++++------
 2 files changed, 74 insertions(+), 24 deletions(-)

diff --git a/libs/pq_proto/src/lib.rs b/libs/pq_proto/src/lib.rs
index c5e4dbd1f0..b7995c840c 100644
--- a/libs/pq_proto/src/lib.rs
+++ b/libs/pq_proto/src/lib.rs
@@ -75,27 +75,36 @@ impl StartupMessageParams {
     /// taking into account all escape sequences but leaving them as-is.
     /// [`None`] means that there's no `options` in [`Self`].
     pub fn options_raw(&self) -> Option<impl Iterator<Item = &str>> {
-        // See `postgres: pg_split_opts`.
-        let mut last_was_escape = false;
-        let iter = self
-            .get("options")?
-            .split(move |c: char| {
-                // We split by non-escaped whitespace symbols.
-                let should_split = c.is_ascii_whitespace() && !last_was_escape;
-                last_was_escape = c == '\\' && !last_was_escape;
-                should_split
-            })
-            .filter(|s| !s.is_empty());
-
-        Some(iter)
+        self.get("options").map(Self::parse_options_raw)
     }
 
     /// Split command-line options according to PostgreSQL's logic,
     /// applying all escape sequences (using owned strings as needed).
     /// [`None`] means that there's no `options` in [`Self`].
     pub fn options_escaped(&self) -> Option<impl Iterator<Item = Cow<'_, str>>> {
+        self.get("options").map(Self::parse_options_escaped)
+    }
+
+    /// Split command-line options according to PostgreSQL's logic,
+    /// taking into account all escape sequences but leaving them as-is.
+    pub fn parse_options_raw(input: &str) -> impl Iterator<Item = &str> {
         // See `postgres: pg_split_opts`.
-        let iter = self.options_raw()?.map(|s| {
+        let mut last_was_escape = false;
+        input
+            .split(move |c: char| {
+                // We split by non-escaped whitespace symbols.
+                let should_split = c.is_ascii_whitespace() && !last_was_escape;
+                last_was_escape = c == '\\' && !last_was_escape;
+                should_split
+            })
+            .filter(|s| !s.is_empty())
+    }
+
+    /// Split command-line options according to PostgreSQL's logic,
+    /// applying all escape sequences (using owned strings as needed).
+    pub fn parse_options_escaped(input: &str) -> impl Iterator<Item = Cow<'_, str>> {
+        // See `postgres: pg_split_opts`.
+        Self::parse_options_raw(input).map(|s| {
             let mut preserve_next_escape = false;
             let escape = |c| {
                 // We should remove '\\' unless it's preceded by '\\'.
@@ -108,9 +117,12 @@ impl StartupMessageParams {
                 true => Cow::Owned(s.replace(escape, "")),
                 false => Cow::Borrowed(s),
             }
-        });
+        })
+    }
 
-        Some(iter)
+    /// Iterate through key-value pairs in an arbitrary order.
+    pub fn iter(&self) -> impl Iterator<Item = (&str, &str)> {
+        self.params.iter().map(|(k, v)| (k.as_str(), v.as_str()))
     }
 
     // This function is mostly useful in tests.
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index 3f5eb3caff..2e12d9ee26 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -77,14 +77,9 @@ impl ConnCfg {
             self.dbname(dbname);
         }
 
-        if let Some(options) = params.options_raw() {
-            // We must drop all proxy-specific parameters.
-            #[allow(unstable_name_collisions)]
-            let options: String = options
-                .filter(|opt| !opt.starts_with("project="))
-                .intersperse(" ") // TODO: use impl from std once it's stabilized
-                .collect();
-
+        // Don't add `options` if they were only used for specifying a project.
+        // Connection pools don't support `options`, because they affect backend startup.
+        if let Some(options) = filtered_options(params) {
             self.options(&options);
         }
 
@@ -225,3 +220,46 @@ impl ConnCfg {
         Ok(connection)
     }
 }
+
+/// Retrieve `options` from a startup message, dropping all proxy-secific flags.
+fn filtered_options(params: &StartupMessageParams) -> Option<String> {
+    #[allow(unstable_name_collisions)]
+    let options: String = params
+        .options_raw()?
+        .filter(|opt| !opt.starts_with("project="))
+        .intersperse(" ") // TODO: use impl from std once it's stabilized
+        .collect();
+
+    // Don't even bother with empty options.
+    if options.is_empty() {
+        return None;
+    }
+
+    Some(options)
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_filtered_options() {
+        // Empty options is unlikely to be useful anyway.
+        let params = StartupMessageParams::new([("options", "")]);
+        assert_eq!(filtered_options(&params), None);
+
+        // It's likely that clients will only use options to specify endpoint/project.
+        let params = StartupMessageParams::new([("options", "project=foo")]);
+        assert_eq!(filtered_options(&params), None);
+
+        // Same, because unescaped whitespaces are no-op.
+        let params = StartupMessageParams::new([("options", " project=foo ")]);
+        assert_eq!(filtered_options(&params).as_deref(), None);
+
+        let params = StartupMessageParams::new([("options", r"\  project=foo \ ")]);
+        assert_eq!(filtered_options(&params).as_deref(), Some(r"\  \ "));
+
+        let params = StartupMessageParams::new([("options", "project = foo")]);
+        assert_eq!(filtered_options(&params).as_deref(), Some("project = foo"));
+    }
+}

From 7b182e2605b29a364ab625a2fad7323550c4f5f2 Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Thu, 16 Feb 2023 10:33:04 +0200
Subject: [PATCH 05/42] Update settings.md with latest PITR and gc period
 values (#3618)

## Describe your changes
Updates PITR and GC_PERIOD default value doc
## Issue ticket number and link

## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 docs/settings.md | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/docs/settings.md b/docs/settings.md
index 58d32157a3..817f97d8ba 100644
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -16,7 +16,7 @@ listen_http_addr = '127.0.0.1:9898'
 checkpoint_distance = '268435456' # in bytes
 checkpoint_timeout = '10m'
 
-gc_period = '100 s'
+gc_period = '1 hour'
 gc_horizon = '67108864'
 
 max_file_descriptors = '100'
@@ -101,7 +101,7 @@ away.
 
 #### gc_period
 
-Interval at which garbage collection is triggered. Default is 100 s.
+Interval at which garbage collection is triggered. Default is 1 hour.
 
 #### image_creation_threshold
 
@@ -109,7 +109,7 @@ L0 delta layer threshold for L1 image layer creation. Default is 3.
 
 #### pitr_interval
 
-WAL retention duration for PITR branching. Default is 30 days.
+WAL retention duration for PITR branching. Default is 7 days.
 
 #### walreceiver_connect_timeout
 

From ddbdcdddd7886c72da740b990471dea9ed001413 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 16 Feb 2023 10:53:46 +0200
Subject: [PATCH 06/42] Tenant size calculation: refactor, rewrite, and add SVG
 (#2817)

Refactor the tenant_size_model code. Segment now contains just the
minimum amount of information needed to calculate the size. Other
information that is useful for building up the segment tree, and for
display purposes, is now kept elsewhere. The code in 'main.rs' has a new
ScenarioBuilder struct for that.

Calculating which Segments are "needed" is now the responsibility of the
caller of tenant_size_mode, not part of the calculation itself. So it's
up to the caller to make all the decisions with retention periods for
each branch.

The output of the sizing calculation is now a Vec of SizeResults, rather
than a tree. It uses a tree representation internally, when doing the
calculation, but it's not exposed to the caller anymore.

Refactor the way the recursive calculation is performed.

Rewrite the code in size.rs that builds the Segment model. Get rid of
the intermediate representation with Update structs. Build the Segments
directly, with some local HashMaps and Vecs to track branch points to
help with that.

retention_period is now an input to gather_inputs(), rather than an
output.

Update pageserver http API: rename /size endpoint to /synthetic_size
with following parameters:
    - /synthetic_size?inputs_only to get debug info;
- /synthetic_size?retention_period=0 to override cutoff that is used to
calculate the size;
pass header -H "Accept: text/html" to get HTML output, otherwise JSON is
returned

Update python tests and openapi spec.

---------

Co-authored-by: Anastasia Lubennikova <anastasia@neon.tech>
Co-authored-by: Joonas Koivunen <joonas@neon.tech>
---
 Cargo.lock                                |    2 +
 libs/tenant_size_model/Cargo.toml         |    2 +
 libs/tenant_size_model/src/calculation.rs |  219 ++++
 libs/tenant_size_model/src/lib.rs         |  427 +-------
 libs/tenant_size_model/src/main.rs        |  269 -----
 libs/tenant_size_model/src/svg.rs         |  193 ++++
 libs/tenant_size_model/tests/tests.rs     |  313 ++++++
 pageserver/src/http/openapi_spec.yml      |    7 +
 pageserver/src/http/routes.rs             |  109 +-
 pageserver/src/tenant.rs                  |   40 +-
 pageserver/src/tenant/size.rs             | 1159 +++++++++------------
 pageserver/src/walredo.rs                 |    3 +-
 test_runner/fixtures/neon_fixtures.py     |   12 +-
 test_runner/regress/test_tenant_size.py   |  180 +++-
 14 files changed, 1581 insertions(+), 1354 deletions(-)
 create mode 100644 libs/tenant_size_model/src/calculation.rs
 delete mode 100644 libs/tenant_size_model/src/main.rs
 create mode 100644 libs/tenant_size_model/src/svg.rs
 create mode 100644 libs/tenant_size_model/tests/tests.rs

diff --git a/Cargo.lock b/Cargo.lock
index 67e54d3833..98c4dca09b 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -3809,6 +3809,8 @@ name = "tenant_size_model"
 version = "0.1.0"
 dependencies = [
  "anyhow",
+ "serde",
+ "serde_json",
  "workspace_hack",
 ]
 
diff --git a/libs/tenant_size_model/Cargo.toml b/libs/tenant_size_model/Cargo.toml
index a5f0160f35..15e78932a8 100644
--- a/libs/tenant_size_model/Cargo.toml
+++ b/libs/tenant_size_model/Cargo.toml
@@ -7,5 +7,7 @@ license.workspace = true
 
 [dependencies]
 anyhow.workspace = true
+serde.workspace = true
+serde_json.workspace = true
 
 workspace_hack.workspace = true
diff --git a/libs/tenant_size_model/src/calculation.rs b/libs/tenant_size_model/src/calculation.rs
new file mode 100644
index 0000000000..093b053675
--- /dev/null
+++ b/libs/tenant_size_model/src/calculation.rs
@@ -0,0 +1,219 @@
+use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
+
+//
+//                 *-g--*---D--->
+//                /
+//               /
+//              /                 *---b----*-B--->
+//             /                 /
+//            /                 /
+//      -----*--e---*-----f----* C
+//           E                  \
+//                               \
+//                                *--a---*---A-->
+//
+// If A and B need to be retained, is it cheaper to store
+// snapshot at C+a+b, or snapshots at A and B ?
+//
+// If D also needs to be retained, which is cheaper:
+//
+// 1. E+g+e+f+a+b
+// 2. D+C+a+b
+// 3. D+A+B
+
+/// [`Segment`] which has had it's size calculated.
+#[derive(Clone, Debug)]
+struct SegmentSize {
+    method: SegmentMethod,
+
+    // calculated size of this subtree, using this method
+    accum_size: u64,
+
+    seg_id: usize,
+    children: Vec<SegmentSize>,
+}
+
+struct SizeAlternatives {
+    // cheapest alternative if parent is available.
+    incremental: SegmentSize,
+
+    // cheapest alternative if parent node is not available
+    non_incremental: Option<SegmentSize>,
+}
+
+impl StorageModel {
+    pub fn calculate(&self) -> SizeResult {
+        // Build adjacency list. 'child_list' is indexed by segment id. Each entry
+        // contains a list of all child segments of the segment.
+        let mut roots: Vec<usize> = Vec::new();
+        let mut child_list: Vec<Vec<usize>> = Vec::new();
+        child_list.resize(self.segments.len(), Vec::new());
+
+        for (seg_id, seg) in self.segments.iter().enumerate() {
+            if let Some(parent_id) = seg.parent {
+                child_list[parent_id].push(seg_id);
+            } else {
+                roots.push(seg_id);
+            }
+        }
+
+        let mut segment_results = Vec::new();
+        segment_results.resize(
+            self.segments.len(),
+            SegmentSizeResult {
+                method: SegmentMethod::Skipped,
+                accum_size: 0,
+            },
+        );
+
+        let mut total_size = 0;
+        for root in roots {
+            if let Some(selected) = self.size_here(root, &child_list).non_incremental {
+                StorageModel::fill_selected_sizes(&selected, &mut segment_results);
+                total_size += selected.accum_size;
+            } else {
+                // Couldn't find any way to get this root. Error?
+            }
+        }
+
+        SizeResult {
+            total_size,
+            segments: segment_results,
+        }
+    }
+
+    fn fill_selected_sizes(selected: &SegmentSize, result: &mut Vec<SegmentSizeResult>) {
+        result[selected.seg_id] = SegmentSizeResult {
+            method: selected.method,
+            accum_size: selected.accum_size,
+        };
+        // recurse to children
+        for child in selected.children.iter() {
+            StorageModel::fill_selected_sizes(child, result);
+        }
+    }
+
+    //
+    // This is the core of the sizing calculation.
+    //
+    // This is a recursive function, that for each Segment calculates the best way
+    // to reach all the Segments that are marked as needed in this subtree, under two
+    // different conditions:
+    // a) when the parent of this segment is available (as a snaphot or through WAL), and
+    // b) when the parent of this segment is not available.
+    //
+    fn size_here(&self, seg_id: usize, child_list: &Vec<Vec<usize>>) -> SizeAlternatives {
+        let seg = &self.segments[seg_id];
+        // First figure out the best way to get each child
+        let mut children = Vec::new();
+        for child_id in &child_list[seg_id] {
+            children.push(self.size_here(*child_id, child_list))
+        }
+
+        // Method 1. If this node is not needed, we can skip it as long as we
+        // take snapshots later in each sub-tree
+        let snapshot_later = if !seg.needed {
+            let mut snapshot_later = SegmentSize {
+                seg_id,
+                method: SegmentMethod::Skipped,
+                accum_size: 0,
+                children: Vec::new(),
+            };
+
+            let mut possible = true;
+            for child in children.iter() {
+                if let Some(non_incremental) = &child.non_incremental {
+                    snapshot_later.accum_size += non_incremental.accum_size;
+                    snapshot_later.children.push(non_incremental.clone())
+                } else {
+                    possible = false;
+                    break;
+                }
+            }
+            if possible {
+                Some(snapshot_later)
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        // Method 2. Get a snapshot here. This assumed to be possible, if the 'size' of
+        // this Segment was given.
+        let snapshot_here = if !seg.needed || seg.parent.is_none() {
+            if let Some(snapshot_size) = seg.size {
+                let mut snapshot_here = SegmentSize {
+                    seg_id,
+                    method: SegmentMethod::SnapshotHere,
+                    accum_size: snapshot_size,
+                    children: Vec::new(),
+                };
+                for child in children.iter() {
+                    snapshot_here.accum_size += child.incremental.accum_size;
+                    snapshot_here.children.push(child.incremental.clone())
+                }
+                Some(snapshot_here)
+            } else {
+                None
+            }
+        } else {
+            None
+        };
+
+        // Method 3. Use WAL to get here from parent
+        let wal_here = {
+            let mut wal_here = SegmentSize {
+                seg_id,
+                method: SegmentMethod::Wal,
+                accum_size: if let Some(parent_id) = seg.parent {
+                    seg.lsn - self.segments[parent_id].lsn
+                } else {
+                    0
+                },
+                children: Vec::new(),
+            };
+            for child in children {
+                wal_here.accum_size += child.incremental.accum_size;
+                wal_here.children.push(child.incremental)
+            }
+            wal_here
+        };
+
+        // If the parent is not available, what's the cheapest method involving
+        // a snapshot here or later?
+        let mut cheapest_non_incremental: Option<SegmentSize> = None;
+        if let Some(snapshot_here) = snapshot_here {
+            cheapest_non_incremental = Some(snapshot_here);
+        }
+        if let Some(snapshot_later) = snapshot_later {
+            // Use <=, to prefer skipping if the size is equal
+            if let Some(parent) = &cheapest_non_incremental {
+                if snapshot_later.accum_size <= parent.accum_size {
+                    cheapest_non_incremental = Some(snapshot_later);
+                }
+            } else {
+                cheapest_non_incremental = Some(snapshot_later);
+            }
+        }
+
+        // And what's the cheapest method, if the parent is available?
+        let cheapest_incremental = if let Some(cheapest_non_incremental) = &cheapest_non_incremental
+        {
+            // Is it cheaper to use a snapshot here or later, anyway?
+            // Use <, to prefer Wal over snapshot if the cost is the same
+            if wal_here.accum_size < cheapest_non_incremental.accum_size {
+                wal_here
+            } else {
+                cheapest_non_incremental.clone()
+            }
+        } else {
+            wal_here
+        };
+
+        SizeAlternatives {
+            incremental: cheapest_incremental,
+            non_incremental: cheapest_non_incremental,
+        }
+    }
+}
diff --git a/libs/tenant_size_model/src/lib.rs b/libs/tenant_size_model/src/lib.rs
index b156e1be9d..c151e3b42c 100644
--- a/libs/tenant_size_model/src/lib.rs
+++ b/libs/tenant_size_model/src/lib.rs
@@ -1,401 +1,70 @@
-use std::borrow::Cow;
-use std::collections::HashMap;
+//! Synthetic size calculation
 
-use anyhow::Context;
+mod calculation;
+pub mod svg;
 
-/// Pricing model or history size builder.
+/// StorageModel is the input to the synthetic size calculation. It represents
+/// a tree of timelines, with just the information that's needed for the
+/// calculation. This doesn't track timeline names or where each timeline
+/// begins and ends, for example. Instead, it consists of "points of interest"
+/// on the timelines. A point of interest could be the timeline start or end point,
+/// the oldest point on a timeline that needs to be retained because of PITR
+/// cutoff, or snapshot points named by the user. For each such point, and the
+/// edge connecting the points (implicit in Segment), we store information about
+/// whether we need to be able to recover to the point, and if known, the logical
+/// size at the point.
 ///
-/// Maintains knowledge of the branches and their modifications. Generic over the branch name key
-/// type.
-pub struct Storage<K: 'static> {
-    segments: Vec<Segment>,
-
-    /// Mapping from the branch name to the index of a segment describing it's latest state.
-    branches: HashMap<K, usize>,
+/// The segments must form a well-formed tree, with no loops.
+#[derive(serde::Serialize)]
+pub struct StorageModel {
+    pub segments: Vec<Segment>,
 }
 
-/// Snapshot of a branch.
-#[derive(Clone, Debug, Eq, PartialEq)]
+/// Segment represents one point in the tree of branches, *and* the edge that leads
+/// to it (if any). We don't need separate structs for points and edges, because each
+/// point can have only one parent.
+///
+/// When 'needed' is true, it means that we need to be able to reconstruct
+/// any version between 'parent.lsn' and 'lsn'. If you want to represent that only
+/// a single point is needed, create two Segments with the same lsn, and mark only
+/// the child as needed.
+///
+#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
 pub struct Segment {
     /// Previous segment index into ['Storage::segments`], if any.
-    parent: Option<usize>,
+    pub parent: Option<usize>,
 
-    /// Description of how did we get to this state.
-    ///
-    /// Mainly used in the original scenarios 1..=4 with insert, delete and update. Not used when
-    /// modifying a branch directly.
-    pub op: Cow<'static, str>,
+    /// LSN at this point
+    pub lsn: u64,
 
-    /// LSN before this state
-    start_lsn: u64,
+    /// Logical size at this node, if known.
+    pub size: Option<u64>,
 
-    /// LSN at this state
-    pub end_lsn: u64,
-
-    /// Logical size before this state
-    start_size: u64,
-
-    /// Logical size at this state. Can be None in the last Segment of a branch.
-    pub end_size: Option<u64>,
-
-    /// Indices to [`Storage::segments`]
-    ///
-    /// FIXME: this could be an Option<usize>
-    children_after: Vec<usize>,
-
-    /// Determined by `retention_period` given to [`Storage::calculate`]
+    /// If true, the segment from parent to this node is needed by `retention_period`
     pub needed: bool,
 }
 
-//
-//
-//
-//
-//                 *-g--*---D--->
-//                /
-//               /
-//              /                 *---b----*-B--->
-//             /                 /
-//            /                 /
-//      -----*--e---*-----f----* C
-//           E                  \
-//                               \
-//                                *--a---*---A-->
-//
-// If A and B need to be retained, is it cheaper to store
-// snapshot at C+a+b, or snapshots at A and B ?
-//
-// If D also needs to be retained, which is cheaper:
-//
-// 1. E+g+e+f+a+b
-// 2. D+C+a+b
-// 3. D+A+B
+/// Result of synthetic size calculation. Returned by StorageModel::calculate()
+pub struct SizeResult {
+    pub total_size: u64,
 
-/// [`Segment`] which has had it's size calculated.
-pub struct SegmentSize {
-    pub seg_id: usize,
-
-    pub method: SegmentMethod,
-
-    this_size: u64,
-
-    pub children: Vec<SegmentSize>,
+    // This has same length as the StorageModel::segments vector in the input.
+    // Each entry in this array corresponds to the entry with same index in
+    // StorageModel::segments.
+    pub segments: Vec<SegmentSizeResult>,
 }
 
-impl SegmentSize {
-    fn total(&self) -> u64 {
-        self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
-    }
-
-    pub fn total_children(&self) -> u64 {
-        if self.method == SnapshotAfter {
-            self.this_size + self.children.iter().fold(0, |acc, x| acc + x.total())
-        } else {
-            self.children.iter().fold(0, |acc, x| acc + x.total())
-        }
-    }
+#[derive(Clone, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
+pub struct SegmentSizeResult {
+    pub method: SegmentMethod,
+    // calculated size of this subtree, using this method
+    pub accum_size: u64,
 }
 
 /// Different methods to retain history from a particular state
-#[derive(Clone, Copy, Debug, Eq, PartialEq)]
+#[derive(Clone, Copy, Debug, Eq, PartialEq, serde::Serialize, serde::Deserialize)]
 pub enum SegmentMethod {
-    SnapshotAfter,
-    Wal,
-    WalNeeded,
+    SnapshotHere, // A logical snapshot is needed after this segment
+    Wal,          // Keep WAL leading up to this node
     Skipped,
 }
-
-use SegmentMethod::*;
-
-impl<K: std::hash::Hash + Eq + 'static> Storage<K> {
-    /// Creates a new storage with the given default branch name.
-    pub fn new(initial_branch: K) -> Storage<K> {
-        let init_segment = Segment {
-            op: "".into(),
-            needed: false,
-            parent: None,
-            start_lsn: 0,
-            end_lsn: 0,
-            start_size: 0,
-            end_size: Some(0),
-            children_after: Vec::new(),
-        };
-
-        Storage {
-            segments: vec![init_segment],
-            branches: HashMap::from([(initial_branch, 0)]),
-        }
-    }
-
-    /// Advances the branch with a new point, at given LSN.
-    pub fn insert_point<Q: ?Sized>(
-        &mut self,
-        branch: &Q,
-        op: Cow<'static, str>,
-        lsn: u64,
-        size: Option<u64>,
-    ) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
-        let newseg_id = self.segments.len();
-        let lastseg = &mut self.segments[lastseg_id];
-
-        assert!(lsn > lastseg.end_lsn);
-
-        let Some(start_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
-
-        let newseg = Segment {
-            op,
-            parent: Some(lastseg_id),
-            start_lsn: lastseg.end_lsn,
-            end_lsn: lsn,
-            start_size,
-            end_size: size,
-            children_after: Vec::new(),
-            needed: false,
-        };
-        lastseg.children_after.push(newseg_id);
-
-        self.segments.push(newseg);
-        *self.branches.get_mut(branch).expect("read already") = newseg_id;
-
-        Ok(())
-    }
-
-    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
-    pub fn modify_branch<Q: ?Sized>(
-        &mut self,
-        branch: &Q,
-        op: Cow<'static, str>,
-        lsn_bytes: u64,
-        size_bytes: i64,
-    ) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        let Some(lastseg_id) = self.branches.get(branch).copied() else { anyhow::bail!("branch not found: {branch:?}") };
-        let newseg_id = self.segments.len();
-        let lastseg = &mut self.segments[lastseg_id];
-
-        let Some(last_end_size) = lastseg.end_size else { anyhow::bail!("no end_size on latest segment for {branch:?}") };
-
-        let newseg = Segment {
-            op,
-            parent: Some(lastseg_id),
-            start_lsn: lastseg.end_lsn,
-            end_lsn: lastseg.end_lsn + lsn_bytes,
-            start_size: last_end_size,
-            end_size: Some((last_end_size as i64 + size_bytes) as u64),
-            children_after: Vec::new(),
-            needed: false,
-        };
-        lastseg.children_after.push(newseg_id);
-
-        self.segments.push(newseg);
-        *self.branches.get_mut(branch).expect("read already") = newseg_id;
-        Ok(())
-    }
-
-    pub fn insert<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        self.modify_branch(branch, "insert".into(), bytes, bytes as i64)
-    }
-
-    pub fn update<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        self.modify_branch(branch, "update".into(), bytes, 0i64)
-    }
-
-    pub fn delete<Q: ?Sized>(&mut self, branch: &Q, bytes: u64) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q>,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        self.modify_branch(branch, "delete".into(), bytes, -(bytes as i64))
-    }
-
-    pub fn branch<Q: ?Sized>(&mut self, parent: &Q, name: K) -> anyhow::Result<()>
-    where
-        K: std::borrow::Borrow<Q> + std::fmt::Debug,
-        Q: std::hash::Hash + Eq + std::fmt::Debug,
-    {
-        // Find the right segment
-        let branchseg_id = *self.branches.get(parent).with_context(|| {
-            format!(
-                "should had found the parent {:?} by key. in branches {:?}",
-                parent, self.branches
-            )
-        })?;
-
-        let _branchseg = &mut self.segments[branchseg_id];
-
-        // Create branch name for it
-        self.branches.insert(name, branchseg_id);
-        Ok(())
-    }
-
-    pub fn calculate(&mut self, retention_period: u64) -> anyhow::Result<SegmentSize> {
-        // Phase 1: Mark all the segments that need to be retained
-        for (_branch, &last_seg_id) in self.branches.iter() {
-            let last_seg = &self.segments[last_seg_id];
-            let cutoff_lsn = last_seg.start_lsn.saturating_sub(retention_period);
-            let mut seg_id = last_seg_id;
-            loop {
-                let seg = &mut self.segments[seg_id];
-                if seg.end_lsn < cutoff_lsn {
-                    break;
-                }
-                seg.needed = true;
-                if let Some(prev_seg_id) = seg.parent {
-                    seg_id = prev_seg_id;
-                } else {
-                    break;
-                }
-            }
-        }
-
-        // Phase 2: For each oldest segment in a chain that needs to be retained,
-        // calculate if we should store snapshot or WAL
-        self.size_from_snapshot_later(0)
-    }
-
-    fn size_from_wal(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
-        let seg = &self.segments[seg_id];
-
-        let this_size = seg.end_lsn - seg.start_lsn;
-
-        let mut children = Vec::new();
-
-        // try both ways
-        for &child_id in seg.children_after.iter() {
-            // try each child both ways
-            let child = &self.segments[child_id];
-            let p1 = self.size_from_wal(child_id)?;
-
-            let p = if !child.needed {
-                let p2 = self.size_from_snapshot_later(child_id)?;
-                if p1.total() < p2.total() {
-                    p1
-                } else {
-                    p2
-                }
-            } else {
-                p1
-            };
-            children.push(p);
-        }
-        Ok(SegmentSize {
-            seg_id,
-            method: if seg.needed { WalNeeded } else { Wal },
-            this_size,
-            children,
-        })
-    }
-
-    fn size_from_snapshot_later(&self, seg_id: usize) -> anyhow::Result<SegmentSize> {
-        // If this is needed, then it's time to do the snapshot and continue
-        // with wal method.
-        let seg = &self.segments[seg_id];
-        //eprintln!("snap: seg{}: {} needed: {}", seg_id, seg.children_after.len(), seg.needed);
-        if seg.needed {
-            let mut children = Vec::new();
-
-            for &child_id in seg.children_after.iter() {
-                // try each child both ways
-                let child = &self.segments[child_id];
-                let p1 = self.size_from_wal(child_id)?;
-
-                let p = if !child.needed {
-                    let p2 = self.size_from_snapshot_later(child_id)?;
-                    if p1.total() < p2.total() {
-                        p1
-                    } else {
-                        p2
-                    }
-                } else {
-                    p1
-                };
-                children.push(p);
-            }
-            Ok(SegmentSize {
-                seg_id,
-                method: WalNeeded,
-                this_size: seg.start_size,
-                children,
-            })
-        } else {
-            // If any of the direct children are "needed", need to be able to reconstruct here
-            let mut children_needed = false;
-            for &child in seg.children_after.iter() {
-                let seg = &self.segments[child];
-                if seg.needed {
-                    children_needed = true;
-                    break;
-                }
-            }
-
-            let method1 = if !children_needed {
-                let mut children = Vec::new();
-                for child in seg.children_after.iter() {
-                    children.push(self.size_from_snapshot_later(*child)?);
-                }
-                Some(SegmentSize {
-                    seg_id,
-                    method: Skipped,
-                    this_size: 0,
-                    children,
-                })
-            } else {
-                None
-            };
-
-            // If this a junction, consider snapshotting here
-            let method2 = if children_needed || seg.children_after.len() >= 2 {
-                let mut children = Vec::new();
-                for child in seg.children_after.iter() {
-                    children.push(self.size_from_wal(*child)?);
-                }
-                let Some(this_size) = seg.end_size else { anyhow::bail!("no end_size at junction {seg_id}") };
-                Some(SegmentSize {
-                    seg_id,
-                    method: SnapshotAfter,
-                    this_size,
-                    children,
-                })
-            } else {
-                None
-            };
-
-            Ok(match (method1, method2) {
-                (None, None) => anyhow::bail!(
-                    "neither method was applicable: children_after={}, children_needed={}",
-                    seg.children_after.len(),
-                    children_needed
-                ),
-                (Some(method), None) => method,
-                (None, Some(method)) => method,
-                (Some(method1), Some(method2)) => {
-                    if method1.total() < method2.total() {
-                        method1
-                    } else {
-                        method2
-                    }
-                }
-            })
-        }
-    }
-
-    pub fn into_segments(self) -> Vec<Segment> {
-        self.segments
-    }
-}
diff --git a/libs/tenant_size_model/src/main.rs b/libs/tenant_size_model/src/main.rs
deleted file mode 100644
index e32dd055f4..0000000000
--- a/libs/tenant_size_model/src/main.rs
+++ /dev/null
@@ -1,269 +0,0 @@
-//! Tenant size model testing ground.
-//!
-//! Has a number of scenarios and a `main` for invoking these by number, calculating the history
-//! size, outputs graphviz graph. Makefile in directory shows how to use graphviz to turn scenarios
-//! into pngs.
-
-use tenant_size_model::{Segment, SegmentSize, Storage};
-
-// Main branch only. Some updates on it.
-fn scenario_1() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-// Main branch only. Some updates on it.
-fn scenario_2() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    // Branch
-    storage.branch("main", "child")?;
-    storage.update("child", 1_000)?;
-
-    // More updates on parent
-    storage.update("main", 1_000)?;
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-// Like 2, but more updates on main
-fn scenario_3() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    // Branch
-    storage.branch("main", "child")?;
-    storage.update("child", 1_000)?;
-
-    // More updates on parent
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-// Diverged branches
-fn scenario_4() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    // Create main branch
-    let mut storage = Storage::new("main");
-
-    // Bulk load 5 GB of data to it
-    storage.insert("main", 5_000)?;
-
-    // Stream of updates
-    for _ in 0..5 {
-        storage.update("main", 1_000)?;
-    }
-
-    // Branch
-    storage.branch("main", "child")?;
-    storage.update("child", 1_000)?;
-
-    // More updates on parent
-    for _ in 0..8 {
-        storage.update("main", 1_000)?;
-    }
-
-    let size = storage.calculate(1000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-fn scenario_5() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    let mut storage = Storage::new("a");
-    storage.insert("a", 5000)?;
-    storage.branch("a", "b")?;
-    storage.update("b", 4000)?;
-    storage.update("a", 2000)?;
-    storage.branch("a", "c")?;
-    storage.insert("c", 4000)?;
-    storage.insert("a", 2000)?;
-
-    let size = storage.calculate(5000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-fn scenario_6() -> anyhow::Result<(Vec<Segment>, SegmentSize)> {
-    use std::borrow::Cow;
-
-    const NO_OP: Cow<'static, str> = Cow::Borrowed("");
-
-    let branches = [
-        Some(0x7ff1edab8182025f15ae33482edb590a_u128),
-        Some(0xb1719e044db05401a05a2ed588a3ad3f),
-        Some(0xb68d6691c895ad0a70809470020929ef),
-    ];
-
-    // compared to other scenarios, this one uses bytes instead of kB
-
-    let mut storage = Storage::new(None);
-
-    storage.branch(&None, branches[0])?; // at 0
-    storage.modify_branch(&branches[0], NO_OP, 108951064, 43696128)?; // at 108951064
-    storage.branch(&branches[0], branches[1])?; // at 108951064
-    storage.modify_branch(&branches[1], NO_OP, 15560408, -1851392)?; // at 124511472
-    storage.modify_branch(&branches[0], NO_OP, 174464360, -1531904)?; // at 283415424
-    storage.branch(&branches[0], branches[2])?; // at 283415424
-    storage.modify_branch(&branches[2], NO_OP, 15906192, 8192)?; // at 299321616
-    storage.modify_branch(&branches[0], NO_OP, 18909976, 32768)?; // at 302325400
-
-    let size = storage.calculate(100_000)?;
-
-    Ok((storage.into_segments(), size))
-}
-
-fn main() {
-    let args: Vec<String> = std::env::args().collect();
-
-    let scenario = if args.len() < 2 { "1" } else { &args[1] };
-
-    let (segments, size) = match scenario {
-        "1" => scenario_1(),
-        "2" => scenario_2(),
-        "3" => scenario_3(),
-        "4" => scenario_4(),
-        "5" => scenario_5(),
-        "6" => scenario_6(),
-        other => {
-            eprintln!("invalid scenario {}", other);
-            std::process::exit(1);
-        }
-    }
-    .unwrap();
-
-    graphviz_tree(&segments, &size);
-}
-
-fn graphviz_recurse(segments: &[Segment], node: &SegmentSize) {
-    use tenant_size_model::SegmentMethod::*;
-
-    let seg_id = node.seg_id;
-    let seg = segments.get(seg_id).unwrap();
-    let lsn = seg.end_lsn;
-    let size = seg.end_size.unwrap_or(0);
-    let method = node.method;
-
-    println!("  {{");
-    println!("    node [width=0.1 height=0.1 shape=oval]");
-
-    let tenant_size = node.total_children();
-
-    let penwidth = if seg.needed { 6 } else { 3 };
-    let x = match method {
-        SnapshotAfter =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" style=filled penwidth={penwidth}"),
-        Wal =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
-        WalNeeded =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"black\" penwidth={penwidth}"),
-        Skipped =>
-            format!("label=\"lsn: {lsn}\\nsize: {size}\\ntenant_size: {tenant_size}\" color=\"gray\" penwidth={penwidth}"),
-    };
-
-    println!("    \"seg{seg_id}\" [{x}]");
-    println!("  }}");
-
-    // Recurse. Much of the data is actually on the edge
-    for child in node.children.iter() {
-        let child_id = child.seg_id;
-        graphviz_recurse(segments, child);
-
-        let edge_color = match child.method {
-            SnapshotAfter => "gray",
-            Wal => "black",
-            WalNeeded => "black",
-            Skipped => "gray",
-        };
-
-        println!("  {{");
-        println!("    edge [] ");
-        print!("    \"seg{seg_id}\" -> \"seg{child_id}\" [");
-        print!("color={edge_color}");
-        if child.method == WalNeeded {
-            print!(" penwidth=6");
-        }
-        if child.method == Wal {
-            print!(" penwidth=3");
-        }
-
-        let next = segments.get(child_id).unwrap();
-
-        if next.op.is_empty() {
-            print!(
-                " label=\"{} / {}\"",
-                next.end_lsn - seg.end_lsn,
-                (next.end_size.unwrap_or(0) as i128 - seg.end_size.unwrap_or(0) as i128)
-            );
-        } else {
-            print!(" label=\"{}: {}\"", next.op, next.end_lsn - seg.end_lsn);
-        }
-        println!("]");
-        println!("  }}");
-    }
-}
-
-fn graphviz_tree(segments: &[Segment], tree: &SegmentSize) {
-    println!("digraph G {{");
-    println!("  fontname=\"Helvetica,Arial,sans-serif\"");
-    println!("  node [fontname=\"Helvetica,Arial,sans-serif\"]");
-    println!("  edge [fontname=\"Helvetica,Arial,sans-serif\"]");
-    println!("  graph [center=1 rankdir=LR]");
-    println!("  edge [dir=none]");
-
-    graphviz_recurse(segments, tree);
-
-    println!("}}");
-}
-
-#[test]
-fn scenarios_return_same_size() {
-    type ScenarioFn = fn() -> anyhow::Result<(Vec<Segment>, SegmentSize)>;
-    let truths: &[(u32, ScenarioFn, _)] = &[
-        (line!(), scenario_1, 8000),
-        (line!(), scenario_2, 9000),
-        (line!(), scenario_3, 13000),
-        (line!(), scenario_4, 16000),
-        (line!(), scenario_5, 17000),
-        (line!(), scenario_6, 333_792_000),
-    ];
-
-    for (line, scenario, expected) in truths {
-        let (_, size) = scenario().unwrap();
-        assert_eq!(*expected, size.total_children(), "scenario on line {line}");
-    }
-}
diff --git a/libs/tenant_size_model/src/svg.rs b/libs/tenant_size_model/src/svg.rs
new file mode 100644
index 0000000000..f26d3aa79d
--- /dev/null
+++ b/libs/tenant_size_model/src/svg.rs
@@ -0,0 +1,193 @@
+use crate::{SegmentMethod, SegmentSizeResult, SizeResult, StorageModel};
+use std::fmt::Write;
+
+const SVG_WIDTH: f32 = 500.0;
+
+struct SvgDraw<'a> {
+    storage: &'a StorageModel,
+    branches: &'a [String],
+    seg_to_branch: &'a [usize],
+    sizes: &'a [SegmentSizeResult],
+
+    // layout
+    xscale: f32,
+    min_lsn: u64,
+    seg_coordinates: Vec<(f32, f32)>,
+}
+
+fn draw_legend(result: &mut String) -> anyhow::Result<()> {
+    writeln!(
+        result,
+        "<circle cx=\"10\" cy=\"10\" r=\"5\" stroke=\"red\"/>"
+    )?;
+    writeln!(result, "<text x=\"20\" y=\"15\">logical snapshot</text>")?;
+    writeln!(
+        result,
+        "<line x1=\"5\" y1=\"30\" x2=\"15\" y2=\"30\" stroke-width=\"6\" stroke=\"black\" />"
+    )?;
+    writeln!(
+        result,
+        "<text x=\"20\" y=\"35\">WAL within retention period</text>"
+    )?;
+    writeln!(
+        result,
+        "<line x1=\"5\" y1=\"50\" x2=\"15\" y2=\"50\" stroke-width=\"3\" stroke=\"black\" />"
+    )?;
+    writeln!(
+        result,
+        "<text x=\"20\" y=\"55\">WAL retained to avoid copy</text>"
+    )?;
+    writeln!(
+        result,
+        "<line x1=\"5\" y1=\"70\" x2=\"15\" y2=\"70\" stroke-width=\"1\" stroke=\"gray\" />"
+    )?;
+    writeln!(result, "<text x=\"20\" y=\"75\">WAL not retained</text>")?;
+    Ok(())
+}
+
+pub fn draw_svg(
+    storage: &StorageModel,
+    branches: &[String],
+    seg_to_branch: &[usize],
+    sizes: &SizeResult,
+) -> anyhow::Result<String> {
+    let mut draw = SvgDraw {
+        storage,
+        branches,
+        seg_to_branch,
+        sizes: &sizes.segments,
+
+        xscale: 0.0,
+        min_lsn: 0,
+        seg_coordinates: Vec::new(),
+    };
+
+    let mut result = String::new();
+
+    writeln!(result, "<svg xmlns=\"http://www.w3.org/2000/svg\" xmlns:xlink=\"http://www.w3.org/1999/xlink\" height=\"300\" width=\"500\">")?;
+
+    draw.calculate_svg_layout();
+
+    // Draw the tree
+    for (seg_id, _seg) in storage.segments.iter().enumerate() {
+        draw.draw_seg_phase1(seg_id, &mut result)?;
+    }
+
+    // Draw snapshots
+    for (seg_id, _seg) in storage.segments.iter().enumerate() {
+        draw.draw_seg_phase2(seg_id, &mut result)?;
+    }
+
+    draw_legend(&mut result)?;
+
+    write!(result, "</svg>")?;
+
+    Ok(result)
+}
+
+impl<'a> SvgDraw<'a> {
+    fn calculate_svg_layout(&mut self) {
+        // Find x scale
+        let segments = &self.storage.segments;
+        let min_lsn = segments.iter().map(|s| s.lsn).fold(u64::MAX, std::cmp::min);
+        let max_lsn = segments.iter().map(|s| s.lsn).fold(0, std::cmp::max);
+
+        // Start with 1 pixel = 1 byte. Double the scale until it fits into the image
+        let mut xscale = 1.0;
+        while (max_lsn - min_lsn) as f32 / xscale > SVG_WIDTH {
+            xscale *= 2.0;
+        }
+
+        // Layout the timelines on Y dimension.
+        // TODO
+        let mut y = 100.0;
+        let mut branch_y_coordinates = Vec::new();
+        for _branch in self.branches {
+            branch_y_coordinates.push(y);
+            y += 40.0;
+        }
+
+        // Calculate coordinates for each point
+        let seg_coordinates = std::iter::zip(segments, self.seg_to_branch)
+            .map(|(seg, branch_id)| {
+                let x = (seg.lsn - min_lsn) as f32 / xscale;
+                let y = branch_y_coordinates[*branch_id];
+                (x, y)
+            })
+            .collect();
+
+        self.xscale = xscale;
+        self.min_lsn = min_lsn;
+        self.seg_coordinates = seg_coordinates;
+    }
+
+    /// Draws lines between points
+    fn draw_seg_phase1(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
+        let seg = &self.storage.segments[seg_id];
+
+        let wal_bytes = if let Some(parent_id) = seg.parent {
+            seg.lsn - self.storage.segments[parent_id].lsn
+        } else {
+            0
+        };
+
+        let style = match self.sizes[seg_id].method {
+            SegmentMethod::SnapshotHere => "stroke-width=\"1\" stroke=\"gray\"",
+            SegmentMethod::Wal if seg.needed && wal_bytes > 0 => {
+                "stroke-width=\"6\" stroke=\"black\""
+            }
+            SegmentMethod::Wal => "stroke-width=\"3\" stroke=\"black\"",
+            SegmentMethod::Skipped => "stroke-width=\"1\" stroke=\"gray\"",
+        };
+        if let Some(parent_id) = seg.parent {
+            let (x1, y1) = self.seg_coordinates[parent_id];
+            let (x2, y2) = self.seg_coordinates[seg_id];
+
+            writeln!(
+                result,
+                "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
+            )?;
+            writeln!(
+                result,
+                "  <title>{wal_bytes} bytes of WAL (seg {seg_id})</title>"
+            )?;
+            writeln!(result, "</line>")?;
+        } else {
+            // draw a little dash to mark the starting point of this branch
+            let (x, y) = self.seg_coordinates[seg_id];
+            let (x1, y1) = (x, y - 5.0);
+            let (x2, y2) = (x, y + 5.0);
+
+            writeln!(
+                result,
+                "<line x1=\"{x1}\" y1=\"{y1}\" x2=\"{x2}\" y2=\"{y2}\" {style}>",
+            )?;
+            writeln!(result, "  <title>(seg {seg_id})</title>")?;
+            writeln!(result, "</line>")?;
+        }
+
+        Ok(())
+    }
+
+    /// Draw circles where snapshots are taken
+    fn draw_seg_phase2(&self, seg_id: usize, result: &mut String) -> anyhow::Result<()> {
+        let seg = &self.storage.segments[seg_id];
+
+        // draw a snapshot point if it's needed
+        let (coord_x, coord_y) = self.seg_coordinates[seg_id];
+        if self.sizes[seg_id].method == SegmentMethod::SnapshotHere {
+            writeln!(
+                result,
+                "<circle cx=\"{coord_x}\" cy=\"{coord_y}\" r=\"5\" stroke=\"red\">",
+            )?;
+            writeln!(
+                result,
+                "  <title>logical size {}</title>",
+                seg.size.unwrap()
+            )?;
+            write!(result, "</circle>")?;
+        }
+
+        Ok(())
+    }
+}
diff --git a/libs/tenant_size_model/tests/tests.rs b/libs/tenant_size_model/tests/tests.rs
new file mode 100644
index 0000000000..7660d41c56
--- /dev/null
+++ b/libs/tenant_size_model/tests/tests.rs
@@ -0,0 +1,313 @@
+//! Tenant size model tests.
+
+use tenant_size_model::{Segment, SizeResult, StorageModel};
+
+use std::collections::HashMap;
+
+struct ScenarioBuilder {
+    segments: Vec<Segment>,
+
+    /// Mapping from the branch name to the index of a segment describing its latest state.
+    branches: HashMap<String, usize>,
+}
+
+impl ScenarioBuilder {
+    /// Creates a new storage with the given default branch name.
+    pub fn new(initial_branch: &str) -> ScenarioBuilder {
+        let init_segment = Segment {
+            parent: None,
+            lsn: 0,
+            size: Some(0),
+            needed: false, // determined later
+        };
+
+        ScenarioBuilder {
+            segments: vec![init_segment],
+            branches: HashMap::from([(initial_branch.into(), 0)]),
+        }
+    }
+
+    /// Advances the branch with the named operation, by the relative LSN and logical size bytes.
+    pub fn modify_branch(&mut self, branch: &str, lsn_bytes: u64, size_bytes: i64) {
+        let lastseg_id = *self.branches.get(branch).unwrap();
+        let newseg_id = self.segments.len();
+        let lastseg = &mut self.segments[lastseg_id];
+
+        let newseg = Segment {
+            parent: Some(lastseg_id),
+            lsn: lastseg.lsn + lsn_bytes,
+            size: Some((lastseg.size.unwrap() as i64 + size_bytes) as u64),
+            needed: false,
+        };
+
+        self.segments.push(newseg);
+        *self.branches.get_mut(branch).expect("read already") = newseg_id;
+    }
+
+    pub fn insert(&mut self, branch: &str, bytes: u64) {
+        self.modify_branch(branch, bytes, bytes as i64);
+    }
+
+    pub fn update(&mut self, branch: &str, bytes: u64) {
+        self.modify_branch(branch, bytes, 0i64);
+    }
+
+    pub fn _delete(&mut self, branch: &str, bytes: u64) {
+        self.modify_branch(branch, bytes, -(bytes as i64));
+    }
+
+    /// Panics if the parent branch cannot be found.
+    pub fn branch(&mut self, parent: &str, name: &str) {
+        // Find the right segment
+        let branchseg_id = *self
+            .branches
+            .get(parent)
+            .expect("should had found the parent by key");
+        let _branchseg = &mut self.segments[branchseg_id];
+
+        // Create branch name for it
+        self.branches.insert(name.to_string(), branchseg_id);
+    }
+
+    pub fn calculate(&mut self, retention_period: u64) -> (StorageModel, SizeResult) {
+        // Phase 1: Mark all the segments that need to be retained
+        for (_branch, &last_seg_id) in self.branches.iter() {
+            let last_seg = &self.segments[last_seg_id];
+            let cutoff_lsn = last_seg.lsn.saturating_sub(retention_period);
+            let mut seg_id = last_seg_id;
+            loop {
+                let seg = &mut self.segments[seg_id];
+                if seg.lsn <= cutoff_lsn {
+                    break;
+                }
+                seg.needed = true;
+                if let Some(prev_seg_id) = seg.parent {
+                    seg_id = prev_seg_id;
+                } else {
+                    break;
+                }
+            }
+        }
+
+        // Perform the calculation
+        let storage_model = StorageModel {
+            segments: self.segments.clone(),
+        };
+        let size_result = storage_model.calculate();
+        (storage_model, size_result)
+    }
+}
+
+// Main branch only. Some updates on it.
+#[test]
+fn scenario_1() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Calculate the synthetic size with retention horizon 1000
+    let (_model, result) = scenario.calculate(1000);
+
+    // The end of the branch is at LSN 10000. Need to retain
+    // a logical snapshot at LSN 9000, plus the WAL between 9000-10000.
+    // The logical snapshot has size 5000.
+    assert_eq!(result.total_size, 5000 + 1000);
+}
+
+// Main branch only. Some updates on it.
+#[test]
+fn scenario_2() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Branch
+    scenario.branch("main", "child");
+    scenario.update("child", 1_000);
+
+    // More updates on parent
+    scenario.update("main", 1_000);
+
+    //
+    // The history looks like this now:
+    //
+    //         10000          11000
+    // *----*----*--------------*    main
+    //           |
+    //           |            11000
+    //           +--------------     child
+    //
+    //
+    // With retention horizon 1000, we need to retain logical snapshot
+    // at the branch point, size 5000, and the WAL from 10000-11000 on
+    // both branches.
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 5000 + 1000 + 1000);
+}
+
+// Like 2, but more updates on main
+#[test]
+fn scenario_3() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Branch
+    scenario.branch("main", "child");
+    scenario.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    //
+    // The history looks like this now:
+    //
+    //         10000                                 15000
+    // *----*----*------------------------------------*    main
+    //           |
+    //           |            11000
+    //           +--------------     child
+    //
+    //
+    // With retention horizon 1000, it's still cheapest to retain
+    // - snapshot at branch point (size 5000)
+    // - WAL on child between 10000-11000
+    // - WAL on main between 10000-15000
+    //
+    // This is in total 5000 + 1000 + 5000
+    //
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 5000 + 1000 + 5000);
+}
+
+// Diverged branches
+#[test]
+fn scenario_4() {
+    // Create main branch
+    let mut scenario = ScenarioBuilder::new("main");
+
+    // Bulk load 5 GB of data to it
+    scenario.insert("main", 5_000);
+
+    // Stream of updates
+    for _ in 0..5 {
+        scenario.update("main", 1_000);
+    }
+
+    // Branch
+    scenario.branch("main", "child");
+    scenario.update("child", 1_000);
+
+    // More updates on parent
+    for _ in 0..8 {
+        scenario.update("main", 1_000);
+    }
+
+    //
+    // The history looks like this now:
+    //
+    //         10000                                 18000
+    // *----*----*------------------------------------*    main
+    //           |
+    //           |            11000
+    //           +--------------     child
+    //
+    //
+    // With retention horizon 1000, it's now cheapest to retain
+    // separate snapshots on both branches:
+    // - snapshot on main branch at LSN 17000 (size 5000)
+    // - WAL on main between 17000-18000
+    // - snapshot on child branch at LSN 10000 (size 5000)
+    // - WAL on child between 10000-11000
+    //
+    // This is in total 5000 + 1000 + 5000 + 1000 = 12000
+    //
+    // (If we used the the method from the previous scenario, and
+    // kept only snapshot at the branch point, we'd need to keep
+    // all the WAL between 10000-18000 on the main branch, so
+    // the total size would be 5000 + 1000 + 8000 = 14000. The
+    // calculation always picks the cheapest alternative)
+
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 5000 + 1000 + 5000 + 1000);
+}
+
+#[test]
+fn scenario_5() {
+    let mut scenario = ScenarioBuilder::new("a");
+    scenario.insert("a", 5000);
+    scenario.branch("a", "b");
+    scenario.update("b", 4000);
+    scenario.update("a", 2000);
+    scenario.branch("a", "c");
+    scenario.insert("c", 4000);
+    scenario.insert("a", 2000);
+
+    let (_model, result) = scenario.calculate(1000);
+
+    assert_eq!(result.total_size, 17000);
+}
+
+#[test]
+fn scenario_6() {
+    let branches = [
+        "7ff1edab8182025f15ae33482edb590a",
+        "b1719e044db05401a05a2ed588a3ad3f",
+        "0xb68d6691c895ad0a70809470020929ef",
+    ];
+
+    // compared to other scenarios, this one uses bytes instead of kB
+
+    let mut scenario = ScenarioBuilder::new("");
+
+    scenario.branch("", branches[0]); // at 0
+    scenario.modify_branch(branches[0], 108951064, 43696128); // at 108951064
+    scenario.branch(branches[0], branches[1]); // at 108951064
+    scenario.modify_branch(branches[1], 15560408, -1851392); // at 124511472
+    scenario.modify_branch(branches[0], 174464360, -1531904); // at 283415424
+    scenario.branch(branches[0], branches[2]); // at 283415424
+    scenario.modify_branch(branches[2], 15906192, 8192); // at 299321616
+    scenario.modify_branch(branches[0], 18909976, 32768); // at 302325400
+
+    let (model, result) = scenario.calculate(100_000);
+
+    // FIXME: We previously calculated 333_792_000. But with this PR, we get
+    // a much lower number. At a quick look at the model output and the
+    // calculations here, the new result seems correct to me.
+    eprintln!(
+        " MODEL: {}",
+        serde_json::to_string(&model.segments).unwrap()
+    );
+    eprintln!(
+        "RESULT: {}",
+        serde_json::to_string(&result.segments).unwrap()
+    );
+
+    assert_eq!(result.total_size, 136_236_928);
+}
diff --git a/pageserver/src/http/openapi_spec.yml b/pageserver/src/http/openapi_spec.yml
index fc271fe83b..e68ceb2dc6 100644
--- a/pageserver/src/http/openapi_spec.yml
+++ b/pageserver/src/http/openapi_spec.yml
@@ -437,6 +437,13 @@ paths:
           type: boolean
         description: |
           When true, skip calculation and only provide the model inputs (for debugging). Defaults to false.
+      - name: retention_period
+        in: query
+        required: false
+        schema:
+          type: integer
+        description: |
+          Override the default retention period (in bytes) used for size calculation.
     get:
       description: |
         Calculate tenant's size, which is a mixture of WAL (bytes) and logical_size (bytes).
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 6a9232e097..7cd7e81fe1 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -7,6 +7,7 @@ use hyper::{Body, Request, Response, Uri};
 use metrics::launch_timestamp::LaunchTimestamp;
 use pageserver_api::models::DownloadRemoteLayersTaskSpawnRequest;
 use remote_storage::GenericRemoteStorage;
+use tenant_size_model::{SizeResult, StorageModel};
 use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::http::request::{get_request_param, must_get_query_param, parse_query_param};
@@ -20,6 +21,7 @@ use crate::pgdatadir_mapping::LsnForTimestamp;
 use crate::task_mgr::TaskKind;
 use crate::tenant::config::TenantConfOpt;
 use crate::tenant::mgr::TenantMapInsertError;
+use crate::tenant::size::ModelInputs;
 use crate::tenant::storage_layer::LayerAccessStatsReset;
 use crate::tenant::{PageReconstructError, Timeline};
 use crate::{config::PageServerConf, tenant::mgr};
@@ -479,11 +481,19 @@ async fn tenant_status(request: Request<Body>) -> Result<Response<Body>, ApiErro
 /// to debug any of the calculations. Requires `tenant_id` request parameter, supports
 /// `inputs_only=true|false` (default false) which supports debugging failure to calculate model
 /// values.
+///
+/// 'retention_period' query parameter overrides the cutoff that is used to calculate the size
+/// (only if it is shorter than the real cutoff).
+///
+/// Note: we don't update the cached size and prometheus metric here.
+/// The retention period might be different, and it's nice to have a method to just calculate it
+/// without modifying anything anyway.
 async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     check_permission(&request, Some(tenant_id))?;
-
     let inputs_only: Option<bool> = parse_query_param(&request, "inputs_only")?;
+    let retention_period: Option<u64> = parse_query_param(&request, "retention_period")?;
+    let headers = request.headers();
 
     let ctx = RequestContext::new(TaskKind::MgmtRequest, DownloadBehavior::Download);
     let tenant = mgr::get_tenant(tenant_id, true)
@@ -492,24 +502,29 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
 
     // this can be long operation
     let inputs = tenant
-        .gather_size_inputs(&ctx)
+        .gather_size_inputs(retention_period, &ctx)
         .await
         .map_err(ApiError::InternalServerError)?;
 
-    let size = if !inputs_only.unwrap_or(false) {
-        Some(
-            tenant
-                .calc_and_update_cached_synthetic_size(&inputs)
-                .map_err(ApiError::InternalServerError)?,
-        )
-    } else {
-        None
-    };
+    let mut sizes = None;
+    if !inputs_only.unwrap_or(false) {
+        let storage_model = inputs
+            .calculate_model()
+            .map_err(ApiError::InternalServerError)?;
+        let size = storage_model.calculate();
 
-    /// Private response type with the additional "unstable" `inputs` field.
-    ///
-    /// The type is described with `id` and `size` in the openapi_spec file, but the `inputs` is
-    /// intentionally left out. The type resides in the pageserver not to expose `ModelInputs`.
+        // If request header expects html, return html
+        if headers["Accept"] == "text/html" {
+            return synthetic_size_html_response(inputs, storage_model, size);
+        }
+        sizes = Some(size);
+    } else if headers["Accept"] == "text/html" {
+        return Err(ApiError::BadRequest(anyhow!(
+            "inputs_only parameter is incompatible with html output request"
+        )));
+    }
+
+    /// The type resides in the pageserver not to expose `ModelInputs`.
     #[serde_with::serde_as]
     #[derive(serde::Serialize)]
     struct TenantHistorySize {
@@ -519,6 +534,9 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
         ///
         /// Will be none if `?inputs_only=true` was given.
         size: Option<u64>,
+        /// Size of each segment used in the model.
+        /// Will be null if `?inputs_only=true` was given.
+        segment_sizes: Option<Vec<tenant_size_model::SegmentSizeResult>>,
         inputs: crate::tenant::size::ModelInputs,
     }
 
@@ -526,7 +544,8 @@ async fn tenant_size_handler(request: Request<Body>) -> Result<Response<Body>, A
         StatusCode::OK,
         TenantHistorySize {
             id: tenant_id,
-            size,
+            size: sizes.as_ref().map(|x| x.total_size),
+            segment_sizes: sizes.map(|x| x.segments),
             inputs,
         },
     )
@@ -591,6 +610,62 @@ async fn evict_timeline_layer_handler(request: Request<Body>) -> Result<Response
     }
 }
 
+/// Get tenant_size SVG graph along with the JSON data.
+fn synthetic_size_html_response(
+    inputs: ModelInputs,
+    storage_model: StorageModel,
+    sizes: SizeResult,
+) -> Result<Response<Body>, ApiError> {
+    let mut timeline_ids: Vec<String> = Vec::new();
+    let mut timeline_map: HashMap<TimelineId, usize> = HashMap::new();
+    for (index, ti) in inputs.timeline_inputs.iter().enumerate() {
+        timeline_map.insert(ti.timeline_id, index);
+        timeline_ids.push(ti.timeline_id.to_string());
+    }
+    let seg_to_branch: Vec<usize> = inputs
+        .segments
+        .iter()
+        .map(|seg| *timeline_map.get(&seg.timeline_id).unwrap())
+        .collect();
+
+    let svg =
+        tenant_size_model::svg::draw_svg(&storage_model, &timeline_ids, &seg_to_branch, &sizes)
+            .map_err(ApiError::InternalServerError)?;
+
+    let mut response = String::new();
+
+    use std::fmt::Write;
+    write!(response, "<html>\n<body>\n").unwrap();
+    write!(response, "<div>\n{svg}\n</div>").unwrap();
+    writeln!(response, "Project size: {}", sizes.total_size).unwrap();
+    writeln!(response, "<pre>").unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&inputs).unwrap()
+    )
+    .unwrap();
+    writeln!(
+        response,
+        "{}",
+        serde_json::to_string_pretty(&sizes.segments).unwrap()
+    )
+    .unwrap();
+    writeln!(response, "</pre>").unwrap();
+    write!(response, "</body>\n</html>\n").unwrap();
+
+    html_response(StatusCode::OK, response)
+}
+
+pub fn html_response(status: StatusCode, data: String) -> Result<Response<Body>, ApiError> {
+    let response = Response::builder()
+        .status(status)
+        .header(hyper::header::CONTENT_TYPE, "text/html")
+        .body(Body::from(data.as_bytes().to_vec()))
+        .map_err(|e| ApiError::InternalServerError(e.into()))?;
+    Ok(response)
+}
+
 // Helper function to standardize the error messages we produce on bad durations
 //
 // Intended to be used with anyhow's `with_context`, e.g.:
@@ -1019,7 +1094,7 @@ pub fn make_router(
         .get("/v1/tenant", tenant_list_handler)
         .post("/v1/tenant", tenant_create_handler)
         .get("/v1/tenant/:tenant_id", tenant_status)
-        .get("/v1/tenant/:tenant_id/size", tenant_size_handler)
+        .get("/v1/tenant/:tenant_id/synthetic_size", tenant_size_handler)
         .put("/v1/tenant/config", update_tenant_config_handler)
         .get("/v1/tenant/:tenant_id/config", get_tenant_config_handler)
         .get("/v1/tenant/:tenant_id/timeline", timeline_list_handler)
diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 23210b98d5..9e9c98ad62 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -2418,6 +2418,9 @@ impl Tenant {
     #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
     pub async fn gather_size_inputs(
         &self,
+        // `max_retention_period` overrides the cutoff that is used to calculate the size
+        // (only if it is shorter than the real cutoff).
+        max_retention_period: Option<u64>,
         ctx: &RequestContext,
     ) -> anyhow::Result<size::ModelInputs> {
         let logical_sizes_at_once = self
@@ -2425,32 +2428,41 @@ impl Tenant {
             .concurrent_tenant_size_logical_size_queries
             .inner();
 
-        // TODO: Having a single mutex block concurrent reads is unfortunate, but since the queries
-        // are for testing/experimenting, we tolerate this.
+        // TODO: Having a single mutex block concurrent reads is not great for performance.
+        //
+        // But the only case where we need to run multiple of these at once is when we
+        // request a size for a tenant manually via API, while another background calculation
+        // is in progress (which is not a common case).
         //
         // See more for on the issue #2748 condenced out of the initial PR review.
         let mut shared_cache = self.cached_logical_sizes.lock().await;
 
-        size::gather_inputs(self, logical_sizes_at_once, &mut shared_cache, ctx).await
+        size::gather_inputs(
+            self,
+            logical_sizes_at_once,
+            max_retention_period,
+            &mut shared_cache,
+            ctx,
+        )
+        .await
     }
 
-    /// Calculate synthetic tenant size
+    /// Calculate synthetic tenant size and cache the result.
     /// This is periodically called by background worker.
     /// result is cached in tenant struct
     #[instrument(skip_all, fields(tenant_id=%self.tenant_id))]
     pub async fn calculate_synthetic_size(&self, ctx: &RequestContext) -> anyhow::Result<u64> {
-        let inputs = self.gather_size_inputs(ctx).await?;
+        let inputs = self.gather_size_inputs(None, ctx).await?;
 
-        self.calc_and_update_cached_synthetic_size(&inputs)
-    }
-
-    /// Calculate synthetic size , cache it and set metric value
-    pub fn calc_and_update_cached_synthetic_size(
-        &self,
-        inputs: &size::ModelInputs,
-    ) -> anyhow::Result<u64> {
         let size = inputs.calculate()?;
 
+        self.set_cached_synthetic_size(size);
+
+        Ok(size)
+    }
+
+    /// Cache given synthetic size and update the metric value
+    pub fn set_cached_synthetic_size(&self, size: u64) {
         self.cached_synthetic_tenant_size
             .store(size, Ordering::Relaxed);
 
@@ -2458,8 +2470,6 @@ impl Tenant {
             .get_metric_with_label_values(&[&self.tenant_id.to_string()])
             .unwrap()
             .set(size);
-
-        Ok(size)
     }
 
     pub fn get_cached_synthetic_size(&self) -> u64 {
diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 2fed4f88b3..2c5efe283b 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -1,8 +1,9 @@
 use std::cmp;
+use std::collections::hash_map::Entry;
 use std::collections::{HashMap, HashSet};
 use std::sync::Arc;
 
-use anyhow::Context;
+use anyhow::{bail, Context};
 use tokio::sync::oneshot::error::RecvError;
 use tokio::sync::Semaphore;
 
@@ -10,35 +11,80 @@ use crate::context::RequestContext;
 use crate::pgdatadir_mapping::CalculateLogicalSizeError;
 
 use super::Tenant;
+use crate::tenant::Timeline;
 use utils::id::TimelineId;
 use utils::lsn::Lsn;
 
 use tracing::*;
 
+use tenant_size_model::{Segment, StorageModel};
+
 /// Inputs to the actual tenant sizing model
 ///
 /// Implements [`serde::Serialize`] but is not meant to be part of the public API, instead meant to
 /// be a transferrable format between execution environments and developer.
+///
+/// This tracks more information than the actual StorageModel that calculation
+/// needs. We will convert this into a StorageModel when it's time to perform
+/// the calculation.
+///
 #[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
 pub struct ModelInputs {
-    updates: Vec<Update>,
-    retention_period: u64,
+    pub segments: Vec<SegmentMeta>,
+    pub timeline_inputs: Vec<TimelineInputs>,
+}
 
-    /// Relevant lsns per timeline.
-    ///
-    /// This field is not required for deserialization purposes, which is mostly used in tests. The
-    /// LSNs explain the outcome (updates) but are not needed in size calculation.
-    #[serde_as(as = "HashMap<serde_with::DisplayFromStr, _>")]
-    #[serde(default)]
-    timeline_inputs: HashMap<TimelineId, TimelineInputs>,
+/// A [`Segment`], with some extra information for display purposes
+#[serde_with::serde_as]
+#[derive(Debug, serde::Serialize, serde::Deserialize)]
+pub struct SegmentMeta {
+    pub segment: Segment,
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub timeline_id: TimelineId,
+    pub kind: LsnKind,
+}
+
+impl SegmentMeta {
+    fn size_needed(&self) -> bool {
+        match self.kind {
+            LsnKind::BranchStart => {
+                // If we don't have a later GcCutoff point on this branch, and
+                // no ancestor, calculate size for the branch start point.
+                self.segment.needed && self.segment.parent.is_none()
+            }
+            LsnKind::BranchPoint => true,
+            LsnKind::GcCutOff => true,
+            LsnKind::BranchEnd => false,
+        }
+    }
+}
+
+#[derive(
+    Debug, Clone, Copy, Eq, Ord, PartialEq, PartialOrd, serde::Serialize, serde::Deserialize,
+)]
+pub enum LsnKind {
+    /// A timeline starting here
+    BranchStart,
+    /// A child timeline branches off from here
+    BranchPoint,
+    /// GC cutoff point
+    GcCutOff,
+    /// Last record LSN
+    BranchEnd,
 }
 
 /// Collect all relevant LSNs to the inputs. These will only be helpful in the serialized form as
 /// part of [`ModelInputs`] from the HTTP api, explaining the inputs.
 #[serde_with::serde_as]
 #[derive(Debug, serde::Serialize, serde::Deserialize)]
-struct TimelineInputs {
+pub struct TimelineInputs {
+    #[serde_as(as = "serde_with::DisplayFromStr")]
+    pub timeline_id: TimelineId,
+
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    pub ancestor_id: Option<TimelineId>,
+
     #[serde_as(as = "serde_with::DisplayFromStr")]
     ancestor_lsn: Lsn,
     #[serde_as(as = "serde_with::DisplayFromStr")]
@@ -49,118 +95,14 @@ struct TimelineInputs {
     horizon_cutoff: Lsn,
     #[serde_as(as = "serde_with::DisplayFromStr")]
     pitr_cutoff: Lsn,
+
+    /// Cutoff point based on GC settings
     #[serde_as(as = "serde_with::DisplayFromStr")]
     next_gc_cutoff: Lsn,
-}
 
-// Adjust BranchFrom sorting so that we always process ancestor
-// before descendants. This is needed to correctly calculate size of
-// descendant timelines.
-//
-// Note that we may have multiple BranchFroms at the same LSN, so we
-// need to sort them in the tree order.
-//
-// see updates_sort_with_branches_at_same_lsn test below
-fn sort_updates_in_tree_order(updates: Vec<Update>) -> anyhow::Result<Vec<Update>> {
-    let mut sorted_updates = Vec::with_capacity(updates.len());
-    let mut known_timelineids = HashSet::new();
-    let mut i = 0;
-    while i < updates.len() {
-        let curr_upd = &updates[i];
-
-        if let Command::BranchFrom(parent_id) = curr_upd.command {
-            let parent_id = match parent_id {
-                Some(parent_id) if known_timelineids.contains(&parent_id) => {
-                    // we have already processed ancestor
-                    // process this BranchFrom Update normally
-                    known_timelineids.insert(curr_upd.timeline_id);
-                    sorted_updates.push(*curr_upd);
-                    i += 1;
-                    continue;
-                }
-                None => {
-                    known_timelineids.insert(curr_upd.timeline_id);
-                    sorted_updates.push(*curr_upd);
-                    i += 1;
-                    continue;
-                }
-                Some(parent_id) => parent_id,
-            };
-
-            let mut j = i;
-
-            // we have not processed ancestor yet.
-            // there is a chance that it is at the same Lsn
-            if !known_timelineids.contains(&parent_id) {
-                let mut curr_lsn_branchfroms: HashMap<TimelineId, Vec<(TimelineId, usize)>> =
-                    HashMap::new();
-
-                // inspect all branchpoints at the same lsn
-                while j < updates.len() && updates[j].lsn == curr_upd.lsn {
-                    let lookahead_upd = &updates[j];
-                    j += 1;
-
-                    if let Command::BranchFrom(lookahead_parent_id) = lookahead_upd.command {
-                        match lookahead_parent_id {
-                            Some(lookahead_parent_id)
-                                if !known_timelineids.contains(&lookahead_parent_id) =>
-                            {
-                                // we have not processed ancestor yet
-                                // store it for later
-                                let es =
-                                    curr_lsn_branchfroms.entry(lookahead_parent_id).or_default();
-                                es.push((lookahead_upd.timeline_id, j));
-                            }
-                            _ => {
-                                // we have already processed ancestor
-                                // process this BranchFrom Update normally
-                                known_timelineids.insert(lookahead_upd.timeline_id);
-                                sorted_updates.push(*lookahead_upd);
-                            }
-                        }
-                    }
-                }
-
-                // process BranchFroms in the tree order
-                // check that we don't have a cycle if somet entry is orphan
-                // (this should not happen, but better to be safe)
-                let mut processed_some_entry = true;
-                while processed_some_entry {
-                    processed_some_entry = false;
-
-                    curr_lsn_branchfroms.retain(|parent_id, branchfroms| {
-                        if known_timelineids.contains(parent_id) {
-                            for (timeline_id, j) in branchfroms {
-                                known_timelineids.insert(*timeline_id);
-                                sorted_updates.push(updates[*j - 1]);
-                            }
-                            processed_some_entry = true;
-                            false
-                        } else {
-                            true
-                        }
-                    });
-                }
-
-                if !curr_lsn_branchfroms.is_empty() {
-                    // orphans are expected to be rare and transient between tenant reloads
-                    // for example, an broken ancestor without the child branch being broken.
-                    anyhow::bail!(
-                        "orphan branch(es) detected in BranchFroms: {curr_lsn_branchfroms:?}"
-                    );
-                }
-            }
-
-            assert!(j > i);
-            i = j;
-        } else {
-            // not a BranchFrom, keep the same order
-            sorted_updates.push(*curr_upd);
-            i += 1;
-        }
-    }
-
-    Ok(sorted_updates)
+    /// Cutoff point calculated from the user-supplied 'max_retention_period'
+    #[serde_as(as = "Option<serde_with::DisplayFromStr>")]
+    retention_param_cutoff: Option<Lsn>,
 }
 
 /// Gathers the inputs for the tenant sizing model.
@@ -181,257 +123,257 @@ fn sort_updates_in_tree_order(updates: Vec<Update>) -> anyhow::Result<Vec<Update
 pub(super) async fn gather_inputs(
     tenant: &Tenant,
     limit: &Arc<Semaphore>,
+    max_retention_period: Option<u64>,
     logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
     ctx: &RequestContext,
 ) -> anyhow::Result<ModelInputs> {
-    // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
-    // our advantage with `?` error handling.
-    let mut joinset = tokio::task::JoinSet::new();
-
     // refresh is needed to update gc related pitr_cutoff and horizon_cutoff
     tenant
         .refresh_gc_info(ctx)
         .await
         .context("Failed to refresh gc_info before gathering inputs")?;
 
+    // Collect information about all the timelines
     let timelines = tenant.list_timelines();
 
     if timelines.is_empty() {
         // perhaps the tenant has just been created, and as such doesn't have any data yet
         return Ok(ModelInputs {
-            updates: vec![],
-            retention_period: 0,
-            timeline_inputs: HashMap::default(),
+            segments: vec![],
+            timeline_inputs: Vec::new(),
         });
     }
 
+    // Build a map of branch points.
+    let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();
+    for timeline in timelines.iter() {
+        if let Some(ancestor_id) = timeline.get_ancestor_timeline_id() {
+            branchpoints
+                .entry(ancestor_id)
+                .or_default()
+                .insert(timeline.get_ancestor_lsn());
+        }
+    }
+
+    // These become the final result.
+    let mut timeline_inputs = Vec::with_capacity(timelines.len());
+    let mut segments: Vec<SegmentMeta> = Vec::new();
+
+    //
+    // Build Segments representing each timeline. As we do that, also remember
+    // the branchpoints and branch startpoints in 'branchpoint_segments' and
+    // 'branchstart_segments'
+    //
+
+    // BranchPoint segments of each timeline
+    // (timeline, branchpoint LSN) -> segment_id
+    let mut branchpoint_segments: HashMap<(TimelineId, Lsn), usize> = HashMap::new();
+
+    // timeline, Branchpoint seg id, (ancestor, ancestor LSN)
+    type BranchStartSegment = (TimelineId, usize, Option<(TimelineId, Lsn)>);
+    let mut branchstart_segments: Vec<BranchStartSegment> = Vec::new();
+
+    for timeline in timelines.iter() {
+        let timeline_id = timeline.timeline_id;
+        let last_record_lsn = timeline.get_last_record_lsn();
+        let ancestor_lsn = timeline.get_ancestor_lsn();
+
+        // there's a race between the update (holding tenant.gc_lock) and this read but it
+        // might not be an issue, because it's not for Timeline::gc
+        let gc_info = timeline.gc_info.read().unwrap();
+
+        // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
+        // new gc run, which we have no control over. however differently from `Timeline::gc`
+        // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
+        // actually removing files.
+        let mut next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
+
+        // If the caller provided a shorter retention period, use that instead of the GC cutoff.
+        let retention_param_cutoff = if let Some(max_retention_period) = max_retention_period {
+            let param_cutoff = Lsn(last_record_lsn.0.saturating_sub(max_retention_period));
+            if next_gc_cutoff < param_cutoff {
+                next_gc_cutoff = param_cutoff;
+            }
+            Some(param_cutoff)
+        } else {
+            None
+        };
+
+        // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
+        // want to query any logical size before initdb_lsn.
+        let branch_start_lsn = cmp::max(ancestor_lsn, timeline.initdb_lsn);
+
+        // Build "interesting LSNs" on this timeline
+        let mut lsns: Vec<(Lsn, LsnKind)> = gc_info
+            .retain_lsns
+            .iter()
+            .filter(|&&lsn| lsn > ancestor_lsn)
+            .copied()
+            // this assumes there are no other retain_lsns than the branchpoints
+            .map(|lsn| (lsn, LsnKind::BranchPoint))
+            .collect::<Vec<_>>();
+
+        // Add branch points we collected earlier, just in case there were any that were
+        // not present in retain_lsns. We will remove any duplicates below later.
+        if let Some(this_branchpoints) = branchpoints.get(&timeline_id) {
+            lsns.extend(
+                this_branchpoints
+                    .iter()
+                    .map(|lsn| (*lsn, LsnKind::BranchPoint)),
+            )
+        }
+
+        // Add a point for the GC cutoff
+        let branch_start_needed = next_gc_cutoff <= branch_start_lsn;
+        if !branch_start_needed {
+            lsns.push((next_gc_cutoff, LsnKind::GcCutOff));
+        }
+
+        lsns.sort_unstable();
+        lsns.dedup();
+
+        //
+        // Create Segments for the interesting points.
+        //
+
+        // Timeline start point
+        let ancestor = timeline
+            .get_ancestor_timeline_id()
+            .map(|ancestor_id| (ancestor_id, ancestor_lsn));
+        branchstart_segments.push((timeline_id, segments.len(), ancestor));
+        segments.push(SegmentMeta {
+            segment: Segment {
+                parent: None, // filled in later
+                lsn: branch_start_lsn.0,
+                size: None, // filled in later
+                needed: branch_start_needed,
+            },
+            timeline_id: timeline.timeline_id,
+            kind: LsnKind::BranchStart,
+        });
+
+        // GC cutoff point, and any branch points, i.e. points where
+        // other timelines branch off from this timeline.
+        let mut parent = segments.len() - 1;
+        for (lsn, kind) in lsns {
+            if kind == LsnKind::BranchPoint {
+                branchpoint_segments.insert((timeline_id, lsn), segments.len());
+            }
+            segments.push(SegmentMeta {
+                segment: Segment {
+                    parent: Some(parent),
+                    lsn: lsn.0,
+                    size: None,
+                    needed: lsn > next_gc_cutoff,
+                },
+                timeline_id: timeline.timeline_id,
+                kind,
+            });
+            parent += 1;
+        }
+
+        // Current end of the timeline
+        segments.push(SegmentMeta {
+            segment: Segment {
+                parent: Some(parent),
+                lsn: last_record_lsn.0,
+                size: None, // Filled in later, if necessary
+                needed: true,
+            },
+            timeline_id: timeline.timeline_id,
+            kind: LsnKind::BranchEnd,
+        });
+
+        timeline_inputs.push(TimelineInputs {
+            timeline_id: timeline.timeline_id,
+            ancestor_id: timeline.get_ancestor_timeline_id(),
+            ancestor_lsn,
+            last_record: last_record_lsn,
+            // this is not used above, because it might not have updated recently enough
+            latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
+            horizon_cutoff: gc_info.horizon_cutoff,
+            pitr_cutoff: gc_info.pitr_cutoff,
+            next_gc_cutoff,
+            retention_param_cutoff,
+        });
+    }
+
+    // We now have all segments from the timelines in 'segments'. The timelines
+    // haven't been linked to each other yet, though. Do that.
+    for (_timeline_id, seg_id, ancestor) in branchstart_segments {
+        // Look up the branch point
+        if let Some(ancestor) = ancestor {
+            let parent_id = *branchpoint_segments.get(&ancestor).unwrap();
+            segments[seg_id].segment.parent = Some(parent_id);
+        }
+    }
+
+    // We left the 'size' field empty in all of the Segments so far.
+    // Now find logical sizes for all of the points that might need or benefit from them.
+    fill_logical_sizes(&timelines, &mut segments, limit, logical_size_cache, ctx).await?;
+
+    Ok(ModelInputs {
+        segments,
+        timeline_inputs,
+    })
+}
+
+/// Augment 'segments' with logical sizes
+///
+/// this will probably conflict with on-demand downloaded layers, or at least force them all
+/// to be downloaded
+///
+async fn fill_logical_sizes(
+    timelines: &[Arc<Timeline>],
+    segments: &mut [SegmentMeta],
+    limit: &Arc<Semaphore>,
+    logical_size_cache: &mut HashMap<(TimelineId, Lsn), u64>,
+    ctx: &RequestContext,
+) -> anyhow::Result<()> {
+    let timeline_hash: HashMap<TimelineId, Arc<Timeline>> = HashMap::from_iter(
+        timelines
+            .iter()
+            .map(|timeline| (timeline.timeline_id, Arc::clone(timeline))),
+    );
+
     // record the used/inserted cache keys here, to remove extras not to start leaking
     // after initial run the cache should be quite stable, but live timelines will eventually
     // require new lsns to be inspected.
-    let mut needed_cache = HashSet::<(TimelineId, Lsn)>::new();
+    let mut sizes_needed = HashMap::<(TimelineId, Lsn), Option<u64>>::new();
 
-    let mut updates = Vec::new();
+    // with joinset, on drop, all of the tasks will just be de-scheduled, which we can use to
+    // our advantage with `?` error handling.
+    let mut joinset = tokio::task::JoinSet::new();
 
-    // record the per timeline values useful to debug the model inputs, also used to track
-    // ancestor_lsn without keeping a hold of Timeline
-    let mut timeline_inputs = HashMap::with_capacity(timelines.len());
-
-    // used to determine the `retention_period` for the size model
-    let mut max_cutoff_distance = None;
-
-    // mapping from (TimelineId, Lsn) => if this branch point has been handled already via
-    // GcInfo::retain_lsns or if it needs to have its logical_size calculated.
-    let mut referenced_branch_froms = HashMap::<(TimelineId, Lsn), bool>::new();
-
-    for timeline in timelines {
-        if !timeline.is_active() {
-            anyhow::bail!(
-                "timeline {} is not active, cannot calculate tenant_size now",
-                timeline.timeline_id
-            );
+    // For each point that would benefit from having a logical size available,
+    // spawn a Task to fetch it, unless we have it cached already.
+    for seg in segments.iter() {
+        if !seg.size_needed() {
+            continue;
         }
 
-        let last_record_lsn = timeline.get_last_record_lsn();
+        let timeline_id = seg.timeline_id;
+        let lsn = Lsn(seg.segment.lsn);
 
-        let (interesting_lsns, horizon_cutoff, pitr_cutoff, next_gc_cutoff) = {
-            // there's a race between the update (holding tenant.gc_lock) and this read but it
-            // might not be an issue, because it's not for Timeline::gc
-            let gc_info = timeline.gc_info.read().unwrap();
-
-            // similar to gc, but Timeline::get_latest_gc_cutoff_lsn() will not be updated before a
-            // new gc run, which we have no control over. however differently from `Timeline::gc`
-            // we don't consider the `Timeline::disk_consistent_lsn` at all, because we are not
-            // actually removing files.
-            let next_gc_cutoff = cmp::min(gc_info.horizon_cutoff, gc_info.pitr_cutoff);
-
-            // the minimum where we should find the next_gc_cutoff for our calculations.
-            //
-            // next_gc_cutoff in parent branch are not of interest (right now at least), nor do we
-            // want to query any logical size before initdb_lsn.
-            let cutoff_minimum = cmp::max(timeline.get_ancestor_lsn(), timeline.initdb_lsn);
-
-            let maybe_cutoff = if next_gc_cutoff > cutoff_minimum {
-                Some((next_gc_cutoff, LsnKind::GcCutOff))
-            } else {
-                None
-            };
-
-            // this assumes there are no other lsns than the branchpoints
-            let lsns = gc_info
-                .retain_lsns
-                .iter()
-                .inspect(|&&lsn| {
-                    trace!(
-                        timeline_id=%timeline.timeline_id,
-                        "retained lsn: {lsn:?}, is_before_ancestor_lsn={}",
-                        lsn < timeline.get_ancestor_lsn()
-                    )
-                })
-                .filter(|&&lsn| lsn > timeline.get_ancestor_lsn())
-                .copied()
-                .map(|lsn| (lsn, LsnKind::BranchPoint))
-                .chain(maybe_cutoff)
-                .collect::<Vec<_>>();
-
-            (
-                lsns,
-                gc_info.horizon_cutoff,
-                gc_info.pitr_cutoff,
-                next_gc_cutoff,
-            )
-        };
-
-        // update this to have a retention_period later for the tenant_size_model
-        // tenant_size_model compares this to the last segments start_lsn
-        if let Some(cutoff_distance) = last_record_lsn.checked_sub(next_gc_cutoff) {
-            match max_cutoff_distance.as_mut() {
-                Some(max) => {
-                    *max = std::cmp::max(*max, cutoff_distance);
-                }
-                _ => {
-                    max_cutoff_distance = Some(cutoff_distance);
-                }
-            }
-        }
-
-        // all timelines branch from something, because it might be impossible to pinpoint
-        // which is the tenant_size_model's "default" branch.
-
-        let ancestor_lsn = timeline.get_ancestor_lsn();
-
-        updates.push(Update {
-            lsn: ancestor_lsn,
-            command: Command::BranchFrom(timeline.get_ancestor_timeline_id()),
-            timeline_id: timeline.timeline_id,
-        });
-
-        if let Some(parent_timeline_id) = timeline.get_ancestor_timeline_id() {
-            // refresh_gc_info will update branchpoints and pitr_cutoff but only do it for branches
-            // which are over gc_horizon. for example, a "main" branch which never received any
-            // updates apart from initdb not have branch points recorded.
-            referenced_branch_froms
-                .entry((parent_timeline_id, timeline.get_ancestor_lsn()))
-                .or_default();
-        }
-
-        for (lsn, _kind) in &interesting_lsns {
-            // mark this visited so don't need to re-process this parent
-            *referenced_branch_froms
-                .entry((timeline.timeline_id, *lsn))
-                .or_default() = true;
-
-            if let Some(size) = logical_size_cache.get(&(timeline.timeline_id, *lsn)) {
-                updates.push(Update {
-                    lsn: *lsn,
-                    timeline_id: timeline.timeline_id,
-                    command: Command::Update(*size),
-                });
-
-                needed_cache.insert((timeline.timeline_id, *lsn));
-            } else {
-                let timeline = Arc::clone(&timeline);
+        if let Entry::Vacant(e) = sizes_needed.entry((timeline_id, lsn)) {
+            let cached_size = logical_size_cache.get(&(timeline_id, lsn)).cloned();
+            if cached_size.is_none() {
+                let timeline = Arc::clone(timeline_hash.get(&timeline_id).unwrap());
                 let parallel_size_calcs = Arc::clone(limit);
                 let ctx = ctx.attached_child();
                 joinset.spawn(calculate_logical_size(
                     parallel_size_calcs,
                     timeline,
-                    *lsn,
+                    lsn,
                     ctx,
                 ));
             }
-        }
-
-        timeline_inputs.insert(
-            timeline.timeline_id,
-            TimelineInputs {
-                ancestor_lsn,
-                last_record: last_record_lsn,
-                // this is not used above, because it might not have updated recently enough
-                latest_gc_cutoff: *timeline.get_latest_gc_cutoff_lsn(),
-                horizon_cutoff,
-                pitr_cutoff,
-                next_gc_cutoff,
-            },
-        );
-    }
-
-    // iterate over discovered branch points and make sure we are getting logical sizes at those
-    // points.
-    for ((timeline_id, lsn), handled) in referenced_branch_froms.iter() {
-        if *handled {
-            continue;
-        }
-
-        let timeline_id = *timeline_id;
-        let lsn = *lsn;
-
-        match timeline_inputs.get(&timeline_id) {
-            Some(inputs) if inputs.ancestor_lsn == lsn => {
-                // we don't need an update at this branch point which is also point where
-                // timeline_id branch was branched from.
-                continue;
-            }
-            Some(_) => {}
-            None => {
-                // we should have this because we have iterated through all of the timelines
-                anyhow::bail!("missing timeline_input for {timeline_id}")
-            }
-        }
-
-        if let Some(size) = logical_size_cache.get(&(timeline_id, lsn)) {
-            updates.push(Update {
-                lsn,
-                timeline_id,
-                command: Command::Update(*size),
-            });
-
-            needed_cache.insert((timeline_id, lsn));
-        } else {
-            let timeline = tenant
-                .get_timeline(timeline_id, false)
-                .context("find referenced ancestor timeline")?;
-            let parallel_size_calcs = Arc::clone(limit);
-            joinset.spawn(calculate_logical_size(
-                parallel_size_calcs,
-                timeline.clone(),
-                lsn,
-                ctx.attached_child(),
-            ));
-
-            if let Some(parent_id) = timeline.get_ancestor_timeline_id() {
-                // we should not find new ones because we iterated tenants all timelines
-                anyhow::ensure!(
-                    timeline_inputs.contains_key(&parent_id),
-                    "discovered new timeline {parent_id} (parent of {timeline_id})"
-                );
-            }
-        };
-    }
-
-    // finally add in EndOfBranch for all timelines where their last_record_lsn is not a branch
-    // point. this is needed by the model.
-    for (timeline_id, inputs) in timeline_inputs.iter() {
-        let lsn = inputs.last_record;
-
-        if referenced_branch_froms.contains_key(&(*timeline_id, lsn)) {
-            // this means that the (timeline_id, last_record_lsn) represents a branch point
-            // we do not want to add EndOfBranch updates for these points because it doesn't fit
-            // into the current tenant_size_model.
-            continue;
-        }
-
-        if lsn > inputs.ancestor_lsn {
-            // all timelines also have an end point if they have made any progress
-            updates.push(Update {
-                lsn,
-                command: Command::EndOfBranch,
-                timeline_id: *timeline_id,
-            });
+            e.insert(cached_size);
         }
     }
 
+    // Perform the size lookups
     let mut have_any_error = false;
-
     while let Some(res) = joinset.join_next().await {
         // each of these come with Result<anyhow::Result<_>, JoinError>
         // because of spawn + spawn_blocking
@@ -460,19 +402,13 @@ pub(super) async fn gather_inputs(
                 debug!(timeline_id=%timeline.timeline_id, %lsn, size, "size calculated");
 
                 logical_size_cache.insert((timeline.timeline_id, lsn), size);
-                needed_cache.insert((timeline.timeline_id, lsn));
-
-                updates.push(Update {
-                    lsn,
-                    timeline_id: timeline.timeline_id,
-                    command: Command::Update(size),
-                });
+                sizes_needed.insert((timeline.timeline_id, lsn), Some(size));
             }
         }
     }
 
     // prune any keys not needed anymore; we record every used key and added key.
-    logical_size_cache.retain(|key, _| needed_cache.contains(key));
+    logical_size_cache.retain(|key, _| sizes_needed.contains_key(key));
 
     if have_any_error {
         // we cannot complete this round, because we are missing data.
@@ -480,105 +416,47 @@ pub(super) async fn gather_inputs(
         anyhow::bail!("failed to calculate some logical_sizes");
     }
 
-    // the data gathered to updates is per lsn, regardless of the branch, so we can use it to
-    // our advantage, not requiring a sorted container or graph walk.
-    //
-    // for branch points, which come as multiple updates at the same LSN, the Command::Update
-    // is needed before a branch is made out of that branch Command::BranchFrom. this is
-    // handled by the variant order in `Command`.
-    //
-    updates.sort_unstable();
-
-    // And another sort to handle Command::BranchFrom ordering
-    // in case when there are multiple branches at the same LSN.
-    let sorted_updates = sort_updates_in_tree_order(updates)?;
-
-    let retention_period = match max_cutoff_distance {
-        Some(max) => max.0,
-        None => {
-            anyhow::bail!("the first branch should have a gc_cutoff after it's branch point at 0")
+    // Insert the looked up sizes to the Segments
+    for seg in segments.iter_mut() {
+        if !seg.size_needed() {
+            continue;
         }
-    };
 
-    Ok(ModelInputs {
-        updates: sorted_updates,
-        retention_period,
-        timeline_inputs,
-    })
+        let timeline_id = seg.timeline_id;
+        let lsn = Lsn(seg.segment.lsn);
+
+        if let Some(Some(size)) = sizes_needed.get(&(timeline_id, lsn)) {
+            seg.segment.size = Some(*size);
+        } else {
+            bail!("could not find size at {} in timeline {}", lsn, timeline_id);
+        }
+    }
+    Ok(())
 }
 
 impl ModelInputs {
+    pub fn calculate_model(&self) -> anyhow::Result<tenant_size_model::StorageModel> {
+        // Convert SegmentMetas into plain Segments
+        let storage = StorageModel {
+            segments: self
+                .segments
+                .iter()
+                .map(|seg| seg.segment.clone())
+                .collect(),
+        };
+
+        Ok(storage)
+    }
+
+    // calculate total project size
     pub fn calculate(&self) -> anyhow::Result<u64> {
-        // Option<TimelineId> is used for "naming" the branches because it is assumed to be
-        // impossible to always determine the a one main branch.
-        let mut storage = tenant_size_model::Storage::<Option<TimelineId>>::new(None);
+        let storage = self.calculate_model()?;
+        let sizes = storage.calculate();
 
-        for update in &self.updates {
-            let Update {
-                lsn,
-                command: op,
-                timeline_id,
-            } = update;
-
-            let Lsn(now) = *lsn;
-            match op {
-                Command::Update(sz) => {
-                    storage.insert_point(&Some(*timeline_id), "".into(), now, Some(*sz))?;
-                }
-                Command::EndOfBranch => {
-                    storage.insert_point(&Some(*timeline_id), "".into(), now, None)?;
-                }
-                Command::BranchFrom(parent) => {
-                    // This branch command may fail if it cannot find a parent to branch from.
-                    storage.branch(parent, Some(*timeline_id))?;
-                }
-            }
-        }
-
-        Ok(storage.calculate(self.retention_period)?.total_children())
+        Ok(sizes.total_size)
     }
 }
 
-/// A point of interest in the tree of branches
-#[serde_with::serde_as]
-#[derive(
-    Debug, PartialEq, PartialOrd, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize,
-)]
-struct Update {
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    lsn: utils::lsn::Lsn,
-    command: Command,
-    #[serde_as(as = "serde_with::DisplayFromStr")]
-    timeline_id: TimelineId,
-}
-
-#[serde_with::serde_as]
-#[derive(PartialOrd, PartialEq, Eq, Ord, Clone, Copy, serde::Serialize, serde::Deserialize)]
-#[serde(rename_all = "snake_case")]
-enum Command {
-    Update(u64),
-    BranchFrom(#[serde_as(as = "Option<serde_with::DisplayFromStr>")] Option<TimelineId>),
-    EndOfBranch,
-}
-
-impl std::fmt::Debug for Command {
-    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-        // custom one-line implementation makes it more enjoyable to read {:#?} avoiding 3
-        // linebreaks
-        match self {
-            Self::Update(arg0) => write!(f, "Update({arg0})"),
-            Self::BranchFrom(arg0) => write!(f, "BranchFrom({arg0:?})"),
-            Self::EndOfBranch => write!(f, "EndOfBranch"),
-        }
-    }
-}
-
-#[derive(Debug, Clone, Copy)]
-enum LsnKind {
-    BranchPoint,
-    GcCutOff,
-}
-
 /// Newtype around the tuple that carries the timeline at lsn logical size calculation.
 struct TimelineAtLsnSizeResult(
     Arc<crate::tenant::Timeline>,
@@ -604,227 +482,230 @@ async fn calculate_logical_size(
     Ok(TimelineAtLsnSizeResult(timeline, lsn, size_res))
 }
 
-#[test]
-fn updates_sort() {
-    use std::str::FromStr;
-    use utils::id::TimelineId;
-    use utils::lsn::Lsn;
-
-    let ids = [
-        TimelineId::from_str("7ff1edab8182025f15ae33482edb590a").unwrap(),
-        TimelineId::from_str("b1719e044db05401a05a2ed588a3ad3f").unwrap(),
-        TimelineId::from_str("b68d6691c895ad0a70809470020929ef").unwrap(),
-    ];
-
-    // try through all permutations
-    let ids = [
-        [&ids[0], &ids[1], &ids[2]],
-        [&ids[0], &ids[2], &ids[1]],
-        [&ids[1], &ids[0], &ids[2]],
-        [&ids[1], &ids[2], &ids[0]],
-        [&ids[2], &ids[0], &ids[1]],
-        [&ids[2], &ids[1], &ids[0]],
-    ];
-
-    for ids in ids {
-        // apply a fixture which uses a permutation of ids
-        let commands = [
-            Update {
-                lsn: Lsn(0),
-                command: Command::BranchFrom(None),
-                timeline_id: *ids[0],
-            },
-            Update {
-                lsn: Lsn::from_str("0/67E7618").unwrap(),
-                command: Command::Update(43696128),
-                timeline_id: *ids[0],
-            },
-            Update {
-                lsn: Lsn::from_str("0/67E7618").unwrap(),
-                command: Command::BranchFrom(Some(*ids[0])),
-                timeline_id: *ids[1],
-            },
-            Update {
-                lsn: Lsn::from_str("0/76BE4F0").unwrap(),
-                command: Command::Update(41844736),
-                timeline_id: *ids[1],
-            },
-            Update {
-                lsn: Lsn::from_str("0/10E49380").unwrap(),
-                command: Command::Update(42164224),
-                timeline_id: *ids[0],
-            },
-            Update {
-                lsn: Lsn::from_str("0/10E49380").unwrap(),
-                command: Command::BranchFrom(Some(*ids[0])),
-                timeline_id: *ids[2],
-            },
-            Update {
-                lsn: Lsn::from_str("0/11D74910").unwrap(),
-                command: Command::Update(42172416),
-                timeline_id: *ids[2],
-            },
-            Update {
-                lsn: Lsn::from_str("0/12051E98").unwrap(),
-                command: Command::Update(42196992),
-                timeline_id: *ids[0],
-            },
-        ];
-
-        let mut sorted = commands;
-
-        // these must sort in the same order, regardless of how the ids sort
-        // which is why the timeline_id is the last field
-        sorted.sort_unstable();
-
-        assert_eq!(commands, sorted, "{:#?} vs. {:#?}", commands, sorted);
-    }
-}
-
 #[test]
 fn verify_size_for_multiple_branches() {
     // this is generated from integration test test_tenant_size_with_multiple_branches, but this way
     // it has the stable lsn's
     //
-    // timelineinputs have been left out, because those explain the inputs, but don't participate
-    // in further size calculations.
-    let doc = r#"{"updates":[{"lsn":"0/0","command":{"branch_from":null},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"update":25763840},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/176FA40","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/1819818","command":{"update":26075136},"timeline_id":"10b532a550540bc15385eac4edde416a"},{"lsn":"0/18B5E40","command":{"update":26427392},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"update":26492928},"timeline_id":"cd9d9409c216e64bf580904facedb01b"},{"lsn":"0/18D3DF0","command":{"branch_from":"cd9d9409c216e64bf580904facedb01b"},"timeline_id":"230fc9d756f7363574c0d66533564dcc"},{"lsn":"0/220F438","command":{"update":25239552},"timeline_id":"230fc9d756f7363574c0d66533564dcc"}],"retention_period":131072}"#;
-
+    // The timeline_inputs don't participate in the size calculation, and are here just to explain
+    // the inputs.
+    let doc = r#"
+{
+  "segments": [
+    {
+      "segment": {
+        "parent": 9,
+        "lsn": 26033560,
+        "size": null,
+        "needed": false
+      },
+      "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
+      "kind": "BranchStart"
+    },
+    {
+      "segment": {
+        "parent": 0,
+        "lsn": 35720400,
+        "size": 25206784,
+        "needed": false
+      },
+      "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
+      "kind": "GcCutOff"
+    },
+    {
+      "segment": {
+        "parent": 1,
+        "lsn": 35851472,
+        "size": null,
+        "needed": true
+      },
+      "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
+      "kind": "BranchEnd"
+    },
+    {
+      "segment": {
+        "parent": 7,
+        "lsn": 24566168,
+        "size": null,
+        "needed": false
+      },
+      "timeline_id": "454626700469f0a9914949b9d018e876",
+      "kind": "BranchStart"
+    },
+    {
+      "segment": {
+        "parent": 3,
+        "lsn": 25261936,
+        "size": 26050560,
+        "needed": false
+      },
+      "timeline_id": "454626700469f0a9914949b9d018e876",
+      "kind": "GcCutOff"
+    },
+    {
+      "segment": {
+        "parent": 4,
+        "lsn": 25393008,
+        "size": null,
+        "needed": true
+      },
+      "timeline_id": "454626700469f0a9914949b9d018e876",
+      "kind": "BranchEnd"
+    },
+    {
+      "segment": {
+        "parent": null,
+        "lsn": 23694408,
+        "size": null,
+        "needed": false
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "BranchStart"
+    },
+    {
+      "segment": {
+        "parent": 6,
+        "lsn": 24566168,
+        "size": 25739264,
+        "needed": false
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "BranchPoint"
+    },
+    {
+      "segment": {
+        "parent": 7,
+        "lsn": 25902488,
+        "size": 26402816,
+        "needed": false
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "GcCutOff"
+    },
+    {
+      "segment": {
+        "parent": 8,
+        "lsn": 26033560,
+        "size": 26468352,
+        "needed": true
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "BranchPoint"
+    },
+    {
+      "segment": {
+        "parent": 9,
+        "lsn": 26033560,
+        "size": null,
+        "needed": true
+      },
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "kind": "BranchEnd"
+    }
+  ],
+  "timeline_inputs": [
+    {
+      "timeline_id": "20b129c9b50cff7213e6503a31b2a5ce",
+      "ancestor_lsn": "0/18D3D98",
+      "last_record": "0/2230CD0",
+      "latest_gc_cutoff": "0/1698C48",
+      "horizon_cutoff": "0/2210CD0",
+      "pitr_cutoff": "0/2210CD0",
+      "next_gc_cutoff": "0/2210CD0",
+      "retention_param_cutoff": null
+    },
+    {
+      "timeline_id": "454626700469f0a9914949b9d018e876",
+      "ancestor_lsn": "0/176D998",
+      "last_record": "0/1837770",
+      "latest_gc_cutoff": "0/1698C48",
+      "horizon_cutoff": "0/1817770",
+      "pitr_cutoff": "0/1817770",
+      "next_gc_cutoff": "0/1817770",
+      "retention_param_cutoff": null
+    },
+    {
+      "timeline_id": "cb5e3cbe60a4afc00d01880e1a37047f",
+      "ancestor_lsn": "0/0",
+      "last_record": "0/18D3D98",
+      "latest_gc_cutoff": "0/1698C48",
+      "horizon_cutoff": "0/18B3D98",
+      "pitr_cutoff": "0/18B3D98",
+      "next_gc_cutoff": "0/18B3D98",
+      "retention_param_cutoff": null
+    }
+  ]
+}
+"#;
     let inputs: ModelInputs = serde_json::from_str(doc).unwrap();
 
-    assert_eq!(inputs.calculate().unwrap(), 36_409_872);
+    assert_eq!(inputs.calculate().unwrap(), 37_851_408);
 }
 
 #[test]
-fn updates_sort_with_branches_at_same_lsn() {
-    use std::str::FromStr;
-    use Command::{BranchFrom, EndOfBranch};
-
-    macro_rules! lsn {
-        ($e:expr) => {
-            Lsn::from_str($e).unwrap()
-        };
+fn verify_size_for_one_branch() {
+    let doc = r#"
+{
+  "segments": [
+    {
+      "segment": {
+        "parent": null,
+        "lsn": 0,
+        "size": null,
+        "needed": false
+      },
+      "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
+      "kind": "BranchStart"
+    },
+    {
+      "segment": {
+        "parent": 0,
+        "lsn": 305547335776,
+        "size": 220054675456,
+        "needed": false
+      },
+      "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
+      "kind": "GcCutOff"
+    },
+    {
+      "segment": {
+        "parent": 1,
+        "lsn": 305614444640,
+        "size": null,
+        "needed": true
+      },
+      "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
+      "kind": "BranchEnd"
     }
+  ],
+  "timeline_inputs": [
+    {
+      "timeline_id": "f15ae0cf21cce2ba27e4d80c6709a6cd",
+      "ancestor_lsn": "0/0",
+      "last_record": "47/280A5860",
+      "latest_gc_cutoff": "47/240A5860",
+      "horizon_cutoff": "47/240A5860",
+      "pitr_cutoff": "47/240A5860",
+      "next_gc_cutoff": "47/240A5860",
+      "retention_param_cutoff": "0/0"
+    }
+  ]
+}"#;
 
-    let ids = [
-        TimelineId::from_str("00000000000000000000000000000000").unwrap(),
-        TimelineId::from_str("11111111111111111111111111111111").unwrap(),
-        TimelineId::from_str("22222222222222222222222222222222").unwrap(),
-        TimelineId::from_str("33333333333333333333333333333333").unwrap(),
-        TimelineId::from_str("44444444444444444444444444444444").unwrap(),
-    ];
+    let model: ModelInputs = serde_json::from_str(doc).unwrap();
 
-    // issue https://github.com/neondatabase/neon/issues/3179
-    let commands = vec![
-        Update {
-            lsn: lsn!("0/0"),
-            command: BranchFrom(None),
-            timeline_id: ids[0],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: Command::Update(25387008),
-            timeline_id: ids[0],
-        },
-        // next three are wrongly sorted, because
-        // ids[1] is branched from before ids[1] exists
-        // and ids[2] is branched from before ids[2] exists
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[1])),
-            timeline_id: ids[3],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[0])),
-            timeline_id: ids[2],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[2])),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CA85B8"),
-            command: Command::Update(28925952),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CD85B8"),
-            command: Command::Update(29024256),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CD85B8"),
-            command: BranchFrom(Some(ids[1])),
-            timeline_id: ids[4],
-        },
-        Update {
-            lsn: lsn!("0/22DCE70"),
-            command: Command::Update(32546816),
-            timeline_id: ids[3],
-        },
-        Update {
-            lsn: lsn!("0/230CE70"),
-            command: EndOfBranch,
-            timeline_id: ids[3],
-        },
-    ];
+    let res = model.calculate_model().unwrap().calculate();
 
-    let expected = vec![
-        Update {
-            lsn: lsn!("0/0"),
-            command: BranchFrom(None),
-            timeline_id: ids[0],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: Command::Update(25387008),
-            timeline_id: ids[0],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[0])),
-            timeline_id: ids[2],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[2])),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/169AD58"),
-            command: BranchFrom(Some(ids[1])),
-            timeline_id: ids[3],
-        },
-        Update {
-            lsn: lsn!("0/1CA85B8"),
-            command: Command::Update(28925952),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CD85B8"),
-            command: Command::Update(29024256),
-            timeline_id: ids[1],
-        },
-        Update {
-            lsn: lsn!("0/1CD85B8"),
-            command: BranchFrom(Some(ids[1])),
-            timeline_id: ids[4],
-        },
-        Update {
-            lsn: lsn!("0/22DCE70"),
-            command: Command::Update(32546816),
-            timeline_id: ids[3],
-        },
-        Update {
-            lsn: lsn!("0/230CE70"),
-            command: EndOfBranch,
-            timeline_id: ids[3],
-        },
-    ];
+    println!("calculated synthetic size: {}", res.total_size);
+    println!("result: {:?}", serde_json::to_string(&res.segments));
 
-    let sorted_commands = sort_updates_in_tree_order(commands).unwrap();
-
-    assert_eq!(sorted_commands, expected);
+    use utils::lsn::Lsn;
+    let latest_gc_cutoff_lsn: Lsn = "47/240A5860".parse().unwrap();
+    let last_lsn: Lsn = "47/280A5860".parse().unwrap();
+    println!(
+        "latest_gc_cutoff lsn 47/240A5860 is {}, last_lsn lsn 47/280A5860 is {}",
+        u64::from(latest_gc_cutoff_lsn),
+        u64::from(last_lsn)
+    );
+    assert_eq!(res.total_size, 220121784320);
 }
diff --git a/pageserver/src/walredo.rs b/pageserver/src/walredo.rs
index c943bf0a27..21c6ede27e 100644
--- a/pageserver/src/walredo.rs
+++ b/pageserver/src/walredo.rs
@@ -27,8 +27,7 @@ use std::fs::OpenOptions;
 use std::io::prelude::*;
 use std::io::{Error, ErrorKind};
 use std::ops::{Deref, DerefMut};
-use std::os::fd::RawFd;
-use std::os::unix::io::AsRawFd;
+use std::os::unix::io::{AsRawFd, RawFd};
 use std::os::unix::prelude::CommandExt;
 use std::path::PathBuf;
 use std::process::Stdio;
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index b35252243e..0620ad8a35 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1217,7 +1217,7 @@ class PageserverHttpClient(requests.Session):
         """
         Returns the tenant size, together with the model inputs as the second tuple item.
         """
-        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/size")
+        res = self.get(f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size")
         self.verbose_error(res)
         res = res.json()
         assert isinstance(res, dict)
@@ -1228,6 +1228,16 @@ class PageserverHttpClient(requests.Session):
         assert type(inputs) is dict
         return (size, inputs)
 
+    def tenant_size_debug(self, tenant_id: TenantId) -> str:
+        """
+        Returns the tenant size debug info, as an HTML string
+        """
+        res = self.get(
+            f"http://localhost:{self.port}/v1/tenant/{tenant_id}/synthetic_size",
+            headers={"Accept": "text/html"},
+        )
+        return res.text
+
     def timeline_list(
         self,
         tenant_id: TenantId,
diff --git a/test_runner/regress/test_tenant_size.py b/test_runner/regress/test_tenant_size.py
index bb3bca8782..8c2996f491 100644
--- a/test_runner/regress/test_tenant_size.py
+++ b/test_runner/regress/test_tenant_size.py
@@ -1,13 +1,13 @@
-from typing import Any, List, Tuple
+from pathlib import Path
+from typing import List, Tuple
 
 import pytest
 from fixtures.log_helper import log
-from fixtures.metrics import parse_metrics
 from fixtures.neon_fixtures import NeonEnv, NeonEnvBuilder, wait_for_last_flush_lsn
 from fixtures.types import Lsn
 
 
-def test_empty_tenant_size(neon_simple_env: NeonEnv):
+def test_empty_tenant_size(neon_simple_env: NeonEnv, test_output_dir: Path):
     env = neon_simple_env
     (tenant_id, _) = env.neon_cli.create_tenant()
     http_client = env.pageserver.http_client()
@@ -18,6 +18,9 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):
 
     main_branch_name = "main"
 
+    branch_name, main_timeline_id = env.neon_cli.list_timelines(tenant_id)[0]
+    assert branch_name == main_branch_name
+
     with env.postgres.create_start(
         main_branch_name,
         tenant_id=tenant_id,
@@ -39,12 +42,44 @@ def test_empty_tenant_size(neon_simple_env: NeonEnv):
     size, inputs = http_client.tenant_size_and_modelinputs(tenant_id)
     assert size == initial_size, "tenant_size should not be affected by shutdown of compute"
 
-    expected_commands: List[Any] = [{"branch_from": None}, "end_of_branch"]
-    actual_commands: List[Any] = list(map(lambda x: x["command"], inputs["updates"]))  # type: ignore
-    assert actual_commands == expected_commands
+    expected_inputs = {
+        "segments": [
+            {
+                "segment": {"parent": None, "lsn": 23694408, "size": 25362432, "needed": True},
+                "timeline_id": f"{main_timeline_id}",
+                "kind": "BranchStart",
+            },
+            {
+                "segment": {"parent": 0, "lsn": 23694528, "size": None, "needed": True},
+                "timeline_id": f"{main_timeline_id}",
+                "kind": "BranchEnd",
+            },
+        ],
+        "timeline_inputs": [
+            {
+                "timeline_id": f"{main_timeline_id}",
+                "ancestor_id": None,
+                "ancestor_lsn": "0/0",
+                "last_record": "0/1698CC0",
+                "latest_gc_cutoff": "0/1698C48",
+                "horizon_cutoff": "0/0",
+                "pitr_cutoff": "0/0",
+                "next_gc_cutoff": "0/0",
+                "retention_param_cutoff": None,
+            }
+        ],
+    }
+    expected_inputs = mask_model_inputs(expected_inputs)
+    actual_inputs = mask_model_inputs(inputs)
+
+    assert expected_inputs == actual_inputs
+
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
 
 
-def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):
+def test_branched_empty_timeline_size(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     Issue found in production. Because the ancestor branch was under
     gc_horizon, the branchpoint was "dangling" and the computation could not be
@@ -75,8 +110,12 @@ def test_branched_empty_timeline_size(neon_simple_env: NeonEnv):
 
     assert size_after_branching > initial_size
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
 
-def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
+
+def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     More general version of test_branched_empty_timeline_size
 
@@ -128,9 +167,13 @@ def test_branched_from_many_empty_parents_size(neon_simple_env: NeonEnv):
     size_after_writes = http_client.tenant_size(tenant_id)
     assert size_after_writes > initial_size
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+
 
 @pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
-def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
+def test_branch_point_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     gc_horizon = 15
 
@@ -167,9 +210,13 @@ def test_branch_point_within_horizon(neon_simple_env: NeonEnv):
 
     assert size_before_branching < size_after
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+
 
 @pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
-def test_parent_within_horizon(neon_simple_env: NeonEnv):
+def test_parent_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     gc_horizon = 5
 
@@ -179,7 +226,7 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv):
     """
 
     env = neon_simple_env
-    gc_horizon = 200_000
+    gc_horizon = 5_000
     (tenant_id, main_id) = env.neon_cli.create_tenant(conf={"gc_horizon": str(gc_horizon)})
     http_client = env.pageserver.http_client()
 
@@ -212,9 +259,13 @@ def test_parent_within_horizon(neon_simple_env: NeonEnv):
 
     assert size_before_branching < size_after
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+
 
 @pytest.mark.skip("This should work, but is left out because assumed covered by other tests")
-def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
+def test_only_heads_within_horizon(neon_simple_env: NeonEnv, test_output_dir: Path):
     """
     gc_horizon = small
 
@@ -253,8 +304,14 @@ def test_only_heads_within_horizon(neon_simple_env: NeonEnv):
 
             latest_size = size_now
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
 
-def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
+
+def test_single_branch_get_tenant_size_grows(
+    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
+):
     """
     Operate on single branch reading the tenants size after each transaction.
     """
@@ -279,7 +336,20 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
 
     collected_responses: List[Tuple[Lsn, int]] = []
 
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+
+    def check_size_change(current_lsn: Lsn, initdb_lsn: Lsn, gc_horizon: int, size: int, prev: int):
+        if current_lsn - initdb_lsn > gc_horizon:
+            assert (
+                size >= prev
+            ), "tenant_size may grow or not grow, because we only add gc_horizon amount of WAL to initial snapshot size"
+        else:
+            assert (
+                size > prev
+            ), "tenant_size should grow, because we continue to add WAL to initial snapshot size"
+
     with env.postgres.create_start(branch_name, tenant_id=tenant_id) as pg:
+        initdb_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
         with pg.cursor() as cur:
             cur.execute("CREATE TABLE t0 (i BIGINT NOT NULL)")
 
@@ -297,13 +367,19 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
 
             current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
 
-            size = http_client.tenant_size(tenant_id)
+            size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
+
+            size_debug = http_client.tenant_size_debug(tenant_id)
+            size_debug_file.write(size_debug)
 
             if len(collected_responses) > 0:
                 prev = collected_responses[-1][1]
                 if size == 0:
                     assert prev == 0
                 else:
+                    # branch start shouldn't be past gc_horizon yet
+                    # thus the size should grow as we insert more data
+                    assert current_lsn - initdb_lsn <= gc_horizon
                     assert size > prev
 
             collected_responses.append((current_lsn, size))
@@ -323,9 +399,15 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
 
             current_lsn = wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
 
-            size = http_client.tenant_size(tenant_id)
+            size, sizes = http_client.tenant_size_and_modelinputs(tenant_id)
+
+            size_debug = http_client.tenant_size_debug(tenant_id)
+            size_debug_file.write(size_debug)
+
             prev = collected_responses[-1][1]
-            assert size > prev, "tenant_size should grow with updates"
+
+            check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
+
             collected_responses.append((current_lsn, size))
 
         while True:
@@ -340,9 +422,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
 
             size = http_client.tenant_size(tenant_id)
             prev = collected_responses[-1][1]
-            assert (
-                size > prev
-            ), "even though rows have been deleted, the tenant_size should increase"
+
+            check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
+
             collected_responses.append((current_lsn, size))
 
         with pg.cursor() as cur:
@@ -352,7 +434,9 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
 
         size = http_client.tenant_size(tenant_id)
         prev = collected_responses[-1][1]
-        assert size > prev, "dropping table grows tenant_size"
+
+        check_size_change(current_lsn, initdb_lsn, gc_horizon, size, prev)
+
         collected_responses.append((current_lsn, size))
 
     # this isn't too many lines to forget for a while. observed while
@@ -364,24 +448,17 @@ def test_single_branch_get_tenant_size_grows(neon_env_builder: NeonEnvBuilder):
     env.pageserver.stop()
     env.pageserver.start()
 
+    size_debug_file.close()
+
     size_after = http_client.tenant_size(tenant_id)
     prev = collected_responses[-1][1]
 
     assert size_after == prev, "size after restarting pageserver should not have changed"
 
-    ps_metrics = parse_metrics(http_client.get_metrics(), "pageserver")
-    tenant_metric_filter = {
-        "tenant_id": str(tenant_id),
-    }
 
-    tenant_size_metric = int(
-        ps_metrics.query_one("pageserver_tenant_synthetic_size", filter=tenant_metric_filter).value
-    )
-
-    assert tenant_size_metric == size_after, "API size value should be equal to metric size value"
-
-
-def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder):
+def test_get_tenant_size_with_multiple_branches(
+    neon_env_builder: NeonEnvBuilder, test_output_dir: Path
+):
     """
     Reported size goes up while branches or rows are being added, goes down after removing branches.
     """
@@ -481,6 +558,10 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
     size_after = http_client.tenant_size(tenant_id)
     assert size_after == size_after_thinning_branch
 
+    size_debug_file_before = open(test_output_dir / "size_debug_before.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file_before.write(size_debug)
+
     # teardown, delete branches, and the size should be going down
     http_client.timeline_delete(tenant_id, first_branch_timeline_id)
 
@@ -493,3 +574,38 @@ def test_get_tenant_size_with_multiple_branches(neon_env_builder: NeonEnvBuilder
 
     assert size_after_deleting_second < size_after_continuing_on_main
     assert size_after_deleting_second > size_after_first_branch
+
+    size_debug_file = open(test_output_dir / "size_debug.html", "w")
+    size_debug = http_client.tenant_size_debug(tenant_id)
+    size_debug_file.write(size_debug)
+
+
+# Helper for tests that compare timeline_inputs
+# We don't want to compare the exact values, because they can be unstable
+# and cause flaky tests. So replace the values with useful invariants.
+def mask_model_inputs(x):
+    if isinstance(x, dict):
+        newx = {}
+        for k, v in x.items():
+            if k == "size":
+                if v is None or v == 0:
+                    # no change
+                    newx[k] = v
+                elif v < 0:
+                    newx[k] = "<0"
+                else:
+                    newx[k] = ">0"
+            elif k.endswith("lsn") or k.endswith("cutoff") or k == "last_record":
+                if v is None or v == 0 or v == "0/0":
+                    # no change
+                    newx[k] = v
+                else:
+                    newx[k] = "masked"
+            else:
+                newx[k] = mask_model_inputs(v)
+        return newx
+    elif isinstance(x, list):
+        newlist = [mask_model_inputs(v) for v in x]
+        return newlist
+    else:
+        return x

From 7991bd3b6921ccdd13f0f38085127bbe282d4f26 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 16 Feb 2023 10:56:42 +0200
Subject: [PATCH 07/42] Fix periodic metric sending: don't reset timer on every
 iteration (#3617)

Previously timer was reset on every collect_metrics_iteration and sending of cached metrics was never triggered.

This is a follow-up for a69da4a7.
---
 pageserver/src/consumption_metrics.rs | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index b078782a86..a6d1ec3632 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -75,7 +75,7 @@ pub async fn collect_metrics(
     // define client here to reuse it for all requests
     let client = reqwest::Client::new();
     let mut cached_metrics: HashMap<PageserverConsumptionMetricsKey, u64> = HashMap::new();
-    let mut prev_iteration_time: Option<std::time::Instant> = None;
+    let mut prev_iteration_time: std::time::Instant = std::time::Instant::now();
 
     loop {
         tokio::select! {
@@ -86,11 +86,11 @@ pub async fn collect_metrics(
             _ = ticker.tick() => {
 
                 // send cached metrics every cached_metric_collection_interval
-                let send_cached = prev_iteration_time
-                .map(|x| x.elapsed() >= cached_metric_collection_interval)
-                .unwrap_or(false);
+                let send_cached = prev_iteration_time.elapsed() >= cached_metric_collection_interval;
 
-                prev_iteration_time = Some(std::time::Instant::now());
+                if send_cached {
+                    prev_iteration_time = std::time::Instant::now();
+                }
 
                 collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, node_id, &ctx, send_cached).await;
             }

From 5082d84f5ba099025e6a1ddd51051e6565b16d17 Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Tue, 14 Feb 2023 15:46:43 +0100
Subject: [PATCH 08/42] Compile pgjwt extension

---
 Dockerfile.compute-node | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 5c58f4baaa..f4479c46cb 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -182,6 +182,20 @@ RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
 
+#########################################################################################
+#
+# Layer "pgjwt-pg-build"
+# compile pgjwt extension
+#
+#########################################################################################
+FROM build-deps AS pgjwt-pg-build
+COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
+
+RUN git clone https://github.com/michelp/pgjwt.git && \
+    cd pgjwt && \
+    make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
+
 #########################################################################################
 #
 # Layer "neon-pg-ext-build"
@@ -196,6 +210,7 @@ COPY --from=h3-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=h3-pg-build /h3/usr /
 COPY --from=unit-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY --from=vector-pg-build /usr/local/pgsql/ /usr/local/pgsql/
+COPY --from=pgjwt-pg-build /usr/local/pgsql/ /usr/local/pgsql/
 COPY pgxn/ pgxn/
 
 RUN make -j $(getconf _NPROCESSORS_ONLN) \

From f0b41e7750f9f16e524e35a5084ba17465db2112 Mon Sep 17 00:00:00 2001
From: Kirill Bulatov <kirill@neon.tech>
Date: Thu, 16 Feb 2023 14:25:35 +0200
Subject: [PATCH 09/42] Propose less verbose way to build neon (#3624)

Closes https://github.com/neondatabase/neon/issues/3518 and might help
https://github.com/neondatabase/neon/issues/3611 and the future build
attempts.

Propose `-s` flag in the Readme when building via `make` command, to
help people to spot build errors easier.
---
 README.md | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 29389e7a5d..f8bc1b7736 100644
--- a/README.md
+++ b/README.md
@@ -83,9 +83,10 @@ cd neon
 
 # The preferred and default is to make a debug build. This will create a
 # demonstrably slower build than a release build. For a release build,
-# use "BUILD_TYPE=release make -j`nproc`"
+# use "BUILD_TYPE=release make -j`nproc` -s"
+# Remove -s for the verbose build log
 
-make -j`nproc`
+make -j`nproc` -s
 ```
 
 #### Building on OSX
@@ -99,9 +100,10 @@ cd neon
 
 # The preferred and default is to make a debug build. This will create a
 # demonstrably slower build than a release build. For a release build,
-# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu`"
+# use "BUILD_TYPE=release make -j`sysctl -n hw.logicalcpu` -s"
+# Remove -s for the verbose build log
 
-make -j`sysctl -n hw.logicalcpu`
+make -j`sysctl -n hw.logicalcpu` -s
 ```
 
 #### Dependency installation notes

From 0cf7fd0fb82b082d02dfadd9d6a488a7f799d72f Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Thu, 16 Feb 2023 15:36:13 +0200
Subject: [PATCH 10/42] Compaction with on-demand download (#3598)

Repeatedly (twice) try to download the compaction targeted layers before actual
compaction. Adds tests for both L0 compaction downloading layers and image
creation downloading layers. Image creation support existed already.

Fixes #3591

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 .../src/tenant/remote_timeline_client.rs      |   5 +-
 pageserver/src/tenant/timeline.rs             | 206 +++++++++++++++---
 test_runner/regress/test_ondemand_download.py | 168 ++++++++++++++
 3 files changed, 345 insertions(+), 34 deletions(-)

diff --git a/pageserver/src/tenant/remote_timeline_client.rs b/pageserver/src/tenant/remote_timeline_client.rs
index 985b480a76..7049a0bd66 100644
--- a/pageserver/src/tenant/remote_timeline_client.rs
+++ b/pageserver/src/tenant/remote_timeline_client.rs
@@ -571,14 +571,15 @@ impl RemoteTimelineClient {
         Ok(())
     }
 
-    ///
     /// Launch a delete operation in the background.
     ///
+    /// The operation does not modify local state but assumes the local files have already been
+    /// deleted, and is used to mirror those changes to remote.
+    ///
     /// Note: This schedules an index file upload before the deletions.  The
     /// deletion won't actually be performed, until any previously scheduled
     /// upload operations, and the index file upload, have completed
     /// succesfully.
-    ///
     pub fn schedule_layer_file_deletion(
         self: &Arc<Self>,
         names: &[LayerFileName],
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 7a5a9de2f4..e606cacf92 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -613,7 +613,10 @@ impl Timeline {
         self.flush_frozen_layers_and_wait().await
     }
 
+    /// Outermost timeline compaction operation; downloads needed layers.
     pub async fn compact(&self, ctx: &RequestContext) -> anyhow::Result<()> {
+        const ROUNDS: usize = 2;
+
         let last_record_lsn = self.get_last_record_lsn();
 
         // Last record Lsn could be zero in case the timeline was just created
@@ -622,6 +625,86 @@ impl Timeline {
             return Ok(());
         }
 
+        // retry two times to allow first round to find layers which need to be downloaded, then
+        // download them, then retry compaction
+        for round in 0..ROUNDS {
+            // should we error out with the most specific error?
+            let last_round = round == ROUNDS - 1;
+
+            let res = self.compact_inner(ctx).await;
+
+            // If `create_image_layers' or `compact_level0` scheduled any
+            // uploads or deletions, but didn't update the index file yet,
+            // do it now.
+            //
+            // This isn't necessary for correctness, the remote state is
+            // consistent without the uploads and deletions, and we would
+            // update the index file on next flush iteration too. But it
+            // could take a while until that happens.
+            //
+            // Additionally, only do this on the terminal round before sleeping.
+            if last_round {
+                if let Some(remote_client) = &self.remote_client {
+                    remote_client.schedule_index_upload_for_file_changes()?;
+                }
+            }
+
+            let rls = match res {
+                Ok(()) => return Ok(()),
+                Err(CompactionError::DownloadRequired(rls)) if !last_round => {
+                    // this can be done at most one time before exiting, waiting
+                    rls
+                }
+                Err(CompactionError::DownloadRequired(rls)) => {
+                    anyhow::bail!("Compaction requires downloading multiple times (last was {} layers), possibly battling against eviction", rls.len())
+                }
+                Err(CompactionError::Other(e)) => {
+                    return Err(e);
+                }
+            };
+
+            // this path can be visited in the second round of retrying, if first one found that we
+            // must first download some remote layers
+            let total = rls.len();
+
+            let mut downloads = rls
+                .into_iter()
+                .map(|rl| self.download_remote_layer(rl))
+                .collect::<futures::stream::FuturesUnordered<_>>();
+
+            let mut failed = 0;
+
+            let cancelled = task_mgr::shutdown_watcher();
+            tokio::pin!(cancelled);
+
+            loop {
+                tokio::select! {
+                    _ = &mut cancelled => anyhow::bail!("Cancelled while downloading remote layers"),
+                    res = downloads.next() => {
+                        match res {
+                            Some(Ok(())) => {},
+                            Some(Err(e)) => {
+                                warn!("Downloading remote layer for compaction failed: {e:#}");
+                                failed += 1;
+                            }
+                            None => break,
+                        }
+                    }
+                }
+            }
+
+            if failed != 0 {
+                anyhow::bail!("{failed} out of {total} layers failed to download, retrying later");
+            }
+
+            // if everything downloaded fine, lets try again
+        }
+
+        unreachable!("retry loop exits")
+    }
+
+    /// Compaction which might need to be retried after downloading remote layers.
+    async fn compact_inner(&self, ctx: &RequestContext) -> Result<(), CompactionError> {
         //
         // High level strategy for compaction / image creation:
         //
@@ -660,7 +743,7 @@ impl Timeline {
         // Is the timeline being deleted?
         let state = *self.state.borrow();
         if state == TimelineState::Stopping {
-            anyhow::bail!("timeline is Stopping");
+            return Err(anyhow::anyhow!("timeline is Stopping").into());
         }
 
         let target_file_size = self.get_checkpoint_distance();
@@ -680,7 +763,8 @@ impl Timeline {
                 // "enough".
                 let layer_paths_to_upload = self
                     .create_image_layers(&partitioning, lsn, false, ctx)
-                    .await?;
+                    .await
+                    .map_err(anyhow::Error::from)?;
                 if let Some(remote_client) = &self.remote_client {
                     for (path, layer_metadata) in layer_paths_to_upload {
                         remote_client.schedule_layer_file_upload(&path, &layer_metadata)?;
@@ -692,18 +776,6 @@ impl Timeline {
                 self.compact_level0(&layer_removal_cs, target_file_size, ctx)
                     .await?;
                 timer.stop_and_record();
-
-                // If `create_image_layers' or `compact_level0` scheduled any
-                // uploads or deletions, but didn't update the index file yet,
-                // do it now.
-                //
-                // This isn't necessary for correctness, the remote state is
-                // consistent without the uploads and deletions, and we would
-                // update the index file on next flush iteration too. But it
-                // could take a while until that happens.
-                if let Some(remote_client) = &self.remote_client {
-                    remote_client.schedule_index_upload_for_file_changes()?;
-                }
             }
             Err(err) => {
                 // no partitioning? This is normal, if the timeline was just created
@@ -2541,10 +2613,13 @@ impl Timeline {
     ) -> anyhow::Result<(KeyPartitioning, Lsn)> {
         {
             let partitioning_guard = self.partitioning.lock().unwrap();
-            if partitioning_guard.1 != Lsn(0)
-                && lsn.0 - partitioning_guard.1 .0 <= self.repartition_threshold
-            {
-                // no repartitioning needed
+            let distance = lsn.0 - partitioning_guard.1 .0;
+            if partitioning_guard.1 != Lsn(0) && distance <= self.repartition_threshold {
+                debug!(
+                    distance,
+                    threshold = self.repartition_threshold,
+                    "no repartitioning needed"
+                );
                 return Ok((partitioning_guard.0.clone(), partitioning_guard.1));
             }
         }
@@ -2562,8 +2637,12 @@ impl Timeline {
 
     // Is it time to create a new image layer for the given partition?
     fn time_for_new_image_layer(&self, partition: &KeySpace, lsn: Lsn) -> anyhow::Result<bool> {
+        let threshold = self.get_image_creation_threshold();
+
         let layers = self.layers.read().unwrap();
 
+        let mut max_deltas = 0;
+
         for part_range in &partition.ranges {
             let image_coverage = layers.image_coverage(part_range, lsn)?;
             for (img_range, last_img) in image_coverage {
@@ -2585,21 +2664,25 @@ impl Timeline {
                 // are some delta layers *later* than current 'lsn', if more WAL was processed and flushed
                 // after we read last_record_lsn, which is passed here in the 'lsn' argument.
                 if img_lsn < lsn {
-                    let threshold = self.get_image_creation_threshold();
                     let num_deltas =
                         layers.count_deltas(&img_range, &(img_lsn..lsn), Some(threshold))?;
 
-                    debug!(
-                        "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
-                        img_range.start, img_range.end, num_deltas, img_lsn, lsn
-                    );
+                    max_deltas = max_deltas.max(num_deltas);
                     if num_deltas >= threshold {
+                        debug!(
+                            "key range {}-{}, has {} deltas on this timeline in LSN range {}..{}",
+                            img_range.start, img_range.end, num_deltas, img_lsn, lsn
+                        );
                         return Ok(true);
                     }
                 }
             }
         }
 
+        debug!(
+            max_deltas,
+            "none of the partitioned ranges had >= {threshold} deltas"
+        );
         Ok(false)
     }
 
@@ -2712,25 +2795,55 @@ impl Timeline {
         Ok(layer_paths_to_upload)
     }
 }
+
 #[derive(Default)]
 struct CompactLevel0Phase1Result {
     new_layers: Vec<DeltaLayer>,
     deltas_to_compact: Vec<Arc<dyn PersistentLayer>>,
 }
 
+/// Top-level failure to compact.
+#[derive(Debug)]
+enum CompactionError {
+    /// L0 compaction requires layers to be downloaded.
+    ///
+    /// This should not happen repeatedly, but will be retried once by top-level
+    /// `Timeline::compact`.
+    DownloadRequired(Vec<Arc<RemoteLayer>>),
+    /// Compaction cannot be done right now; page reconstruction and so on.
+    Other(anyhow::Error),
+}
+
+impl From<anyhow::Error> for CompactionError {
+    fn from(value: anyhow::Error) -> Self {
+        CompactionError::Other(value)
+    }
+}
+
 impl Timeline {
+    /// Level0 files first phase of compaction, explained in the [`compact_inner`] comment.
+    ///
+    /// This method takes the `_layer_removal_cs` guard to highlight it required downloads are
+    /// returned as an error. If the `layer_removal_cs` boundary is changed not to be taken in the
+    /// start of level0 files compaction, the on-demand download should be revisited as well.
     async fn compact_level0_phase1(
         &self,
+        _layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
         target_file_size: u64,
         ctx: &RequestContext,
-    ) -> anyhow::Result<CompactLevel0Phase1Result> {
+    ) -> Result<CompactLevel0Phase1Result, CompactionError> {
         let layers = self.layers.read().unwrap();
         let mut level0_deltas = layers.get_level0_deltas()?;
         drop(layers);
 
         // Only compact if enough layers have accumulated.
-        if level0_deltas.is_empty() || level0_deltas.len() < self.get_compaction_threshold() {
-            return Ok(Default::default());
+        let threshold = self.get_compaction_threshold();
+        if level0_deltas.is_empty() || level0_deltas.len() < threshold {
+            debug!(
+                level0_deltas = level0_deltas.len(),
+                threshold, "too few deltas to compact"
+            );
+            return Ok(CompactLevel0Phase1Result::default());
         }
 
         // Gather the files to compact in this iteration.
@@ -2766,6 +2879,24 @@ impl Timeline {
             end: deltas_to_compact.last().unwrap().get_lsn_range().end,
         };
 
+        let remotes = deltas_to_compact
+            .iter()
+            .filter(|l| l.is_remote_layer())
+            .inspect(|l| info!("compact requires download of {}", l.filename().file_name()))
+            .map(|l| {
+                l.clone()
+                    .downcast_remote_layer()
+                    .expect("just checked it is remote layer")
+            })
+            .collect::<Vec<_>>();
+
+        if !remotes.is_empty() {
+            // caller is holding the lock to layer_removal_cs, and we don't want to download while
+            // holding that; in future download_remote_layer might take it as well. this is
+            // regardless of earlier image creation downloading on-demand, while holding the lock.
+            return Err(CompactionError::DownloadRequired(remotes));
+        }
+
         info!(
             "Starting Level0 compaction in LSN range {}-{} for {} layers ({} deltas in total)",
             lsn_range.start,
@@ -2773,9 +2904,11 @@ impl Timeline {
             deltas_to_compact.len(),
             level0_deltas.len()
         );
+
         for l in deltas_to_compact.iter() {
             info!("compact includes {}", l.filename().file_name());
         }
+
         // We don't need the original list of layers anymore. Drop it so that
         // we don't accidentally use it later in the function.
         drop(level0_deltas);
@@ -2945,7 +3078,9 @@ impl Timeline {
             }
 
             fail_point!("delta-layer-writer-fail-before-finish", |_| {
-                anyhow::bail!("failpoint delta-layer-writer-fail-before-finish");
+                return Err(
+                    anyhow::anyhow!("failpoint delta-layer-writer-fail-before-finish").into(),
+                );
             });
 
             writer.as_mut().unwrap().put_value(key, lsn, value)?;
@@ -2964,7 +3099,7 @@ impl Timeline {
 
             // Fsync all the layer files and directory using multiple threads to
             // minimize latency.
-            par_fsync::par_fsync(&layer_paths)?;
+            par_fsync::par_fsync(&layer_paths).context("fsync all new layers")?;
 
             layer_paths.pop().unwrap();
         }
@@ -2986,11 +3121,13 @@ impl Timeline {
         layer_removal_cs: &tokio::sync::MutexGuard<'_, ()>,
         target_file_size: u64,
         ctx: &RequestContext,
-    ) -> anyhow::Result<()> {
+    ) -> Result<(), CompactionError> {
         let CompactLevel0Phase1Result {
             new_layers,
             deltas_to_compact,
-        } = self.compact_level0_phase1(target_file_size, ctx).await?;
+        } = self
+            .compact_level0_phase1(layer_removal_cs, target_file_size, ctx)
+            .await?;
 
         if new_layers.is_empty() && deltas_to_compact.is_empty() {
             // nothing to do
@@ -3014,7 +3151,12 @@ impl Timeline {
         for l in new_layers {
             let new_delta_path = l.path();
 
-            let metadata = new_delta_path.metadata()?;
+            let metadata = new_delta_path.metadata().with_context(|| {
+                format!(
+                    "read file metadata for new created layer {}",
+                    new_delta_path.display()
+                )
+            })?;
 
             if let Some(remote_client) = &self.remote_client {
                 remote_client.schedule_layer_file_upload(
@@ -3248,7 +3390,7 @@ impl Timeline {
 
         let mut layers_to_remove = Vec::new();
 
-        // Scan all on-disk layers in the timeline.
+        // Scan all layers in the timeline (remote or on-disk).
         //
         // Garbage collect the layer if all conditions are satisfied:
         // 1. it is older than cutoff LSN;
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 3551f27cad..f5f8491ada 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -1,7 +1,10 @@
 # It's possible to run any regular test with the local fs remote storage via
 # env ZENITH_PAGESERVER_OVERRIDES="remote_storage={local_path='/tmp/neon_zzz/'}" poetry ......
 
+import time
+from collections import defaultdict
 from pathlib import Path
+from typing import Any, DefaultDict, Dict
 
 import pytest
 from fixtures.log_helper import log
@@ -10,6 +13,7 @@ from fixtures.neon_fixtures import (
     RemoteStorageKind,
     assert_tenant_status,
     available_remote_storages,
+    wait_for_last_flush_lsn,
     wait_for_last_record_lsn,
     wait_for_sk_commit_lsn_to_reach_remote_storage,
     wait_for_upload,
@@ -449,3 +453,167 @@ def test_download_remote_layers_api(
     pg_old = env.postgres.create_start(branch_name="main")
     with pg_old.cursor() as cur:
         assert query_scalar(cur, "select count(*) from testtab") == table_len
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
+def test_compaction_downloads_on_demand_without_image_creation(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    Create a few layers, then evict, then make sure compaction runs successfully.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_compaction_downloads_on_demand_without_image_creation",
+    )
+
+    env = neon_env_builder.init_start()
+
+    conf = {
+        # Disable background GC & compaction
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # unused, because manual will be called after each table
+        "checkpoint_distance": 100 * 1024**2,
+        # this will be updated later on to allow manual compaction outside of checkpoints
+        "compaction_threshold": 100,
+        # repartitioning parameter, not required here
+        "image_creation_threshold": 100,
+        # repartitioning parameter, not required here
+        "compaction_target_size": 128 * 1024**2,
+        # pitr_interval and gc_horizon are not interesting because we dont run gc
+    }
+
+    # Override defaults, to create more layers
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
+    env.initial_tenant = tenant_id
+    pageserver_http = env.pageserver.http_client()
+
+    with env.postgres.create_start("main") as pg:
+        # no particular reason to create the layers like this, but we are sure
+        # not to hit the image_creation_threshold here.
+        with pg.cursor() as cur:
+            cur.execute("create table a as select id::bigint from generate_series(1, 204800) s(id)")
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        with pg.cursor() as cur:
+            cur.execute("update a set id = -id")
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint"
+    assert len(layers.historic_layers) == 1 + 2, "should have inidb layer and 2 deltas"
+
+    for layer in layers.historic_layers:
+        log.info(f"pre-compact:  {layer}")
+        pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
+
+    env.neon_cli.config_tenant(tenant_id, {"compaction_threshold": "3"})
+
+    pageserver_http.timeline_compact(tenant_id, timeline_id)
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    for layer in layers.historic_layers:
+        log.info(f"post compact: {layer}")
+    assert len(layers.historic_layers) == 1, "should have compacted to single layer"
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.MOCK_S3])
+def test_compaction_downloads_on_demand_with_image_creation(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    Create layers, compact with high image_creation_threshold, then run final compaction with all layers evicted.
+
+    Due to current implementation, this will make image creation on-demand download layers, but we cannot really
+    directly test for it.
+    """
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_compaction_downloads_on_demand",
+    )
+
+    env = neon_env_builder.init_start()
+
+    conf = {
+        # Disable background GC & compaction
+        "gc_period": "0s",
+        "compaction_period": "0s",
+        # repartitioning threshold is this / 10, but it doesn't really seem to matter
+        "checkpoint_distance": 50 * 1024**2,
+        "compaction_threshold": 3,
+        # important: keep this high for the data ingestion
+        "image_creation_threshold": 100,
+        # repartitioning parameter, unused
+        "compaction_target_size": 128 * 1024**2,
+        # pitr_interval and gc_horizon are not interesting because we dont run gc
+    }
+
+    # Override defaults, to create more layers
+    tenant_id, timeline_id = env.neon_cli.create_tenant(conf=stringify(conf))
+    env.initial_tenant = tenant_id
+    pageserver_http = env.pageserver.http_client()
+
+    with env.postgres.create_start("main") as pg:
+        # no particular reason to create the layers like this, but we are sure
+        # not to hit the image_creation_threshold here.
+        with pg.cursor() as cur:
+            cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
+            cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
+        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        for _ in range(0, 2):
+            for i in range(0, 3):
+                # create a minimal amount of "delta difficulty" for this table
+                with pg.cursor() as cur:
+                    cur.execute("update a set some_value = -some_value + %s", (i,))
+
+                with pg.cursor() as cur:
+                    # vacuuming should aid to reuse keys, though it's not really important
+                    # with image_creation_threshold=1 which we will use on the last compaction
+                    cur.execute("vacuum")
+
+                wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+                pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+            # images should not yet be created, because threshold is too high,
+            # but these will be reshuffled to L1 layers
+            pageserver_http.timeline_compact(tenant_id, timeline_id)
+
+    for _ in range(0, 20):
+        # loop in case flushing is still in progress
+        layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+        if not layers.in_memory_layers:
+            break
+        time.sleep(0.2)
+
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    assert not layers.in_memory_layers, "no inmemory layers expected after post-commit checkpoint"
+
+    kinds_before: DefaultDict[str, int] = defaultdict(int)
+
+    for layer in layers.historic_layers:
+        kinds_before[layer.kind] += 1
+        pageserver_http.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
+
+    assert dict(kinds_before) == {"Delta": 4}
+
+    # now having evicted all layers, reconfigure to have lower image creation
+    # threshold to expose image creation to downloading all of the needed
+    # layers -- threshold of 2 would sound more reasonable, but keeping it as 1
+    # to be less flaky
+    env.neon_cli.config_tenant(tenant_id, {"image_creation_threshold": "1"})
+
+    pageserver_http.timeline_compact(tenant_id, timeline_id)
+    layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
+    kinds_after: DefaultDict[str, int] = defaultdict(int)
+    for layer in layers.historic_layers:
+        kinds_after[layer.kind] += 1
+
+    assert dict(kinds_after) == {"Delta": 4, "Image": 1}
+
+
+def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
+    return dict(map(lambda x: (x[0], str(x[1])), conf.items()))

From d9ba3c5f5ef3af4f16e8ffe23096310238b2d8c7 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 16 Feb 2023 16:29:08 +0200
Subject: [PATCH 11/42] Revert "Add debug messages around
 timeline.get_current_logical_size"

This reverts commit a5ce2b5330233927169152253548f822cf6d1643.
---
 pageserver/src/consumption_metrics.rs | 23 ++++++++++-------------
 pageserver/src/tenant/timeline.rs     |  7 +------
 2 files changed, 11 insertions(+), 19 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index a6d1ec3632..3d2a01effb 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -166,20 +166,17 @@ pub async fn collect_metrics_iteration(
 
                 match timeline.get_current_logical_size(ctx) {
                     // Only send timeline logical size when it is fully calculated.
-                    Ok((size, is_exact)) => {
-                        if is_exact {
-                            current_metrics.push((
-                                PageserverConsumptionMetricsKey {
-                                    tenant_id,
-                                    timeline_id: Some(timeline.timeline_id),
-                                    metric: TIMELINE_LOGICAL_SIZE,
-                                },
-                                size,
-                            ));
-                        } else {
-                            info!("logical_size is not fully calculated for timeline {}, skipping sending value {} ", timeline.timeline_id, size);
-                        }
+                    Ok((size, is_exact)) if is_exact => {
+                        current_metrics.push((
+                            PageserverConsumptionMetricsKey {
+                                tenant_id,
+                                timeline_id: Some(timeline.timeline_id),
+                                metric: TIMELINE_LOGICAL_SIZE,
+                            },
+                            size,
+                        ));
                     }
+                    Ok((_, _)) => {}
                     Err(err) => {
                         error!(
                             "failed to get current logical size for timeline {}: {err:?}",
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index e606cacf92..66afe2cdce 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -812,15 +812,10 @@ impl Timeline {
 
         let mut is_exact = true;
         let size = current_size.size();
-        if let (CurrentLogicalSize::Approximate(approx_size), Some(init_lsn)) =
+        if let (CurrentLogicalSize::Approximate(_), Some(init_lsn)) =
             (current_size, self.current_logical_size.initial_part_end)
         {
             is_exact = false;
-            info!(
-                "Current size for timeline {} is approximate {}, initial_part_end lsn: {:?}",
-                self.timeline_id, approx_size, init_lsn
-            );
-
             self.try_spawn_size_init_task(init_lsn, ctx);
         }
 

From 6139e8e426c2fdf7d54d517d2a93bb101ada40c6 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 16 Feb 2023 16:29:19 +0200
Subject: [PATCH 12/42] Revert "Add debug messages around sending cached
 metrics"

This reverts commit a839860c2ea9a0bbce39a43e9849daef698025ee.
---
 pageserver/src/consumption_metrics.rs | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index 3d2a01effb..d1383b33cb 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -239,18 +239,6 @@ pub async fn collect_metrics_iteration(
             Some(val) => val != curr_val,
             None => true,
         });
-
-        info!(
-            "sending only changed metrics, {} values at {}",
-            current_metrics.len(),
-            Utc::now()
-        );
-    } else {
-        info!(
-            "sending all metrics, including cached ones. {} values at {}",
-            current_metrics.len(),
-            Utc::now()
-        );
     }
 
     if current_metrics.is_empty() {

From 0d3aefb2749145044b99922fbf33730a0754cd50 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Thu, 16 Feb 2023 16:24:47 +0200
Subject: [PATCH 13/42] Only use active timelines in synthetic_size calculation

---
 pageserver/src/tenant/size.rs | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/size.rs b/pageserver/src/tenant/size.rs
index 2c5efe283b..a41889f16d 100644
--- a/pageserver/src/tenant/size.rs
+++ b/pageserver/src/tenant/size.rs
@@ -134,7 +134,7 @@ pub(super) async fn gather_inputs(
         .context("Failed to refresh gc_info before gathering inputs")?;
 
     // Collect information about all the timelines
-    let timelines = tenant.list_timelines();
+    let mut timelines = tenant.list_timelines();
 
     if timelines.is_empty() {
         // perhaps the tenant has just been created, and as such doesn't have any data yet
@@ -144,6 +144,13 @@ pub(super) async fn gather_inputs(
         });
     }
 
+    // Filter out timelines that are not active
+    //
+    // There may be a race when a timeline is dropped,
+    // but it is unlikely to cause any issues. In the worst case,
+    // the calculation will error out.
+    timelines.retain(|t| t.is_active());
+
     // Build a map of branch points.
     let mut branchpoints: HashMap<TimelineId, HashSet<Lsn>> = HashMap::new();
     for timeline in timelines.iter() {

From d9c518b2cc44da2cda378199e1ce27fafe0e4838 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Tue, 7 Feb 2023 19:36:30 +0200
Subject: [PATCH 14/42] Refactor use_cleartext_password_flow.

It's not a property of the credentials that we receive from the
client, so remove it from ClientCredentials. Instead, pass it as an
argument directly to 'authenticate' function, where it's actually
used. All the rest of the changes is just plumbing to pass it through
the call stack to 'authenticate'
---
 proxy/src/auth/backend.rs     | 14 ++++++++++++--
 proxy/src/auth/credentials.rs | 22 +++++++---------------
 proxy/src/proxy.rs            | 18 ++++++++++++------
 3 files changed, 31 insertions(+), 23 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 50afbd2a27..5cd02df87c 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -114,6 +114,7 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
         &'a mut self,
         extra: &'a ConsoleReqExtra<'a>,
         client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+        use_cleartext_password_flow: bool,
     ) -> auth::Result<Option<AuthSuccess<CachedNodeInfo>>> {
         use BackendType::*;
 
@@ -158,7 +159,7 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
                 (node, payload.password)
             }
             // This is a hack to allow cleartext password in secure connections (wss).
-            Console(api, creds) if creds.use_cleartext_password_flow => {
+            Console(api, creds) if use_cleartext_password_flow => {
                 let payload = fetch_plaintext_password(client).await?;
                 let node = api.wake_compute(extra, creds).await?;
 
@@ -182,16 +183,25 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
     }
 
     /// Authenticate the client via the requested backend, possibly using credentials.
+    ///
+    /// If `use_cleartext_password_flow` is true, we use the old cleartext password
+    /// flow. It is used for websocket connections, which want to minimize the number
+    /// of round trips. (Plaintext password authentication requires only one round-trip,
+    /// where SCRAM requires two.)
     pub async fn authenticate<'a>(
         &mut self,
         extra: &'a ConsoleReqExtra<'a>,
         client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+        use_cleartext_password_flow: bool,
     ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
         use BackendType::*;
 
         // Handle cases when `project` is missing in `creds`.
         // TODO: type safety: return `creds` with irrefutable `project`.
-        if let Some(res) = self.try_password_hack(extra, client).await? {
+        if let Some(res) = self
+            .try_password_hack(extra, client, use_cleartext_password_flow)
+            .await?
+        {
             info!("user successfully authenticated (using the password hack)");
             return Ok(res);
         }
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 968104f058..2d2f193bec 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -34,9 +34,6 @@ pub struct ClientCredentials<'a> {
     pub user: &'a str,
     // TODO: this is a severe misnomer! We should think of a new name ASAP.
     pub project: Option<Cow<'a, str>>,
-    /// If `True`, we'll use the old cleartext password flow. This is used for
-    /// websocket connections, which want to minimize the number of round trips.
-    pub use_cleartext_password_flow: bool,
 }
 
 impl ClientCredentials<'_> {
@@ -51,7 +48,6 @@ impl<'a> ClientCredentials<'a> {
         params: &'a StartupMessageParams,
         sni: Option<&str>,
         common_name: Option<&str>,
-        use_cleartext_password_flow: bool,
     ) -> Result<Self, ClientCredsParseError> {
         use ClientCredsParseError::*;
 
@@ -99,14 +95,12 @@ impl<'a> ClientCredentials<'a> {
         info!(
             user = user,
             project = project.as_deref(),
-            use_cleartext_password_flow = use_cleartext_password_flow,
             "credentials"
         );
 
         Ok(Self {
             user,
             project,
-            use_cleartext_password_flow,
         })
     }
 }
@@ -131,7 +125,7 @@ mod tests {
         // According to postgresql, only `user` should be required.
         let options = StartupMessageParams::new([("user", "john_doe")]);
 
-        let creds = ClientCredentials::parse(&options, None, None, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project, None);
 
@@ -146,7 +140,7 @@ mod tests {
             ("foo", "bar"),        // should be ignored
         ]);
 
-        let creds = ClientCredentials::parse(&options, None, None, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project, None);
 
@@ -160,7 +154,7 @@ mod tests {
         let sni = Some("foo.localhost");
         let common_name = Some("localhost");
 
-        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("foo"));
 
@@ -174,7 +168,7 @@ mod tests {
             ("options", "-ckey=1 project=bar -c geqo=off"),
         ]);
 
-        let creds = ClientCredentials::parse(&options, None, None, false)?;
+        let creds = ClientCredentials::parse(&options, None, None)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("bar"));
 
@@ -188,7 +182,7 @@ mod tests {
         let sni = Some("baz.localhost");
         let common_name = Some("localhost");
 
-        let creds = ClientCredentials::parse(&options, sni, common_name, false)?;
+        let creds = ClientCredentials::parse(&options, sni, common_name)?;
         assert_eq!(creds.user, "john_doe");
         assert_eq!(creds.project.as_deref(), Some("baz"));
 
@@ -203,8 +197,7 @@ mod tests {
         let sni = Some("second.localhost");
         let common_name = Some("localhost");
 
-        let err =
-            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
+        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
         match err {
             InconsistentProjectNames { domain, option } => {
                 assert_eq!(option, "first");
@@ -221,8 +214,7 @@ mod tests {
         let sni = Some("project.localhost");
         let common_name = Some("example.com");
 
-        let err =
-            ClientCredentials::parse(&options, sni, common_name, false).expect_err("should fail");
+        let err = ClientCredentials::parse(&options, sni, common_name).expect_err("should fail");
         match err {
             InconsistentSni { sni, cn } => {
                 assert_eq!(sni, "project.localhost");
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index a622a35e6d..c1ed79ecb6 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -127,7 +127,7 @@ pub async fn handle_ws_client(
         let result = config
             .auth_backend
             .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name, true))
+            .map(|_| auth::ClientCredentials::parse(&params, hostname, common_name))
             .transpose();
 
         async { result }.or_else(|e| stream.throw_error(e)).await?
@@ -135,7 +135,7 @@ pub async fn handle_ws_client(
 
     let client = Client::new(stream, creds, &params, session_id);
     cancel_map
-        .with_session(|session| client.connect_to_db(session))
+        .with_session(|session| client.connect_to_db(session, true))
         .await
 }
 
@@ -165,7 +165,7 @@ async fn handle_client(
         let result = config
             .auth_backend
             .as_ref()
-            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name, false))
+            .map(|_| auth::ClientCredentials::parse(&params, sni, common_name))
             .transpose();
 
         async { result }.or_else(|e| stream.throw_error(e)).await?
@@ -173,7 +173,7 @@ async fn handle_client(
 
     let client = Client::new(stream, creds, &params, session_id);
     cancel_map
-        .with_session(|session| client.connect_to_db(session))
+        .with_session(|session| client.connect_to_db(session, false))
         .await
 }
 
@@ -401,7 +401,11 @@ impl<'a, S> Client<'a, S> {
 
 impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
     /// Let the client authenticate and connect to the designated compute node.
-    async fn connect_to_db(self, session: cancellation::Session<'_>) -> anyhow::Result<()> {
+    async fn connect_to_db(
+        self,
+        session: cancellation::Session<'_>,
+        use_cleartext_password_flow: bool,
+    ) -> anyhow::Result<()> {
         let Self {
             mut stream,
             mut creds,
@@ -416,7 +420,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
 
         let auth_result = async {
             // `&mut stream` doesn't let us merge those 2 lines.
-            let res = creds.authenticate(&extra, &mut stream).await;
+            let res = creds
+                .authenticate(&extra, &mut stream, use_cleartext_password_flow)
+                .await;
             async { res }.or_else(|e| stream.throw_error(e)).await
         }
         .instrument(info_span!("auth"))

From edffe0dd9d811aa4a11e267ceeb82837a56e864b Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ivadmi5@gmail.com>
Date: Wed, 15 Feb 2023 15:17:18 +0300
Subject: [PATCH 15/42] Extract password hack & cleartext hack

---
 proxy/src/auth/backend.rs         | 190 +++++++++++++++---------------
 proxy/src/auth/backend/classic.rs |   2 +-
 proxy/src/auth/backend/link.rs    |   2 +-
 proxy/src/auth/credentials.rs     |  19 ++-
 4 files changed, 103 insertions(+), 110 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 5cd02df87c..25e288ad2e 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -1,7 +1,6 @@
 mod classic;
-
 mod link;
-use futures::TryFutureExt;
+
 pub use link::LinkAuthError;
 
 use crate::{
@@ -13,6 +12,7 @@ use crate::{
     },
     stream, url,
 };
+use futures::TryFutureExt;
 use std::borrow::Cow;
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::{info, warn};
@@ -105,107 +105,99 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
     }
 }
 
-// TODO: get rid of explicit lifetimes in this block (there's a bug in rustc).
-// Read more: https://github.com/rust-lang/rust/issues/99190
-// Alleged fix: https://github.com/rust-lang/rust/pull/89056
-impl<'l> BackendType<'l, ClientCredentials<'_>> {
-    /// Do something special if user didn't provide the `project` parameter.
-    async fn try_password_hack<'a>(
-        &'a mut self,
-        extra: &'a ConsoleReqExtra<'a>,
-        client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-        use_cleartext_password_flow: bool,
-    ) -> auth::Result<Option<AuthSuccess<CachedNodeInfo>>> {
-        use BackendType::*;
+/// Compared to [SCRAM](crate::scram), cleartext password auth saves
+/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
+/// These properties are benefical for serverless JS workers, so we
+/// use this mechanism for websocket connections.
+async fn do_cleartext_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    warn!("cleartext auth flow override is enabled, proceeding");
+    let password = AuthFlow::new(client)
+        .begin(auth::CleartextPassword)
+        .await?
+        .authenticate()
+        .await?;
 
-        // If there's no project so far, that entails that client doesn't
-        // support SNI or other means of passing the project name.
-        // We now expect to see a very specific payload in the place of password.
-        let fetch_magic_payload = |client| async {
-            warn!("project name not specified, resorting to the password hack auth flow");
-            let payload = AuthFlow::new(client)
-                .begin(auth::PasswordHack)
-                .await?
-                .authenticate()
-                .await?;
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(password);
 
-            info!(project = &payload.project, "received missing parameter");
-            auth::Result::Ok(payload)
-        };
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: node,
+    })
+}
 
-        // If we want to use cleartext password flow, we can read the password
-        // from the client and pretend that it's a magic payload (PasswordHack hack).
-        let fetch_plaintext_password = |client| async {
-            info!("using cleartext password flow");
-            let payload = AuthFlow::new(client)
-                .begin(auth::CleartextPassword)
-                .await?
-                .authenticate()
-                .await?;
+/// Workaround for clients which don't provide an endpoint (project) name.
+/// Very similar to [`do_cleartext`], but there's a specific password format.
+async fn do_password_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    warn!("project not specified, resorting to the password hack auth flow");
+    let payload = AuthFlow::new(client)
+        .begin(auth::PasswordHack)
+        .await?
+        .authenticate()
+        .await?;
 
-            auth::Result::Ok(auth::password_hack::PasswordHackPayload {
-                project: String::new(),
-                password: payload,
-            })
-        };
+    info!(project = &payload.project, "received missing parameter");
+    creds.project = Some(payload.project.into());
 
-        // TODO: find a proper way to merge those very similar blocks.
-        let (mut node, password) = match self {
-            Console(api, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload(client).await?;
-                creds.project = Some(payload.project.into());
-                let node = api.wake_compute(extra, creds).await?;
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(payload.password);
 
-                (node, payload.password)
-            }
-            // This is a hack to allow cleartext password in secure connections (wss).
-            Console(api, creds) if use_cleartext_password_flow => {
-                let payload = fetch_plaintext_password(client).await?;
-                let node = api.wake_compute(extra, creds).await?;
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: node,
+    })
+}
 
-                (node, payload.password)
-            }
-            Postgres(api, creds) if creds.project.is_none() => {
-                let payload = fetch_magic_payload(client).await?;
-                creds.project = Some(payload.project.into());
-                let node = api.wake_compute(extra, creds).await?;
-
-                (node, payload.password)
-            }
-            _ => return Ok(None),
-        };
-
-        node.config.password(password);
-        Ok(Some(AuthSuccess {
-            reported_auth_ok: false,
-            value: node,
-        }))
+/// True to its name, this function encapsulates our current auth trade-offs.
+/// Here, we choose the appropriate auth flow based on circumstances.
+async fn auth_quirks(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+    allow_cleartext: bool,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    // If there's no project so far, that entails that client doesn't
+    // support SNI or other means of passing the endpoint (project) name.
+    // We now expect to see a very specific payload in the place of password.
+    if creds.project.is_none() {
+        return do_password_hack(api, extra, creds, client).await;
     }
 
+    // Password hack should set the project name.
+    // TODO: make `creds.project` more type-safe.
+    assert!(creds.project.is_some());
+
+    // Perform cleartext auth if we're allowed to do that.
+    // Currently, we use it for websocket connections (latency).
+    if allow_cleartext {
+        return do_cleartext_hack(api, extra, creds, client).await;
+    }
+
+    // Finally, proceed with the main auth flow (SCRAM-based).
+    classic::authenticate(api, extra, creds, client).await
+}
+
+impl BackendType<'_, ClientCredentials<'_>> {
     /// Authenticate the client via the requested backend, possibly using credentials.
-    ///
-    /// If `use_cleartext_password_flow` is true, we use the old cleartext password
-    /// flow. It is used for websocket connections, which want to minimize the number
-    /// of round trips. (Plaintext password authentication requires only one round-trip,
-    /// where SCRAM requires two.)
-    pub async fn authenticate<'a>(
+    pub async fn authenticate(
         &mut self,
-        extra: &'a ConsoleReqExtra<'a>,
-        client: &'a mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-        use_cleartext_password_flow: bool,
+        extra: &ConsoleReqExtra<'_>,
+        client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+        allow_cleartext: bool,
     ) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
         use BackendType::*;
 
-        // Handle cases when `project` is missing in `creds`.
-        // TODO: type safety: return `creds` with irrefutable `project`.
-        if let Some(res) = self
-            .try_password_hack(extra, client, use_cleartext_password_flow)
-            .await?
-        {
-            info!("user successfully authenticated (using the password hack)");
-            return Ok(res);
-        }
-
         let res = match self {
             Console(api, creds) => {
                 info!(
@@ -214,20 +206,24 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
                     "performing authentication using the console"
                 );
 
-                assert!(creds.project.is_some());
-                classic::handle_user(api.as_ref(), extra, creds, client).await?
+                let api = api.as_ref();
+                auth_quirks(api, extra, creds, client, allow_cleartext).await?
             }
             Postgres(api, creds) => {
-                info!("performing mock authentication using a local postgres instance");
+                info!(
+                    user = creds.user,
+                    project = creds.project(),
+                    "performing authentication using a local postgres instance"
+                );
 
-                assert!(creds.project.is_some());
-                classic::handle_user(api.as_ref(), extra, creds, client).await?
+                let api = api.as_ref();
+                auth_quirks(api, extra, creds, client, allow_cleartext).await?
             }
             // NOTE: this auth backend doesn't use client credentials.
             Link(url) => {
                 info!("performing link authentication");
 
-                link::handle_user(url, client)
+                link::authenticate(url, client)
                     .await?
                     .map(CachedNodeInfo::new_uncached)
             }
@@ -239,9 +235,9 @@ impl<'l> BackendType<'l, ClientCredentials<'_>> {
 
     /// When applicable, wake the compute node, gaining its connection info in the process.
     /// The link auth flow doesn't support this, so we return [`None`] in that case.
-    pub async fn wake_compute<'a>(
+    pub async fn wake_compute(
         &self,
-        extra: &'a ConsoleReqExtra<'a>,
+        extra: &ConsoleReqExtra<'_>,
     ) -> Result<Option<CachedNodeInfo>, console::errors::WakeComputeError> {
         use BackendType::*;
 
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index eefef6e9b4..6753e7ed7f 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -9,7 +9,7 @@ use crate::{
 use tokio::io::{AsyncRead, AsyncWrite};
 use tracing::info;
 
-pub(super) async fn handle_user(
+pub(super) async fn authenticate(
     api: &impl console::Api,
     extra: &ConsoleReqExtra<'_>,
     creds: &ClientCredentials<'_>,
diff --git a/proxy/src/auth/backend/link.rs b/proxy/src/auth/backend/link.rs
index 5d0049c957..7175a23dc1 100644
--- a/proxy/src/auth/backend/link.rs
+++ b/proxy/src/auth/backend/link.rs
@@ -53,7 +53,7 @@ pub fn new_psql_session_id() -> String {
     hex::encode(rand::random::<[u8; 8]>())
 }
 
-pub(super) async fn handle_user(
+pub(super) async fn authenticate(
     link_uri: &reqwest::Url,
     client: &mut PqStream<impl AsyncRead + AsyncWrite + Unpin>,
 ) -> auth::Result<AuthSuccess<NodeInfo>> {
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 2d2f193bec..c556c33197 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -11,12 +11,16 @@ pub enum ClientCredsParseError {
     #[error("Parameter '{0}' is missing in startup packet.")]
     MissingKey(&'static str),
 
-    #[error("Inconsistent project name inferred from SNI ('{}') and project option ('{}').", .domain, .option)]
+    #[error(
+        "Inconsistent project name inferred from \
+         SNI ('{}') and project option ('{}').",
+        .domain, .option,
+    )]
     InconsistentProjectNames { domain: String, option: String },
 
     #[error(
         "SNI ('{}') inconsistently formatted with respect to common name ('{}'). \
-        SNI should be formatted as '<project-name>.{}'.",
+         SNI should be formatted as '<project-name>.{}'.",
         .sni, .cn, .cn,
     )]
     InconsistentSni { sni: String, cn: String },
@@ -92,16 +96,9 @@ impl<'a> ClientCredentials<'a> {
         }
         .transpose()?;
 
-        info!(
-            user = user,
-            project = project.as_deref(),
-            "credentials"
-        );
+        info!(user, project = project.as_deref(), "credentials");
 
-        Ok(Self {
-            user,
-            project,
-        })
+        Ok(Self { user, project })
     }
 }
 

From a4d5c8085bba69131790b8ac299a9e78893b2427 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ivadmi5@gmail.com>
Date: Wed, 15 Feb 2023 15:39:21 +0300
Subject: [PATCH 16/42] Move hacks to a dedicated module.

---
 proxy/src/auth/backend.rs       | 64 ++++----------------------------
 proxy/src/auth/backend/hacks.rs | 66 +++++++++++++++++++++++++++++++++
 proxy/src/proxy.rs              |  4 +-
 3 files changed, 75 insertions(+), 59 deletions(-)
 create mode 100644 proxy/src/auth/backend/hacks.rs

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 25e288ad2e..42b2304bb8 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -1,10 +1,11 @@
 mod classic;
+mod hacks;
 mod link;
 
 pub use link::LinkAuthError;
 
 use crate::{
-    auth::{self, AuthFlow, ClientCredentials},
+    auth::{self, ClientCredentials},
     console::{
         self,
         provider::{CachedNodeInfo, ConsoleReqExtra},
@@ -15,7 +16,7 @@ use crate::{
 use futures::TryFutureExt;
 use std::borrow::Cow;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{info, warn};
+use tracing::info;
 
 /// A product of successful authentication.
 pub struct AuthSuccess<T> {
@@ -105,59 +106,6 @@ impl<'a, T, E> BackendType<'a, Result<T, E>> {
     }
 }
 
-/// Compared to [SCRAM](crate::scram), cleartext password auth saves
-/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
-/// These properties are benefical for serverless JS workers, so we
-/// use this mechanism for websocket connections.
-async fn do_cleartext_hack(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
-    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
-    warn!("cleartext auth flow override is enabled, proceeding");
-    let password = AuthFlow::new(client)
-        .begin(auth::CleartextPassword)
-        .await?
-        .authenticate()
-        .await?;
-
-    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.password(password);
-
-    Ok(AuthSuccess {
-        reported_auth_ok: false,
-        value: node,
-    })
-}
-
-/// Workaround for clients which don't provide an endpoint (project) name.
-/// Very similar to [`do_cleartext`], but there's a specific password format.
-async fn do_password_hack(
-    api: &impl console::Api,
-    extra: &ConsoleReqExtra<'_>,
-    creds: &mut ClientCredentials<'_>,
-    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
-) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
-    warn!("project not specified, resorting to the password hack auth flow");
-    let payload = AuthFlow::new(client)
-        .begin(auth::PasswordHack)
-        .await?
-        .authenticate()
-        .await?;
-
-    info!(project = &payload.project, "received missing parameter");
-    creds.project = Some(payload.project.into());
-
-    let mut node = api.wake_compute(extra, creds).await?;
-    node.config.password(payload.password);
-
-    Ok(AuthSuccess {
-        reported_auth_ok: false,
-        value: node,
-    })
-}
-
 /// True to its name, this function encapsulates our current auth trade-offs.
 /// Here, we choose the appropriate auth flow based on circumstances.
 async fn auth_quirks(
@@ -171,7 +119,8 @@ async fn auth_quirks(
     // support SNI or other means of passing the endpoint (project) name.
     // We now expect to see a very specific payload in the place of password.
     if creds.project.is_none() {
-        return do_password_hack(api, extra, creds, client).await;
+        // Password will be checked by the compute node later.
+        return hacks::password_hack(api, extra, creds, client).await;
     }
 
     // Password hack should set the project name.
@@ -181,7 +130,8 @@ async fn auth_quirks(
     // Perform cleartext auth if we're allowed to do that.
     // Currently, we use it for websocket connections (latency).
     if allow_cleartext {
-        return do_cleartext_hack(api, extra, creds, client).await;
+        // Password will be checked by the compute node later.
+        return hacks::cleartext_hack(api, extra, creds, client).await;
     }
 
     // Finally, proceed with the main auth flow (SCRAM-based).
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
new file mode 100644
index 0000000000..f710581cb2
--- /dev/null
+++ b/proxy/src/auth/backend/hacks.rs
@@ -0,0 +1,66 @@
+use super::AuthSuccess;
+use crate::{
+    auth::{self, AuthFlow, ClientCredentials},
+    console::{
+        self,
+        provider::{CachedNodeInfo, ConsoleReqExtra},
+    },
+    stream,
+};
+use tokio::io::{AsyncRead, AsyncWrite};
+use tracing::{info, warn};
+
+/// Compared to [SCRAM](crate::scram), cleartext password auth saves
+/// one round trip and *expensive* computations (>= 4096 HMAC iterations).
+/// These properties are benefical for serverless JS workers, so we
+/// use this mechanism for websocket connections.
+pub async fn cleartext_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    warn!("cleartext auth flow override is enabled, proceeding");
+    let password = AuthFlow::new(client)
+        .begin(auth::CleartextPassword)
+        .await?
+        .authenticate()
+        .await?;
+
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(password);
+
+    // Report tentative success; compute node will check the password anyway.
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: node,
+    })
+}
+
+/// Workaround for clients which don't provide an endpoint (project) name.
+/// Very similar to [`cleartext_hack`], but there's a specific password format.
+pub async fn password_hack(
+    api: &impl console::Api,
+    extra: &ConsoleReqExtra<'_>,
+    creds: &mut ClientCredentials<'_>,
+    client: &mut stream::PqStream<impl AsyncRead + AsyncWrite + Unpin>,
+) -> auth::Result<AuthSuccess<CachedNodeInfo>> {
+    warn!("project not specified, resorting to the password hack auth flow");
+    let payload = AuthFlow::new(client)
+        .begin(auth::PasswordHack)
+        .await?
+        .authenticate()
+        .await?;
+
+    info!(project = &payload.project, "received missing parameter");
+    creds.project = Some(payload.project.into());
+
+    let mut node = api.wake_compute(extra, creds).await?;
+    node.config.password(payload.password);
+
+    // Report tentative success; compute node will check the password anyway.
+    Ok(AuthSuccess {
+        reported_auth_ok: false,
+        value: node,
+    })
+}
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index c1ed79ecb6..0dc48f1212 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -404,7 +404,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
     async fn connect_to_db(
         self,
         session: cancellation::Session<'_>,
-        use_cleartext_password_flow: bool,
+        allow_cleartext: bool,
     ) -> anyhow::Result<()> {
         let Self {
             mut stream,
@@ -421,7 +421,7 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
         let auth_result = async {
             // `&mut stream` doesn't let us merge those 2 lines.
             let res = creds
-                .authenticate(&extra, &mut stream, use_cleartext_password_flow)
+                .authenticate(&extra, &mut stream, allow_cleartext)
                 .await;
             async { res }.or_else(|e| stream.throw_error(e)).await
         }

From a1b062123ba921d7ef48e3620eada6b9fc0c288b Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Thu, 16 Feb 2023 21:28:53 +0100
Subject: [PATCH 17/42] Do not deploy storage to old account (#3630)

It's gone
---
 .github/ansible/production.hosts.yaml         | 40 ----------
 .../production.neon-storage-broker.yaml       | 56 --------------
 .github/workflows/deploy-prod.yml             | 77 -------------------
 3 files changed, 173 deletions(-)
 delete mode 100644 .github/ansible/production.hosts.yaml
 delete mode 100644 .github/helm-values/production.neon-storage-broker.yaml

diff --git a/.github/ansible/production.hosts.yaml b/.github/ansible/production.hosts.yaml
deleted file mode 100644
index ecb847bd61..0000000000
--- a/.github/ansible/production.hosts.yaml
+++ /dev/null
@@ -1,40 +0,0 @@
----
-storage:
-  vars:
-    console_mgmt_base_url: http://console-release.local
-    bucket_name: zenith-storage-oregon
-    bucket_region: us-west-2
-    broker_endpoint: http://storage-broker.prod.local:50051
-    pageserver_config_stub:
-      pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
-      metric_collection_interval: 10min
-      remote_storage:
-        bucket_name: "{{ bucket_name }}"
-        bucket_region: "{{ bucket_region }}"
-        prefix_in_bucket: "{{ inventory_hostname }}"
-    safekeeper_s3_prefix: prod-1/wal
-    hostname_suffix: ".local"
-    remote_user: admin
-    sentry_environment: production
-
-  children:
-    pageservers:
-      hosts:
-        zenith-1-ps-2:
-          console_region_id: aws-us-west-2
-        zenith-1-ps-3:
-          console_region_id: aws-us-west-2
-        zenith-1-ps-4:
-          console_region_id: aws-us-west-2
-        zenith-1-ps-5:
-          console_region_id: aws-us-west-2
-
-    safekeepers:
-      hosts:
-        zenith-1-sk-1:
-          console_region_id: aws-us-west-2
-        zenith-1-sk-2:
-          console_region_id: aws-us-west-2
-        zenith-1-sk-4:
-          console_region_id: aws-us-west-2
diff --git a/.github/helm-values/production.neon-storage-broker.yaml b/.github/helm-values/production.neon-storage-broker.yaml
deleted file mode 100644
index aa64081da3..0000000000
--- a/.github/helm-values/production.neon-storage-broker.yaml
+++ /dev/null
@@ -1,56 +0,0 @@
-# Helm chart values for neon-storage-broker
-podLabels:
-  neon_env: production
-  neon_service: storage-broker
-
-# Use L4 LB
-service:
-  # service.annotations -- Annotations to add to the service
-  annotations:
-    service.beta.kubernetes.io/aws-load-balancer-type: external  # use newer AWS Load Balancer Controller
-    service.beta.kubernetes.io/aws-load-balancer-nlb-target-type: ip
-    service.beta.kubernetes.io/aws-load-balancer-scheme: internal  # deploy LB to private subnet
-    # assign service to this name at external-dns
-    external-dns.alpha.kubernetes.io/hostname: storage-broker.prod.local
-  # service.type -- Service type
-  type: LoadBalancer
-  # service.port -- broker listen port
-  port: 50051
-
-ingress:
-  enabled: false
-
-metrics:
-  enabled: true
-  serviceMonitor:
-    enabled: true
-    selector:
-      release: kube-prometheus-stack
-
-extraManifests:
-  - apiVersion: operator.victoriametrics.com/v1beta1
-    kind: VMServiceScrape
-    metadata:
-      name: "{{ include \"neon-storage-broker.fullname\" . }}"
-      labels:
-        helm.sh/chart: neon-storage-broker-{{ .Chart.Version }}
-        app.kubernetes.io/name: neon-storage-broker
-        app.kubernetes.io/instance: neon-storage-broker
-        app.kubernetes.io/version: "{{ .Chart.AppVersion }}"
-        app.kubernetes.io/managed-by: Helm
-      namespace: "{{ .Release.Namespace }}"
-    spec:
-      selector:
-        matchLabels:
-          app.kubernetes.io/name: "neon-storage-broker"
-      endpoints:
-        - port: broker
-          path: /metrics
-          interval: 10s
-          scrapeTimeout: 10s
-      namespaceSelector:
-        matchNames:
-          - "{{ .Release.Namespace }}"
-
-settings:
-  sentryEnvironment: "production"
diff --git a/.github/workflows/deploy-prod.yml b/.github/workflows/deploy-prod.yml
index f4ce7e9afa..540d187274 100644
--- a/.github/workflows/deploy-prod.yml
+++ b/.github/workflows/deploy-prod.yml
@@ -165,80 +165,3 @@ jobs:
       - name: Deploy storage-broker
         run:
           helm upgrade neon-storage-broker-lb neondatabase/neon-storage-broker --namespace neon-storage-broker-lb --create-namespace --install --atomic -f .github/helm-values/${{ matrix.target_cluster }}.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-
-  # Deploy to old account below          
-
-  deploy:
-    runs-on: prod
-    container:
-      image: 093970136003.dkr.ecr.eu-central-1.amazonaws.com/ansible:latest
-      options: --user root --privileged
-    if: inputs.deployStorage && inputs.disclamerAcknowledged
-    defaults:
-      run:
-        shell: bash
-    environment:
-      name: prod-old
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Redeploy
-        run: |
-          export DOCKER_TAG=${{ inputs.dockerTag }}
-          cd "$(pwd)/.github/ansible"
-
-          ./get_binaries.sh
-
-          eval $(ssh-agent)
-          echo "${{ secrets.TELEPORT_SSH_KEY }}"  | tr -d '\n'| base64 --decode >ssh-key
-          echo "${{ secrets.TELEPORT_SSH_CERT }}" | tr -d '\n'| base64 --decode >ssh-key-cert.pub
-          chmod 0600 ssh-key
-          ssh-add ssh-key
-          rm -f ssh-key ssh-key-cert.pub
-          ANSIBLE_CONFIG=./ansible.cfg ansible-galaxy collection install sivel.toiletwater
-          ANSIBLE_CONFIG=./ansible.cfg ansible-playbook deploy.yaml -i production.hosts.yaml -e CONSOLE_API_TOKEN=${{ secrets.NEON_PRODUCTION_API_KEY }} -e SENTRY_URL_PAGESERVER=${{ secrets.SENTRY_URL_PAGESERVER }} -e SENTRY_URL_SAFEKEEPER=${{ secrets.SENTRY_URL_SAFEKEEPER }}
-          rm -f neon_install.tar.gz .neon_current_version
-
-      # Cleanup script fails otherwise - rm: cannot remove '/nvme/actions-runner/_work/_temp/_github_home/.ansible/collections': Permission denied
-      - name: Cleanup ansible folder
-        run: rm -rf ~/.ansible
-
-  deploy-storage-broker:
-    name: deploy storage broker on old staging and old prod
-    runs-on: [ self-hosted, gen3, small ]
-    container: 369495373322.dkr.ecr.eu-central-1.amazonaws.com/ansible:pinned
-    if: inputs.deployStorageBroker && inputs.disclamerAcknowledged
-    defaults:
-      run:
-        shell: bash
-    environment:
-      name: prod-old
-    env:
-      KUBECONFIG: .kubeconfig
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v3
-        with:
-          submodules: true
-          fetch-depth: 0
-          ref: ${{ inputs.branch }}
-
-      - name: Store kubeconfig file
-        run: |
-          echo "${{ secrets.PRODUCTION_KUBECONFIG_DATA }}" | base64 --decode > ${KUBECONFIG}
-          chmod 0600 ${KUBECONFIG}
-
-      - name: Add neon helm chart
-        run: helm repo add neondatabase https://neondatabase.github.io/helm-charts
-
-      - name: Deploy storage-broker
-        run:
-          helm upgrade neon-storage-broker neondatabase/neon-storage-broker --namespace neon-storage-broker --create-namespace --install --atomic -f .github/helm-values/production.neon-storage-broker.yaml --set image.tag=${{ inputs.dockerTag }} --set settings.sentryUrl=${{ secrets.SENTRY_URL_BROKER }} --wait --timeout 5m0s
-
-      - name: Cleanup helm folder
-        run: rm -rf ~/.cache

From 526f8b76aa3055635206f1656364957dd506fae9 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 17 Feb 2023 08:29:52 +0000
Subject: [PATCH 18/42] Bump werkzeug from 2.1.2 to 2.2.3 (#3631)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Describe your changes

```
$ poetry add werkzeug@latest "moto[server]@latest"
Using version ^2.2.3 for werkzeug
Using version ^4.1.2 for moto

Updating dependencies
Resolving dependencies... (1.6s)

Writing lock file

Package operations: 0 installs, 2 updates, 1 removal

  • Removing pytz (2022.1)
  • Updating werkzeug (2.1.2 -> 2.2.3)
  • Updating moto (3.1.18 -> 4.1.2)
```

Resolves:
- https://github.com/neondatabase/neon/security/dependabot/14
- https://github.com/neondatabase/neon/security/dependabot/13

`@dependabot` failed to create a PR for some reason (I guess because it
also needed to handle `moto` dependency)

## Issue ticket number and link
N/A

## Checklist before requesting a review
- [x] I have performed a self-review of my code.
- [x] If it is a core feature, I have added thorough tests.
- [x] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [x] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 poetry.lock    | 78 +++++++++++++++++++++-----------------------------
 pyproject.toml |  4 +--
 2 files changed, 35 insertions(+), 47 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index 7e80b1e10a..bc2c56d74c 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1238,7 +1238,6 @@ category = "main"
 optional = false
 python-versions = "*"
 files = [
-    {file = "junit-xml-1.9.tar.gz", hash = "sha256:de16a051990d4e25a3982b2dd9e89d671067548718866416faec14d9de56db9f"},
     {file = "junit_xml-1.9-py2.py3-none-any.whl", hash = "sha256:ec5ca1a55aefdd76d28fcc0b135251d156c7106fa979686a4b48d62b761b4732"},
 ]
 
@@ -1309,66 +1308,63 @@ files = [
 
 [[package]]
 name = "moto"
-version = "3.1.18"
-description = "A library that allows your python tests to easily mock out the boto library"
+version = "4.1.2"
+description = ""
 category = "main"
 optional = false
-python-versions = ">=3.6"
+python-versions = ">=3.7"
 files = [
-    {file = "moto-3.1.18-py3-none-any.whl", hash = "sha256:b6eb096e7880c46ac44d6d90988c0043e31462115cfdc913a0ee8f470bd9555c"},
-    {file = "moto-3.1.18.tar.gz", hash = "sha256:1e05276a62aa5a4aa821b441647c2cbaa2ea175388980b10d5de88d41b327cf7"},
+    {file = "moto-4.1.2-py2.py3-none-any.whl", hash = "sha256:1b361ece638c74a657325378a259276f368aafce2f8be84f8143e69fa93ce8ec"},
+    {file = "moto-4.1.2.tar.gz", hash = "sha256:63431733d2a02c7bd652ad71ec1da442a0e0d580cbac5eeb50d440a2ce066eac"},
 ]
 
 [package.dependencies]
 aws-xray-sdk = {version = ">=0.93,<0.96 || >0.96", optional = true, markers = "extra == \"server\""}
 boto3 = ">=1.9.201"
 botocore = ">=1.12.201"
-cfn-lint = {version = ">=0.4.0", optional = true, markers = "extra == \"server\""}
+cfn-lint = {version = ">=0.40.0", optional = true, markers = "extra == \"server\""}
 cryptography = ">=3.3.1"
 docker = {version = ">=2.5.1", optional = true, markers = "extra == \"server\""}
 ecdsa = {version = "!=0.15", optional = true, markers = "extra == \"server\""}
-flask = {version = "<2.2.0", optional = true, markers = "extra == \"server\""}
+flask = {version = "<2.2.0 || >2.2.0,<2.2.1 || >2.2.1", optional = true, markers = "extra == \"server\""}
 flask-cors = {version = "*", optional = true, markers = "extra == \"server\""}
 graphql-core = {version = "*", optional = true, markers = "extra == \"server\""}
-idna = {version = ">=2.5,<4", optional = true, markers = "extra == \"server\""}
 Jinja2 = ">=2.10.1"
 jsondiff = {version = ">=1.1.2", optional = true, markers = "extra == \"server\""}
-MarkupSafe = "!=2.0.0a1"
 openapi-spec-validator = {version = ">=0.2.8", optional = true, markers = "extra == \"server\""}
 pyparsing = {version = ">=3.0.7", optional = true, markers = "extra == \"server\""}
 python-dateutil = ">=2.1,<3.0.0"
 python-jose = {version = ">=3.1.0,<4.0.0", extras = ["cryptography"], optional = true, markers = "extra == \"server\""}
-pytz = "*"
 PyYAML = {version = ">=5.1", optional = true, markers = "extra == \"server\""}
 requests = ">=2.5"
-responses = ">=0.9.0"
+responses = ">=0.13.0"
 setuptools = {version = "*", optional = true, markers = "extra == \"server\""}
 sshpubkeys = {version = ">=3.1.0", optional = true, markers = "extra == \"server\""}
-werkzeug = ">=0.5,<2.2.0"
+werkzeug = ">=0.5,<2.2.0 || >2.2.0,<2.2.1 || >2.2.1"
 xmltodict = "*"
 
 [package.extras]
-all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+all = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
 apigateway = ["PyYAML (>=5.1)", "ecdsa (!=0.15)", "openapi-spec-validator (>=0.2.8)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
 apigatewayv2 = ["PyYAML (>=5.1)"]
 appsync = ["graphql-core"]
 awslambda = ["docker (>=2.5.1)"]
 batch = ["docker (>=2.5.1)"]
-cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+cloudformation = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
 cognitoidp = ["ecdsa (!=0.15)", "python-jose[cryptography] (>=3.1.0,<4.0.0)"]
 ds = ["sshpubkeys (>=3.1.0)"]
 dynamodb = ["docker (>=2.5.1)"]
-dynamodb2 = ["docker (>=2.5.1)"]
 dynamodbstreams = ["docker (>=2.5.1)"]
 ebs = ["sshpubkeys (>=3.1.0)"]
 ec2 = ["sshpubkeys (>=3.1.0)"]
 efs = ["sshpubkeys (>=3.1.0)"]
+eks = ["sshpubkeys (>=3.1.0)"]
 glue = ["pyparsing (>=3.0.7)"]
 iotdata = ["jsondiff (>=1.1.2)"]
 route53resolver = ["sshpubkeys (>=3.1.0)"]
 s3 = ["PyYAML (>=5.1)"]
-server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.4.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (<2.2.0)", "flask-cors", "graphql-core", "idna (>=2.5,<4)", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
-ssm = ["PyYAML (>=5.1)", "dataclasses"]
+server = ["PyYAML (>=5.1)", "aws-xray-sdk (>=0.93,!=0.96)", "cfn-lint (>=0.40.0)", "docker (>=2.5.1)", "ecdsa (!=0.15)", "flask (!=2.2.0,!=2.2.1)", "flask-cors", "graphql-core", "jsondiff (>=1.1.2)", "openapi-spec-validator (>=0.2.8)", "pyparsing (>=3.0.7)", "python-jose[cryptography] (>=3.1.0,<4.0.0)", "setuptools", "sshpubkeys (>=3.1.0)"]
+ssm = ["PyYAML (>=5.1)"]
 xray = ["aws-xray-sdk (>=0.93,!=0.96)", "setuptools"]
 
 [[package]]
@@ -1716,7 +1712,6 @@ python-versions = ">=3.6"
 files = [
     {file = "psycopg2-binary-2.9.3.tar.gz", hash = "sha256:761df5313dc15da1502b21453642d7599d26be88bff659382f8f9747c7ebea4e"},
     {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:539b28661b71da7c0e428692438efbcd048ca21ea81af618d845e06ebfd29478"},
-    {file = "psycopg2_binary-2.9.3-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:2f2534ab7dc7e776a263b463a16e189eb30e85ec9bbe1bff9e78dae802608932"},
     {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:6e82d38390a03da28c7985b394ec3f56873174e2c88130e6966cb1c946508e65"},
     {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:57804fc02ca3ce0dbfbef35c4b3a4a774da66d66ea20f4bda601294ad2ea6092"},
     {file = "psycopg2_binary-2.9.3-cp310-cp310-manylinux_2_24_aarch64.whl", hash = "sha256:083a55275f09a62b8ca4902dd11f4b33075b743cf0d360419e2051a8a5d5ff76"},
@@ -1750,7 +1745,6 @@ files = [
     {file = "psycopg2_binary-2.9.3-cp37-cp37m-win32.whl", hash = "sha256:adf20d9a67e0b6393eac162eb81fb10bc9130a80540f4df7e7355c2dd4af9fba"},
     {file = "psycopg2_binary-2.9.3-cp37-cp37m-win_amd64.whl", hash = "sha256:2f9ffd643bc7349eeb664eba8864d9e01f057880f510e4681ba40a6532f93c71"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:def68d7c21984b0f8218e8a15d514f714d96904265164f75f8d3a70f9c295667"},
-    {file = "psycopg2_binary-2.9.3-cp38-cp38-macosx_11_0_arm64.whl", hash = "sha256:e6aa71ae45f952a2205377773e76f4e3f27951df38e69a4c95440c779e013560"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:dffc08ca91c9ac09008870c9eb77b00a46b3378719584059c034b8945e26b272"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:280b0bb5cbfe8039205c7981cceb006156a675362a00fe29b16fbc264e242834"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-manylinux_2_24_aarch64.whl", hash = "sha256:af9813db73395fb1fc211bac696faea4ca9ef53f32dc0cfa27e4e7cf766dcf24"},
@@ -1762,7 +1756,6 @@ files = [
     {file = "psycopg2_binary-2.9.3-cp38-cp38-win32.whl", hash = "sha256:6472a178e291b59e7f16ab49ec8b4f3bdada0a879c68d3817ff0963e722a82ce"},
     {file = "psycopg2_binary-2.9.3-cp38-cp38-win_amd64.whl", hash = "sha256:35168209c9d51b145e459e05c31a9eaeffa9a6b0fd61689b48e07464ffd1a83e"},
     {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_10_14_x86_64.macosx_10_9_intel.macosx_10_9_x86_64.macosx_10_10_intel.macosx_10_10_x86_64.whl", hash = "sha256:47133f3f872faf28c1e87d4357220e809dfd3fa7c64295a4a148bcd1e6e34ec9"},
-    {file = "psycopg2_binary-2.9.3-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:b3a24a1982ae56461cc24f6680604fffa2c1b818e9dc55680da038792e004d18"},
     {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:91920527dea30175cc02a1099f331aa8c1ba39bf8b7762b7b56cbf54bc5cce42"},
     {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:887dd9aac71765ac0d0bac1d0d4b4f2c99d5f5c1382d8b770404f0f3d0ce8a39"},
     {file = "psycopg2_binary-2.9.3-cp39-cp39-manylinux_2_24_aarch64.whl", hash = "sha256:1f14c8b0942714eb3c74e1e71700cbbcb415acbc311c730370e70c578a44a25c"},
@@ -1795,7 +1788,18 @@ category = "main"
 optional = false
 python-versions = "*"
 files = [
+    {file = "pyasn1-0.4.8-py2.4.egg", hash = "sha256:fec3e9d8e36808a28efb59b489e4528c10ad0f480e57dcc32b4de5c9d8c9fdf3"},
+    {file = "pyasn1-0.4.8-py2.5.egg", hash = "sha256:0458773cfe65b153891ac249bcf1b5f8f320b7c2ce462151f8fa74de8934becf"},
+    {file = "pyasn1-0.4.8-py2.6.egg", hash = "sha256:5c9414dcfede6e441f7e8f81b43b34e834731003427e5b09e4e00e3172a10f00"},
+    {file = "pyasn1-0.4.8-py2.7.egg", hash = "sha256:6e7545f1a61025a4e58bb336952c5061697da694db1cae97b116e9c46abcf7c8"},
     {file = "pyasn1-0.4.8-py2.py3-none-any.whl", hash = "sha256:39c7e2ec30515947ff4e87fb6f456dfc6e84857d34be479c9d4a4ba4bf46aa5d"},
+    {file = "pyasn1-0.4.8-py3.1.egg", hash = "sha256:78fa6da68ed2727915c4767bb386ab32cdba863caa7dbe473eaae45f9959da86"},
+    {file = "pyasn1-0.4.8-py3.2.egg", hash = "sha256:08c3c53b75eaa48d71cf8c710312316392ed40899cb34710d092e96745a358b7"},
+    {file = "pyasn1-0.4.8-py3.3.egg", hash = "sha256:03840c999ba71680a131cfaee6fab142e1ed9bbd9c693e285cc6aca0d555e576"},
+    {file = "pyasn1-0.4.8-py3.4.egg", hash = "sha256:7ab8a544af125fb704feadb008c99a88805126fb525280b2270bb25cc1d78a12"},
+    {file = "pyasn1-0.4.8-py3.5.egg", hash = "sha256:e89bf84b5437b532b0803ba5c9a5e054d21fec423a89952a74f87fa2c9b7bce2"},
+    {file = "pyasn1-0.4.8-py3.6.egg", hash = "sha256:014c0e9976956a08139dc0712ae195324a75e142284d5f87f1a87ee1b068a359"},
+    {file = "pyasn1-0.4.8-py3.7.egg", hash = "sha256:99fcc3c8d804d1bc6d9a099921e39d827026409a58f2a720dcdb89374ea0c776"},
     {file = "pyasn1-0.4.8.tar.gz", hash = "sha256:aef77c9fb94a3ac588e87841208bdec464471d9871bd5050a287cc9a475cd0ba"},
 ]
 
@@ -2004,8 +2008,8 @@ files = [
 
 [package.dependencies]
 pytest = [
-    {version = ">=5.0", markers = "python_version < \"3.10\""},
     {version = ">=6.2.4", markers = "python_version >= \"3.10\""},
+    {version = ">=5.0", markers = "python_version < \"3.10\""},
 ]
 
 [[package]]
@@ -2082,18 +2086,6 @@ cryptography = ["cryptography (>=3.4.0)"]
 pycrypto = ["pyasn1", "pycrypto (>=2.6.0,<2.7.0)"]
 pycryptodome = ["pyasn1", "pycryptodome (>=3.3.1,<4.0.0)"]
 
-[[package]]
-name = "pytz"
-version = "2022.1"
-description = "World timezone definitions, modern and historical"
-category = "main"
-optional = false
-python-versions = "*"
-files = [
-    {file = "pytz-2022.1-py2.py3-none-any.whl", hash = "sha256:e68985985296d9a66a881eb3193b0906246245294a881e7c8afe623866ac6a5c"},
-    {file = "pytz-2022.1.tar.gz", hash = "sha256:1e760e2fe6a8163bc0b3d9a19c4f84342afa0a2affebfaa84b01b978a02ecaa7"},
-]
-
 [[package]]
 name = "pywin32"
 version = "301"
@@ -2129,13 +2121,6 @@ files = [
     {file = "PyYAML-6.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl", hash = "sha256:f84fbc98b019fef2ee9a1cb3ce93e3187a6df0b2538a651bfb890254ba9f90b5"},
     {file = "PyYAML-6.0-cp310-cp310-win32.whl", hash = "sha256:2cd5df3de48857ed0544b34e2d40e9fac445930039f3cfe4bcc592a1f836d513"},
     {file = "PyYAML-6.0-cp310-cp310-win_amd64.whl", hash = "sha256:daf496c58a8c52083df09b80c860005194014c3698698d1a57cbcfa182142a3a"},
-    {file = "PyYAML-6.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:d4b0ba9512519522b118090257be113b9468d804b19d63c71dbcf4a48fa32358"},
-    {file = "PyYAML-6.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:81957921f441d50af23654aa6c5e5eaf9b06aba7f0a19c18a538dc7ef291c5a1"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:afa17f5bc4d1b10afd4466fd3a44dc0e245382deca5b3c353d8b757f9e3ecb8d"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:dbad0e9d368bb989f4515da330b88a057617d16b6a8245084f1b05400f24609f"},
-    {file = "PyYAML-6.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:432557aa2c09802be39460360ddffd48156e30721f5e8d917f01d31694216782"},
-    {file = "PyYAML-6.0-cp311-cp311-win32.whl", hash = "sha256:bfaef573a63ba8923503d27530362590ff4f576c626d86a9fed95822a8255fd7"},
-    {file = "PyYAML-6.0-cp311-cp311-win_amd64.whl", hash = "sha256:01b45c0191e6d66c470b6cf1b9531a771a83c1c4208272ead47a3ae4f2f603bf"},
     {file = "PyYAML-6.0-cp36-cp36m-macosx_10_9_x86_64.whl", hash = "sha256:897b80890765f037df3403d22bab41627ca8811ae55e9a722fd0392850ec4d86"},
     {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:50602afada6d6cbfad699b0c7bb50d5ccffa7e46a3d738092afddc1f9758427f"},
     {file = "PyYAML-6.0-cp36-cp36m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:48c346915c114f5fdb3ead70312bd042a953a8ce5c7106d5bfb1a5254e47da92"},
@@ -2449,16 +2434,19 @@ test = ["websockets"]
 
 [[package]]
 name = "werkzeug"
-version = "2.1.2"
+version = "2.2.3"
 description = "The comprehensive WSGI web application library."
 category = "main"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "Werkzeug-2.1.2-py3-none-any.whl", hash = "sha256:72a4b735692dd3135217911cbeaa1be5fa3f62bffb8745c5215420a03dc55255"},
-    {file = "Werkzeug-2.1.2.tar.gz", hash = "sha256:1ce08e8093ed67d638d63879fd1ba3735817f7a80de3674d293f5984f25fb6e6"},
+    {file = "Werkzeug-2.2.3-py3-none-any.whl", hash = "sha256:56433961bc1f12533306c624f3be5e744389ac61d722175d543e1751285da612"},
+    {file = "Werkzeug-2.2.3.tar.gz", hash = "sha256:2e1ccc9417d4da358b9de6f174e3ac094391ea1d4fbef2d667865d819dfd0afe"},
 ]
 
+[package.dependencies]
+MarkupSafe = ">=2.1.1"
+
 [package.extras]
 watchdog = ["watchdog"]
 
@@ -2655,4 +2643,4 @@ testing = ["func-timeout", "jaraco.itertools", "pytest (>=6)", "pytest-black (>=
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.9"
-content-hash = "7563a38912963d8cf20c99acb06fe55623e65b799c4b88d37dc672e5384c96a3"
+content-hash = "3038940781ef59d1ed28cedf46120ad6623e21e602c38ad3c359428d79fa1efd"
diff --git a/pyproject.toml b/pyproject.toml
index d3d3948b9a..415f7f1ae7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -19,12 +19,12 @@ types-requests = "^2.28.5"
 types-psycopg2 = "^2.9.18"
 boto3 = "^1.26.16"
 boto3-stubs = {extras = ["s3"], version = "^1.26.16"}
-moto = {version = "^3.0.0", extras = ["server"]}
+moto = {extras = ["server"], version = "^4.1.2"}
 backoff = "^1.11.1"
 pytest-lazy-fixture = "^0.6.3"
 prometheus-client = "^0.14.1"
 pytest-timeout = "^2.1.0"
-Werkzeug = "2.1.2"
+Werkzeug = "^2.2.3"
 pytest-order = "^1.0.1"
 allure-pytest = "^2.10.0"
 pytest-asyncio = "^0.19.0"

From 501702b27c3d74c2d1b0e7f38c01770302abce32 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 17 Feb 2023 13:34:26 +0200
Subject: [PATCH 19/42] fix: flaky
 test_compaction_downloads_on_demand_with_image_creation (#3629)

fix is to stop postgres before the final checkpoint to ensure no
inmemory layer gets created.

Fixes #3627.
---
 test_runner/regress/test_ondemand_download.py | 50 +++++++++++--------
 1 file changed, 28 insertions(+), 22 deletions(-)

diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index f5f8491ada..eab9c41c57 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -555,32 +555,38 @@ def test_compaction_downloads_on_demand_with_image_creation(
     env.initial_tenant = tenant_id
     pageserver_http = env.pageserver.http_client()
 
-    with env.postgres.create_start("main") as pg:
-        # no particular reason to create the layers like this, but we are sure
-        # not to hit the image_creation_threshold here.
-        with pg.cursor() as cur:
-            cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
-            cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
-        wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
-        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+    pg = env.postgres.create_start("main")
 
-        for _ in range(0, 2):
-            for i in range(0, 3):
-                # create a minimal amount of "delta difficulty" for this table
-                with pg.cursor() as cur:
-                    cur.execute("update a set some_value = -some_value + %s", (i,))
+    # no particular reason to create the layers like this, but we are sure
+    # not to hit the image_creation_threshold here.
+    with pg.cursor() as cur:
+        cur.execute("create table a (id bigserial primary key, some_value bigint not null)")
+        cur.execute("insert into a(some_value) select i from generate_series(1, 10000) s(i)")
+    wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+    pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
 
-                with pg.cursor() as cur:
-                    # vacuuming should aid to reuse keys, though it's not really important
-                    # with image_creation_threshold=1 which we will use on the last compaction
-                    cur.execute("vacuum")
+    for i in range(0, 2):
+        for j in range(0, 3):
+            # create a minimal amount of "delta difficulty" for this table
+            with pg.cursor() as cur:
+                cur.execute("update a set some_value = -some_value + %s", (j,))
 
-                wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
-                pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+            with pg.cursor() as cur:
+                # vacuuming should aid to reuse keys, though it's not really important
+                # with image_creation_threshold=1 which we will use on the last compaction
+                cur.execute("vacuum")
 
-            # images should not yet be created, because threshold is too high,
-            # but these will be reshuffled to L1 layers
-            pageserver_http.timeline_compact(tenant_id, timeline_id)
+            wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)
+
+            if i == 1 and j == 2:
+                # last iteration; stop before checkpoint to avoid leaving an inmemory layer
+                pg.stop_and_destroy()
+
+            pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
+
+        # images should not yet be created, because threshold is too high,
+        # but these will be reshuffled to L1 layers
+        pageserver_http.timeline_compact(tenant_id, timeline_id)
 
     for _ in range(0, 20):
         # loop in case flushing is still in progress

From ae3eff1ad2d4fa2865439e2d26de98746803ff1f Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 17 Feb 2023 13:56:00 +0200
Subject: [PATCH 20/42] Tracing panic hook (#3475)

Fixes #3468.

This does change how the panics look, and most importantly, make sure
they are not interleaved with other messages. Adds a `GET /v1/panic`
endpoint for panic testing (useful for sentry dedup and this hook
testing).

The panics are now logged within a new error level span called `panic`
which separates it from other error level events. The panic info is
unpacked into span fields:
- thread=mgmt request worker
- location="pageserver/src/http/routes.rs:898:9"

Co-authored-by: Christian Schwarz <christian@neon.tech>
---
 pageserver/src/bin/pageserver.rs | 54 ++++++++++++++++++++++++++++++--
 pageserver/src/http/routes.rs    | 12 +++++++
 2 files changed, 63 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index 50eefa8c77..c499fd8d74 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -88,6 +88,13 @@ fn main() -> anyhow::Result<()> {
         }
     };
 
+    // Initialize logging, which must be initialized before the custom panic hook is installed.
+    logging::init(conf.log_format)?;
+
+    // disable the default rust panic hook by using `set_hook`. sentry will install it's own on top
+    // of this, always processing the panic before we log it.
+    std::panic::set_hook(Box::new(tracing_panic_hook));
+
     // initialize sentry if SENTRY_DSN is provided
     let _sentry_guard = init_sentry(
         Some(GIT_VERSION.into()),
@@ -210,9 +217,6 @@ fn start_pageserver(
     launch_ts: &'static LaunchTimestamp,
     conf: &'static PageServerConf,
 ) -> anyhow::Result<()> {
-    // Initialize logging
-    logging::init(conf.log_format)?;
-
     // Print version and launch timestamp to the log,
     // and expose them as prometheus metrics.
     // A changed version string indicates changed software.
@@ -495,6 +499,50 @@ fn cli() -> Command {
         )
 }
 
+/// Named symbol for our panic hook, which logs the panic.
+fn tracing_panic_hook(info: &std::panic::PanicInfo) {
+    // following rust 1.66.1 std implementation:
+    // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
+    let location = info.location();
+
+    let msg = match info.payload().downcast_ref::<&'static str>() {
+        Some(s) => *s,
+        None => match info.payload().downcast_ref::<String>() {
+            Some(s) => &s[..],
+            None => "Box<dyn Any>",
+        },
+    };
+
+    let thread = std::thread::current();
+    let thread = thread.name().unwrap_or("<unnamed>");
+    let backtrace = std::backtrace::Backtrace::capture();
+
+    struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
+
+    impl std::fmt::Display for PrettyLocation<'_, '_> {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
+        }
+    }
+
+    let _entered = if let Some(location) = location {
+        tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
+    } else {
+        // very unlikely to hit here, but the guarantees of std could change
+        tracing::error_span!("panic", %thread)
+    }
+    .entered();
+
+    if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
+        // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
+        // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
+        // string, maybe even to a TLS one but tracing already does that.
+        tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
+    } else {
+        tracing::error!("{msg}");
+    }
+}
+
 #[test]
 fn verify_cli() {
     cli().debug_assert();
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 7cd7e81fe1..71273159b7 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1029,6 +1029,17 @@ async fn active_timeline_of_active_tenant(
         .map_err(ApiError::NotFound)
 }
 
+async fn always_panic_handler(req: Request<Body>) -> Result<Response<Body>, ApiError> {
+    // Deliberately cause a panic to exercise the panic hook registered via std::panic::set_hook().
+    // For pageserver, the relevant panic hook is `tracing_panic_hook` , and the `sentry` crate's wrapper around it.
+    // Use catch_unwind to ensure that tokio nor hyper are distracted by our panic.
+    let query = req.uri().query();
+    let _ = std::panic::catch_unwind(|| {
+        panic!("unconditional panic for testing panic hook integration; request query: {query:?}")
+    });
+    json_response(StatusCode::NO_CONTENT, ())
+}
+
 async fn handler_404(_: Request<Body>) -> Result<Response<Body>, ApiError> {
     json_response(
         StatusCode::NOT_FOUND,
@@ -1147,5 +1158,6 @@ pub fn make_router(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/layer/:layer_file_name",
             evict_timeline_layer_handler,
         )
+        .get("/v1/panic", always_panic_handler)
         .any(handler_404))
 }

From 8e6b27bf7c5a78bd716c343f1113228d927399fb Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Fri, 17 Feb 2023 14:15:29 +0200
Subject: [PATCH 21/42] fix: avoid busy loop on replacement failure (#3613)

Add an AtomicBool per RemoteLayer, use it to mark together with closed
semaphore that remotelayer is unusable until restart or ignore+load.

https://github.com/neondatabase/neon/issues/3533#issuecomment-1431481554
---
 pageserver/src/tenant/layer_map.rs            |  6 ++
 .../src/tenant/storage_layer/remote_layer.rs  | 13 ++++
 pageserver/src/tenant/timeline.rs             | 43 +++++++++++--
 test_runner/fixtures/neon_fixtures.py         |  2 +
 test_runner/regress/test_ondemand_download.py | 62 +++++++++++++++++++
 5 files changed, 121 insertions(+), 5 deletions(-)

diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index e446e34f4e..8d7d9c6f8f 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -154,6 +154,12 @@ where
         expected: &Arc<L>,
         new: Arc<L>,
     ) -> anyhow::Result<Replacement<Arc<L>>> {
+        fail::fail_point!("layermap-replace-notfound", |_| Ok(
+            // this is not what happens if an L0 layer was not found a anyhow error but perhaps
+            // that should be changed. this is good enough to show a replacement failure.
+            Replacement::NotFound
+        ));
+
         self.layer_map.replace_historic_noflush(expected, new)
     }
 
diff --git a/pageserver/src/tenant/storage_layer/remote_layer.rs b/pageserver/src/tenant/storage_layer/remote_layer.rs
index 51bb4dcc2a..8465a99339 100644
--- a/pageserver/src/tenant/storage_layer/remote_layer.rs
+++ b/pageserver/src/tenant/storage_layer/remote_layer.rs
@@ -49,6 +49,17 @@ pub struct RemoteLayer {
     access_stats: LayerAccessStats,
 
     pub(crate) ongoing_download: Arc<tokio::sync::Semaphore>,
+
+    /// Has `LayerMap::replace` failed for this (true) or not (false).
+    ///
+    /// Used together with [`ongoing_download`] semaphore in `Timeline::download_remote_layer`.
+    /// The field is used to mark a RemoteLayer permanently (until restart or ignore+load)
+    /// unprocessable, because a LayerMap::replace failed.
+    ///
+    /// It is very unlikely to accumulate these in the Timeline's LayerMap, but having this avoids
+    /// a possible fast loop between `Timeline::get_reconstruct_data` and
+    /// `Timeline::download_remote_layer`, which also logs.
+    pub(crate) download_replacement_failure: std::sync::atomic::AtomicBool,
 }
 
 impl std::fmt::Debug for RemoteLayer {
@@ -207,6 +218,7 @@ impl RemoteLayer {
             file_name: fname.to_owned().into(),
             layer_metadata: layer_metadata.clone(),
             ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
             access_stats,
         }
     }
@@ -228,6 +240,7 @@ impl RemoteLayer {
             file_name: fname.to_owned().into(),
             layer_metadata: layer_metadata.clone(),
             ongoing_download: Arc::new(tokio::sync::Semaphore::new(1)),
+            download_replacement_failure: std::sync::atomic::AtomicBool::default(),
             access_stats,
         }
     }
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 66afe2cdce..683c2cd2d3 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3624,14 +3624,26 @@ impl Timeline {
         &self,
         remote_layer: Arc<RemoteLayer>,
     ) -> anyhow::Result<()> {
+        use std::sync::atomic::Ordering::Relaxed;
+
         let permit = match Arc::clone(&remote_layer.ongoing_download)
             .acquire_owned()
             .await
         {
             Ok(permit) => permit,
             Err(_closed) => {
-                info!("download of layer has already finished");
-                return Ok(());
+                if remote_layer.download_replacement_failure.load(Relaxed) {
+                    // this path will be hit often, in case there are upper retries. however
+                    // hitting this error will prevent a busy loop between get_reconstruct_data and
+                    // download, so an error is prefered.
+                    //
+                    // TODO: we really should poison the timeline, but panicking is not yet
+                    // supported. Related: https://github.com/neondatabase/neon/issues/3621
+                    anyhow::bail!("an earlier download succeeded but LayerMap::replace failed")
+                } else {
+                    info!("download of layer has already finished");
+                    return Ok(());
+                }
             }
         };
 
@@ -3667,8 +3679,8 @@ impl Timeline {
                     {
                         use crate::tenant::layer_map::Replacement;
                         let l: Arc<dyn PersistentLayer> = remote_layer.clone();
-                        match updates.replace_historic(&l, new_layer) {
-                            Ok(Replacement::Replaced { .. }) => { /* expected */ }
+                        let failure = match updates.replace_historic(&l, new_layer) {
+                            Ok(Replacement::Replaced { .. }) => false,
                             Ok(Replacement::NotFound) => {
                                 // TODO: the downloaded file should probably be removed, otherwise
                                 // it will be added to the layermap on next load? we should
@@ -3676,6 +3688,7 @@ impl Timeline {
                                 //
                                 // See: https://github.com/neondatabase/neon/issues/3533
                                 error!("replacing downloaded layer into layermap failed because layer was not found");
+                                true
                             }
                             Ok(Replacement::RemovalBuffered) => {
                                 unreachable!("current implementation does not remove anything")
@@ -3694,12 +3707,32 @@ impl Timeline {
                                     ?other,
                                     "replacing downloaded layer into layermap failed because another layer was found instead of expected"
                                 );
+                                true
                             }
                             Err(e) => {
                                 // this is a precondition failure, the layer filename derived
                                 // attributes didn't match up, which doesn't seem likely.
-                                error!("replacing downloaded layer into layermap failed: {e:#?}")
+                                error!("replacing downloaded layer into layermap failed: {e:#?}");
+                                true
                             }
+                        };
+
+                        if failure {
+                            // mark the remote layer permanently failed; the timeline is most
+                            // likely unusable after this. sadly we cannot just poison the layermap
+                            // lock with panic, because that would create an issue with shutdown.
+                            //
+                            // this does not change the retry semantics on failed downloads.
+                            //
+                            // use of Relaxed is valid because closing of the semaphore gives
+                            // happens-before and wakes up any waiters; we write this value before
+                            // and any waiters (or would be waiters) will load it after closing
+                            // semaphore.
+                            //
+                            // See: https://github.com/neondatabase/neon/issues/3533
+                            remote_layer
+                                .download_replacement_failure
+                                .store(true, Relaxed);
                         }
                     }
                     updates.flush();
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 0620ad8a35..ca5288fa0a 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1288,6 +1288,7 @@ class PageserverHttpClient(requests.Session):
         timeline_id: TimelineId,
         include_non_incremental_logical_size: bool = False,
         include_timeline_dir_layer_file_size_sum: bool = False,
+        **kwargs,
     ) -> Dict[Any, Any]:
         params = {}
         if include_non_incremental_logical_size:
@@ -1298,6 +1299,7 @@ class PageserverHttpClient(requests.Session):
         res = self.get(
             f"http://localhost:{self.port}/v1/tenant/{tenant_id}/timeline/{timeline_id}",
             params=params,
+            **kwargs,
         )
         self.verbose_error(res)
         res_json = res.json()
diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index eab9c41c57..09657470b6 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -10,6 +10,7 @@ import pytest
 from fixtures.log_helper import log
 from fixtures.neon_fixtures import (
     NeonEnvBuilder,
+    PageserverApiException,
     RemoteStorageKind,
     assert_tenant_status,
     available_remote_storages,
@@ -18,6 +19,7 @@ from fixtures.neon_fixtures import (
     wait_for_sk_commit_lsn_to_reach_remote_storage,
     wait_for_upload,
     wait_until,
+    wait_until_tenant_state,
 )
 from fixtures.types import Lsn
 from fixtures.utils import query_scalar
@@ -623,3 +625,63 @@ def test_compaction_downloads_on_demand_with_image_creation(
 
 def stringify(conf: Dict[str, Any]) -> Dict[str, str]:
     return dict(map(lambda x: (x[0], str(x[1])), conf.items()))
+
+
+@pytest.mark.parametrize("remote_storage_kind", [RemoteStorageKind.LOCAL_FS])
+def test_ondemand_download_failure_to_replace(
+    neon_env_builder: NeonEnvBuilder, remote_storage_kind: RemoteStorageKind
+):
+    """
+    Make sure that we fail on being unable to replace a RemoteLayer instead of for example livelocking.
+
+    See: https://github.com/neondatabase/neon/issues/3533
+    """
+
+    neon_env_builder.enable_remote_storage(
+        remote_storage_kind=remote_storage_kind,
+        test_name="test_ondemand_download_failure_to_replace",
+    )
+
+    # disable gc and compaction via default tenant config because config is lost while detaching
+    # so that compaction will not be the one to download the layer but the http handler is
+    neon_env_builder.pageserver_config_override = (
+        """tenant_config={gc_period = "0s", compaction_period = "0s"}"""
+    )
+
+    env = neon_env_builder.init_start()
+
+    tenant_id, timeline_id = env.neon_cli.create_tenant()
+
+    env.initial_tenant = tenant_id
+    pageserver_http = env.pageserver.http_client()
+
+    lsn = Lsn(pageserver_http.timeline_detail(tenant_id, timeline_id)["last_record_lsn"])
+
+    wait_for_upload(pageserver_http, tenant_id, timeline_id, lsn)
+
+    # remove layers so that they will be redownloaded
+    pageserver_http.tenant_detach(tenant_id)
+    pageserver_http.tenant_attach(tenant_id)
+
+    wait_until_tenant_state(pageserver_http, tenant_id, "Active", 5)
+    pageserver_http.configure_failpoints(("layermap-replace-notfound", "return"))
+
+    # requesting details with non-incremental size should trigger a download of the only layer
+    # this will need to be adjusted if an index for logical sizes is ever implemented
+    with pytest.raises(PageserverApiException):
+        # error message is not useful
+        pageserver_http.timeline_detail(tenant_id, timeline_id, True, timeout=2)
+
+    actual_message = (
+        ".* ERROR .*replacing downloaded layer into layermap failed because layer was not found"
+    )
+    assert env.pageserver.log_contains(actual_message) is not None
+    env.pageserver.allowed_errors.append(actual_message)
+
+    env.pageserver.allowed_errors.append(
+        ".* ERROR .*Error processing HTTP request: InternalServerError\\(get local timeline info"
+    )
+    # this might get to run and attempt on-demand, but not always
+    env.pageserver.allowed_errors.append(".* ERROR .*Task 'initial size calculation'")
+
+    # if the above returned, then we didn't have a livelock, and all is well

From 6f9af0aa8c74adc6f5d39c8645ed0b48f84c7313 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki@neon.tech>
Date: Thu, 26 Jan 2023 17:00:58 +0200
Subject: [PATCH 22/42] [proxy] Enable OpenTelemetry tracing.

This commit sets up OpenTelemetry tracing and exporter, so that they
can be exported as OpenTelemetry traces as well.

All outgoing HTTP requests will be traced. A separate (child)
span is created for each outgoing HTTP request, and the tracing
context is also propagated to the server in the HTTP headers.

If tracing is enabled in the control plane and compute node too, you
can now get an end-to-end distributed trace of what happens when a new
connection is established, starting from the handshake with the
client, creating the 'start_compute' operation in the control plane,
starting the compute node, all the way to down to fetching the base
backup and the availability checks in compute_ctl.

Co-authored-by: Dmitry Ivanov <dima@neon.tech>
---
 Cargo.lock                         | 64 ++++++++++++++++++++++++++++++
 Cargo.toml                         |  2 +
 proxy/Cargo.toml                   |  5 +++
 proxy/src/console/provider.rs      | 23 +++++++----
 proxy/src/console/provider/neon.rs |  5 +--
 proxy/src/http.rs                  | 41 ++++++++++++++-----
 proxy/src/logging.rs               | 46 +++++++++++++++++++++
 proxy/src/main.rs                  | 21 ++--------
 proxy/src/metrics.rs               | 17 ++++----
 workspace_hack/Cargo.toml          |  2 +-
 10 files changed, 176 insertions(+), 50 deletions(-)
 create mode 100644 proxy/src/logging.rs

diff --git a/Cargo.lock b/Cargo.lock
index 98c4dca09b..d154b4eaea 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2110,6 +2110,16 @@ version = "0.3.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "2a60c7ce501c71e03a9c9c0d35b861413ae925bd979cc7a4e30d060069aaac8d"
 
+[[package]]
+name = "mime_guess"
+version = "2.0.4"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4192263c238a5f0d0c6bfd21f336a313a4ce1c450542449ca191bb657b4642ef"
+dependencies = [
+ "mime",
+ "unicase",
+]
+
 [[package]]
 name = "minimal-lexical"
 version = "0.2.1"
@@ -2884,6 +2894,7 @@ dependencies = [
  "md5",
  "metrics",
  "once_cell",
+ "opentelemetry",
  "parking_lot",
  "pin-project-lite",
  "pq_proto",
@@ -2892,6 +2903,8 @@ dependencies = [
  "rcgen",
  "regex",
  "reqwest",
+ "reqwest-middleware",
+ "reqwest-tracing",
  "routerify",
  "rstest",
  "rustls",
@@ -2909,7 +2922,9 @@ dependencies = [
  "tokio-postgres-rustls",
  "tokio-rustls",
  "tracing",
+ "tracing-opentelemetry",
  "tracing-subscriber",
+ "tracing-utils",
  "url",
  "utils",
  "uuid",
@@ -3079,6 +3094,7 @@ dependencies = [
  "js-sys",
  "log",
  "mime",
+ "mime_guess",
  "once_cell",
  "percent-encoding",
  "pin-project-lite",
@@ -3098,6 +3114,36 @@ dependencies = [
  "winreg",
 ]
 
+[[package]]
+name = "reqwest-middleware"
+version = "0.2.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4a1c03e9011a8c59716ad13115550469e081e2e9892656b0ba6a47c907921894"
+dependencies = [
+ "anyhow",
+ "async-trait",
+ "http",
+ "reqwest",
+ "serde",
+ "task-local-extensions",
+ "thiserror",
+]
+
+[[package]]
+name = "reqwest-tracing"
+version = "0.4.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "b739d87a6b2cf4743968ad2b4cef648fbe0204c19999509824425babb2097bce"
+dependencies = [
+ "async-trait",
+ "opentelemetry",
+ "reqwest",
+ "reqwest-middleware",
+ "task-local-extensions",
+ "tracing",
+ "tracing-opentelemetry",
+]
+
 [[package]]
 name = "ring"
 version = "0.16.20"
@@ -3790,6 +3836,15 @@ dependencies = [
  "xattr",
 ]
 
+[[package]]
+name = "task-local-extensions"
+version = "0.1.3"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "4167afbec18ae012de40f8cf1b9bf48420abb390678c34821caa07d924941cc4"
+dependencies = [
+ "tokio",
+]
+
 [[package]]
 name = "tempfile"
 version = "3.3.0"
@@ -4360,6 +4415,15 @@ dependencies = [
  "libc",
 ]
 
+[[package]]
+name = "unicase"
+version = "2.6.0"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "50f37be617794602aabbeee0be4f259dc1778fabe05e2d67ee8f79326d5cb4f6"
+dependencies = [
+ "version_check",
+]
+
 [[package]]
 name = "unicode-bidi"
 version = "0.3.10"
diff --git a/Cargo.toml b/Cargo.toml
index 4e4667f253..99a3f56026 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -76,6 +76,8 @@ prost = "0.11"
 rand = "0.8"
 regex = "1.4"
 reqwest = { version = "0.11", default-features = false, features = ["rustls-tls"] }
+reqwest-tracing = { version = "0.4.0", features = ["opentelemetry_0_18"] }
+reqwest-middleware = "0.2.0"
 routerify = "3"
 rpds = "0.12.0"
 rustls = "0.20"
diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 152c83e4a0..96a62d2c49 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -28,6 +28,7 @@ itertools.workspace = true
 md5.workspace = true
 metrics.workspace = true
 once_cell.workspace = true
+opentelemetry.workspace = true
 parking_lot.workspace = true
 pin-project-lite.workspace = true
 pq_proto.workspace = true
@@ -35,6 +36,8 @@ prometheus.workspace = true
 rand.workspace = true
 regex.workspace = true
 reqwest = { workspace = true, features = ["json"] }
+reqwest-middleware.workspace = true
+reqwest-tracing.workspace = true
 routerify.workspace = true
 rustls-pemfile.workspace = true
 rustls.workspace = true
@@ -49,7 +52,9 @@ tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
 tokio.workspace = true
+tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
+tracing-utils.workspace = true
 tracing.workspace = true
 url.workspace = true
 utils.workspace = true
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index 7621aba19b..80cd94d483 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -11,8 +11,10 @@ use async_trait::async_trait;
 use std::sync::Arc;
 
 pub mod errors {
-    use crate::error::{io_error, UserFacingError};
-    use reqwest::StatusCode as HttpStatusCode;
+    use crate::{
+        error::{io_error, UserFacingError},
+        http,
+    };
     use thiserror::Error;
 
     /// A go-to error message which doesn't leak any detail.
@@ -24,7 +26,7 @@ pub mod errors {
         /// Error returned by the console itself.
         #[error("{REQUEST_FAILED} with {}: {}", .status, .text)]
         Console {
-            status: HttpStatusCode,
+            status: http::StatusCode,
             text: Box<str>,
         },
 
@@ -35,7 +37,7 @@ pub mod errors {
 
     impl ApiError {
         /// Returns HTTP status code if it's the reason for failure.
-        pub fn http_status_code(&self) -> Option<HttpStatusCode> {
+        pub fn http_status_code(&self) -> Option<http::StatusCode> {
             use ApiError::*;
             match self {
                 Console { status, .. } => Some(*status),
@@ -51,15 +53,15 @@ pub mod errors {
                 // To minimize risks, only select errors are forwarded to users.
                 // Ask @neondatabase/control-plane for review before adding more.
                 Console { status, .. } => match *status {
-                    HttpStatusCode::NOT_FOUND => {
+                    http::StatusCode::NOT_FOUND => {
                         // Status 404: failed to get a project-related resource.
                         format!("{REQUEST_FAILED}: endpoint cannot be found")
                     }
-                    HttpStatusCode::NOT_ACCEPTABLE => {
+                    http::StatusCode::NOT_ACCEPTABLE => {
                         // Status 406: endpoint is disabled (we don't allow connections).
                         format!("{REQUEST_FAILED}: endpoint is disabled")
                     }
-                    HttpStatusCode::LOCKED => {
+                    http::StatusCode::LOCKED => {
                         // Status 423: project might be in maintenance mode (or bad state).
                         format!("{REQUEST_FAILED}: endpoint is temporary unavailable")
                     }
@@ -70,13 +72,18 @@ pub mod errors {
         }
     }
 
-    // Helps eliminate graceless `.map_err` calls without introducing another ctor.
     impl From<reqwest::Error> for ApiError {
         fn from(e: reqwest::Error) -> Self {
             io_error(e).into()
         }
     }
 
+    impl From<reqwest_middleware::Error> for ApiError {
+        fn from(e: reqwest_middleware::Error) -> Self {
+            io_error(e).into()
+        }
+    }
+
     #[derive(Debug, Error)]
     pub enum GetAuthInfoError {
         // We shouldn't include the actual secret here.
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 4eca025d2d..3644db17f7 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -8,7 +8,6 @@ use super::{
 use crate::{auth::ClientCredentials, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
-use reqwest::StatusCode as HttpStatusCode;
 use tracing::{error, info, info_span, warn, Instrument};
 
 #[derive(Clone)]
@@ -52,7 +51,7 @@ impl Api {
                 Ok(body) => body,
                 // Error 404 is special: it's ok not to have a secret.
                 Err(e) => match e.http_status_code() {
-                    Some(HttpStatusCode::NOT_FOUND) => return Ok(None),
+                    Some(http::StatusCode::NOT_FOUND) => return Ok(None),
                     _otherwise => return Err(e.into()),
                 },
             };
@@ -154,7 +153,7 @@ impl super::Api for Api {
 
 /// Parse http response body, taking status code into account.
 async fn parse_body<T: for<'a> serde::Deserialize<'a>>(
-    response: reqwest::Response,
+    response: http::Response,
 ) -> Result<T, ApiError> {
     let status = response.status();
     if status.is_success() {
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index e847edc8bd..a544157800 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -1,7 +1,24 @@
+//! HTTP client and server impls.
+//! Other modules should use stuff from this module instead of
+//! directly relying on deps like `reqwest` (think loose coupling).
+
 pub mod server;
 pub mod websocket;
 
+pub use reqwest::{Request, Response, StatusCode};
+pub use reqwest_middleware::{ClientWithMiddleware, Error};
+
 use crate::url::ApiUrl;
+use reqwest_middleware::RequestBuilder;
+
+/// This is the preferred way to create new http clients,
+/// because it takes care of observability (OpenTelemetry).
+/// We deliberately don't want to replace this with a public static.
+pub fn new_client() -> ClientWithMiddleware {
+    reqwest_middleware::ClientBuilder::new(reqwest::Client::new())
+        .with(reqwest_tracing::TracingMiddleware::default())
+        .build()
+}
 
 /// Thin convenience wrapper for an API provided by an http endpoint.
 #[derive(Debug, Clone)]
@@ -9,13 +26,17 @@ pub struct Endpoint {
     /// API's base URL.
     endpoint: ApiUrl,
     /// Connection manager with built-in pooling.
-    client: reqwest::Client,
+    client: ClientWithMiddleware,
 }
 
 impl Endpoint {
     /// Construct a new HTTP endpoint wrapper.
-    pub fn new(endpoint: ApiUrl, client: reqwest::Client) -> Self {
-        Self { endpoint, client }
+    /// Http client is not constructed under the hood so that it can be shared.
+    pub fn new(endpoint: ApiUrl, client: impl Into<ClientWithMiddleware>) -> Self {
+        Self {
+            endpoint,
+            client: client.into(),
+        }
     }
 
     #[inline(always)]
@@ -23,19 +44,16 @@ impl Endpoint {
         &self.endpoint
     }
 
-    /// Return a [builder](reqwest::RequestBuilder) for a `GET` request,
+    /// Return a [builder](RequestBuilder) for a `GET` request,
     /// appending a single `path` segment to the base endpoint URL.
-    pub fn get(&self, path: &str) -> reqwest::RequestBuilder {
+    pub fn get(&self, path: &str) -> RequestBuilder {
         let mut url = self.endpoint.clone();
         url.path_segments_mut().push(path);
         self.client.get(url.into_inner())
     }
 
     /// Execute a [request](reqwest::Request).
-    pub async fn execute(
-        &self,
-        request: reqwest::Request,
-    ) -> Result<reqwest::Response, reqwest::Error> {
+    pub async fn execute(&self, request: Request) -> Result<Response, Error> {
         self.client.execute(request).await
     }
 }
@@ -43,11 +61,12 @@ impl Endpoint {
 #[cfg(test)]
 mod tests {
     use super::*;
+    use reqwest::Client;
 
     #[test]
     fn optional_query_params() -> anyhow::Result<()> {
         let url = "http://example.com".parse()?;
-        let endpoint = Endpoint::new(url, reqwest::Client::new());
+        let endpoint = Endpoint::new(url, Client::new());
 
         // Validate that this pattern makes sense.
         let req = endpoint
@@ -66,7 +85,7 @@ mod tests {
     #[test]
     fn uuid_params() -> anyhow::Result<()> {
         let url = "http://example.com".parse()?;
-        let endpoint = Endpoint::new(url, reqwest::Client::new());
+        let endpoint = Endpoint::new(url, Client::new());
 
         let req = endpoint
             .get("frobnicate")
diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs
new file mode 100644
index 0000000000..2baf824fc3
--- /dev/null
+++ b/proxy/src/logging.rs
@@ -0,0 +1,46 @@
+use tracing_opentelemetry::OpenTelemetryLayer;
+use tracing_subscriber::{
+    filter::{EnvFilter, LevelFilter},
+    prelude::*,
+};
+
+/// Initialize logging and OpenTelemetry tracing and exporter.
+///
+/// Logging can be configured using `RUST_LOG` environment variable.
+///
+/// OpenTelemetry is configured with OTLP/HTTP exporter. It picks up
+/// configuration from environment variables. For example, to change the
+/// destination, set `OTEL_EXPORTER_OTLP_ENDPOINT=http://jaeger:4318`.
+/// See <https://opentelemetry.io/docs/reference/specification/sdk-environment-variables>
+pub async fn init() -> anyhow::Result<LoggingGuard> {
+    let env_filter = EnvFilter::builder()
+        .with_default_directive(LevelFilter::INFO.into())
+        .from_env_lossy();
+
+    let fmt_layer = tracing_subscriber::fmt::layer()
+        .with_ansi(atty::is(atty::Stream::Stderr))
+        .with_writer(std::io::stderr)
+        .with_target(false);
+
+    let otlp_layer = tracing_utils::init_tracing("proxy")
+        .await
+        .map(OpenTelemetryLayer::new);
+
+    tracing_subscriber::registry()
+        .with(env_filter)
+        .with(otlp_layer)
+        .with(fmt_layer)
+        .try_init()?;
+
+    Ok(LoggingGuard)
+}
+
+pub struct LoggingGuard;
+
+impl Drop for LoggingGuard {
+    fn drop(&mut self) {
+        // Shutdown trace pipeline gracefully, so that it has a chance to send any
+        // pending traces before we exit.
+        tracing_utils::shutdown_tracing();
+    }
+}
diff --git a/proxy/src/main.rs b/proxy/src/main.rs
index 8812f77b62..54f49b5a3c 100644
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -12,6 +12,7 @@ mod config;
 mod console;
 mod error;
 mod http;
+mod logging;
 mod metrics;
 mod parse;
 mod proxy;
@@ -41,8 +42,7 @@ async fn flatten_err(
 
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
-    // First, initialize logging and troubleshooting subsystems.
-    init_tracing();
+    let _logging_guard = logging::init().await?;
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
 
     info!("Version: {GIT_VERSION}");
@@ -107,21 +107,6 @@ async fn main() -> anyhow::Result<()> {
     Ok(())
 }
 
-/// Tracing is used for logging and telemetry.
-fn init_tracing() {
-    tracing_subscriber::fmt()
-        .with_env_filter({
-            // This filter will examine the `RUST_LOG` env variable.
-            use tracing_subscriber::filter::{EnvFilter, LevelFilter};
-            EnvFilter::builder()
-                .with_default_directive(LevelFilter::INFO.into())
-                .from_env_lossy()
-        })
-        .with_ansi(atty::is(atty::Stream::Stdout))
-        .with_target(false)
-        .init();
-}
-
 /// ProxyConfig is created at proxy startup, and lives forever.
 fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> {
     let tls_config = match (
@@ -161,7 +146,7 @@ fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig>
             }));
 
             let url = args.get_one::<String>("auth-endpoint").unwrap().parse()?;
-            let endpoint = http::Endpoint::new(url, reqwest::Client::new());
+            let endpoint = http::Endpoint::new(url, http::new_client());
 
             let api = console::provider::neon::Api::new(endpoint, caches);
             auth::BackendType::Console(Cow::Owned(api), ())
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index d9aa4aec8c..83b28288ee 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,12 +1,11 @@
-//!
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-//!
+use crate::http;
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use serde::Serialize;
 use std::{collections::HashMap, time::Duration};
-use tracing::{debug, error, log::info, trace};
+use tracing::{debug, error, info, instrument, trace};
 
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 
@@ -40,15 +39,14 @@ pub async fn collect_metrics(
         metric_collection_endpoint
     );
 
-    // define client here to reuse it for all requests
-    let client = reqwest::Client::new();
+    let http_client = http::new_client();
     let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
 
     loop {
         tokio::select! {
             _ = ticker.tick() => {
 
-                match collect_metrics_iteration(&client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
+                match collect_metrics_iteration(&http_client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
                 {
                     Err(e) => {
                         error!("Failed to send consumption metrics: {} ", e);
@@ -60,7 +58,7 @@ pub async fn collect_metrics(
     }
 }
 
-pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
+fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
     let mut current_metrics: Vec<(Ids, (u64, DateTime<Utc>))> = Vec::new();
     let metrics = prometheus::default_registry().gather();
 
@@ -99,8 +97,9 @@ pub fn gather_proxy_io_bytes_per_client() -> Vec<(Ids, (u64, DateTime<Utc>))> {
     current_metrics
 }
 
-pub async fn collect_metrics_iteration(
-    client: &reqwest::Client,
+#[instrument(skip_all)]
+async fn collect_metrics_iteration(
+    client: &http::ClientWithMiddleware,
     cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
     metric_collection_endpoint: &reqwest::Url,
     hostname: String,
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 30a6d3a92b..68138b3df4 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -38,7 +38,7 @@ prost = { version = "0.11" }
 rand = { version = "0.8", features = ["small_rng"] }
 regex = { version = "1" }
 regex-syntax = { version = "0.6" }
-reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "rustls-tls"] }
+reqwest = { version = "0.11", default-features = false, features = ["blocking", "json", "multipart", "rustls-tls"] }
 ring = { version = "0.16", features = ["std"] }
 rustls = { version = "0.20", features = ["dangerous_configuration"] }
 scopeguard = { version = "1" }

From 956b6f17ca35f002d1dcb74a7c803db798d43c94 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ivadmi5@gmail.com>
Date: Fri, 17 Feb 2023 13:16:30 +0300
Subject: [PATCH 23/42] [proxy] Handle some unix signals.

On the surface, this doesn't add much, but there are some benefits:

* We can do graceful shutdowns and thus record more code coverage data.

* We now have a foundation for the more interesting behaviors, e.g. "stop
  accepting new connections after SIGTERM but keep serving the existing ones".

* We give the otel machinery a chance to flush trace events before
  finally shutting down.
---
 proxy/Cargo.toml                      |  2 +-
 proxy/src/config.rs                   |  1 +
 proxy/src/console/mgmt.rs             | 20 +++++----
 proxy/src/http/websocket.rs           |  2 +-
 proxy/src/logging.rs                  |  1 +
 proxy/src/main.rs                     | 58 ++++++++++++++++++---------
 proxy/src/metrics.rs                  | 48 ++++++++++------------
 test_runner/fixtures/neon_fixtures.py | 10 +++--
 workspace_hack/Cargo.toml             |  2 +-
 9 files changed, 79 insertions(+), 65 deletions(-)

diff --git a/proxy/Cargo.toml b/proxy/Cargo.toml
index 96a62d2c49..030a5f1d6e 100644
--- a/proxy/Cargo.toml
+++ b/proxy/Cargo.toml
@@ -51,7 +51,7 @@ thiserror.workspace = true
 tls-listener.workspace = true
 tokio-postgres.workspace = true
 tokio-rustls.workspace = true
-tokio.workspace = true
+tokio = { workspace = true, features = ["signal"] }
 tracing-opentelemetry.workspace = true
 tracing-subscriber.workspace = true
 tracing-utils.workspace = true
diff --git a/proxy/src/config.rs b/proxy/src/config.rs
index 5e285f3625..600db7f8ec 100644
--- a/proxy/src/config.rs
+++ b/proxy/src/config.rs
@@ -8,6 +8,7 @@ pub struct ProxyConfig {
     pub metric_collection: Option<MetricCollectionConfig>,
 }
 
+#[derive(Debug)]
 pub struct MetricCollectionConfig {
     pub endpoint: reqwest::Url,
     pub interval: Duration,
diff --git a/proxy/src/console/mgmt.rs b/proxy/src/console/mgmt.rs
index 51a117d3b7..c00c06fbb7 100644
--- a/proxy/src/console/mgmt.rs
+++ b/proxy/src/console/mgmt.rs
@@ -5,10 +5,7 @@ use crate::{
 use anyhow::Context;
 use once_cell::sync::Lazy;
 use pq_proto::{BeMessage, SINGLE_COL_ROWDESC};
-use std::{
-    net::{TcpListener, TcpStream},
-    thread,
-};
+use std::{net::TcpStream, thread};
 use tracing::{error, info, info_span};
 use utils::{
     postgres_backend::{self, AuthType, PostgresBackend},
@@ -34,23 +31,24 @@ pub fn notify(psql_session_id: &str, msg: ComputeReady) -> Result<(), waiters::N
     CPLANE_WAITERS.notify(psql_session_id, msg)
 }
 
-/// Console management API listener thread.
+/// Console management API listener task.
 /// It spawns console response handlers needed for the link auth.
-pub fn thread_main(listener: TcpListener) -> anyhow::Result<()> {
+pub async fn task_main(listener: tokio::net::TcpListener) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("mgmt has shut down");
     }
 
-    listener
-        .set_nonblocking(false)
-        .context("failed to set listener to blocking")?;
-
     loop {
-        let (socket, peer_addr) = listener.accept().context("failed to accept a new client")?;
+        let (socket, peer_addr) = listener.accept().await?;
         info!("accepted connection from {peer_addr}");
+
+        let socket = socket.into_std()?;
         socket
             .set_nodelay(true)
             .context("failed to set client socket option")?;
+        socket
+            .set_nonblocking(false)
+            .context("failed to set client socket option")?;
 
         // TODO: replace with async tasks.
         thread::spawn(move || {
diff --git a/proxy/src/http/websocket.rs b/proxy/src/http/websocket.rs
index d4235c2c38..1757652a90 100644
--- a/proxy/src/http/websocket.rs
+++ b/proxy/src/http/websocket.rs
@@ -186,8 +186,8 @@ async fn ws_handler(
 }
 
 pub async fn task_main(
-    ws_listener: TcpListener,
     config: &'static ProxyConfig,
+    ws_listener: TcpListener,
 ) -> anyhow::Result<()> {
     scopeguard::defer! {
         info!("websocket server has shut down");
diff --git a/proxy/src/logging.rs b/proxy/src/logging.rs
index 2baf824fc3..0c8c2858b9 100644
--- a/proxy/src/logging.rs
+++ b/proxy/src/logging.rs
@@ -41,6 +41,7 @@ impl Drop for LoggingGuard {
     fn drop(&mut self) {
         // Shutdown trace pipeline gracefully, so that it has a chance to send any
         // pending traces before we exit.
+        tracing::info!("shutting down the tracing machinery");
         tracing_utils::shutdown_tracing();
     }
 }
diff --git a/proxy/src/main.rs b/proxy/src/main.rs
index 54f49b5a3c..c319cb9cfc 100644
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -28,7 +28,7 @@ use config::ProxyConfig;
 use futures::FutureExt;
 use std::{borrow::Cow, future::Future, net::SocketAddr};
 use tokio::{net::TcpListener, task::JoinError};
-use tracing::{info, info_span, Instrument};
+use tracing::{info, warn};
 use utils::{project_git_version, sentry_init::init_sentry};
 
 project_git_version!(GIT_VERSION);
@@ -60,16 +60,17 @@ async fn main() -> anyhow::Result<()> {
 
     let mgmt_address: SocketAddr = args.get_one::<String>("mgmt").unwrap().parse()?;
     info!("Starting mgmt on {mgmt_address}");
-    let mgmt_listener = TcpListener::bind(mgmt_address).await?.into_std()?;
+    let mgmt_listener = TcpListener::bind(mgmt_address).await?;
 
     let proxy_address: SocketAddr = args.get_one::<String>("proxy").unwrap().parse()?;
     info!("Starting proxy on {proxy_address}");
     let proxy_listener = TcpListener::bind(proxy_address).await?;
 
     let mut tasks = vec![
+        tokio::spawn(handle_signals()),
         tokio::spawn(http::server::task_main(http_listener)),
         tokio::spawn(proxy::task_main(config, proxy_listener)),
-        tokio::task::spawn_blocking(move || console::mgmt::thread_main(mgmt_listener)),
+        tokio::spawn(console::mgmt::task_main(mgmt_listener)),
     ];
 
     if let Some(wss_address) = args.get_one::<String>("wss") {
@@ -78,35 +79,52 @@ async fn main() -> anyhow::Result<()> {
         let wss_listener = TcpListener::bind(wss_address).await?;
 
         tasks.push(tokio::spawn(http::websocket::task_main(
-            wss_listener,
             config,
+            wss_listener,
         )));
     }
 
-    // TODO: refactor.
-    if let Some(metric_collection) = &config.metric_collection {
-        let hostname = hostname::get()?
-            .into_string()
-            .map_err(|e| anyhow::anyhow!("failed to get hostname {e:?}"))?;
-
-        tasks.push(tokio::spawn(
-            metrics::collect_metrics(
-                &metric_collection.endpoint,
-                metric_collection.interval,
-                hostname,
-            )
-            .instrument(info_span!("collect_metrics")),
-        ));
+    if let Some(metrics_config) = &config.metric_collection {
+        tasks.push(tokio::spawn(metrics::task_main(metrics_config)));
     }
 
-    // This will block until all tasks have completed.
-    // Furthermore, the first one to fail will cancel the rest.
+    // This combinator will block until either all tasks complete or
+    // one of them finishes with an error (others will be cancelled).
     let tasks = tasks.into_iter().map(flatten_err);
     let _: Vec<()> = futures::future::try_join_all(tasks).await?;
 
     Ok(())
 }
 
+/// Handle unix signals appropriately.
+async fn handle_signals() -> anyhow::Result<()> {
+    use tokio::signal::unix::{signal, SignalKind};
+
+    let mut hangup = signal(SignalKind::hangup())?;
+    let mut interrupt = signal(SignalKind::interrupt())?;
+    let mut terminate = signal(SignalKind::terminate())?;
+
+    loop {
+        tokio::select! {
+            // Hangup is commonly used for config reload.
+            _ = hangup.recv() => {
+                warn!("received SIGHUP; config reload is not supported");
+            }
+            // Shut down the whole application.
+            _ = interrupt.recv() => {
+                warn!("received SIGINT, exiting immediately");
+                bail!("interrupted");
+            }
+            // TODO: Don't accept new proxy connections.
+            // TODO: Shut down once all exisiting connections have been closed.
+            _ = terminate.recv() => {
+                warn!("received SIGTERM, exiting immediately");
+                bail!("terminated");
+            }
+        }
+    }
+}
+
 /// ProxyConfig is created at proxy startup, and lives forever.
 fn build_config(args: &clap::ArgMatches) -> anyhow::Result<&'static ProxyConfig> {
     let tls_config = match (
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 83b28288ee..8bbae9638b 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -1,10 +1,10 @@
 //! Periodically collect proxy consumption metrics
 //! and push them to a HTTP endpoint.
-use crate::http;
+use crate::{config::MetricCollectionConfig, http};
 use chrono::{DateTime, Utc};
 use consumption_metrics::{idempotency_key, Event, EventChunk, EventType, CHUNK_SIZE};
 use serde::Serialize;
-use std::{collections::HashMap, time::Duration};
+use std::collections::HashMap;
 use tracing::{debug, error, info, instrument, trace};
 
 const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
@@ -23,37 +23,31 @@ pub struct Ids {
     pub endpoint_id: String,
 }
 
-pub async fn collect_metrics(
-    metric_collection_endpoint: &reqwest::Url,
-    metric_collection_interval: Duration,
-    hostname: String,
-) -> anyhow::Result<()> {
+pub async fn task_main(config: &MetricCollectionConfig) -> anyhow::Result<()> {
+    info!("metrics collector config: {config:?}");
     scopeguard::defer! {
-        info!("collect_metrics has shut down");
+        info!("metrics collector has shut down");
     }
 
-    let mut ticker = tokio::time::interval(metric_collection_interval);
-
-    info!(
-        "starting collect_metrics. metric_collection_endpoint: {}",
-        metric_collection_endpoint
-    );
-
     let http_client = http::new_client();
     let mut cached_metrics: HashMap<Ids, (u64, DateTime<Utc>)> = HashMap::new();
+    let hostname = hostname::get()?.as_os_str().to_string_lossy().into_owned();
 
+    let mut ticker = tokio::time::interval(config.interval);
     loop {
-        tokio::select! {
-            _ = ticker.tick() => {
+        ticker.tick().await;
 
-                match collect_metrics_iteration(&http_client, &mut cached_metrics, metric_collection_endpoint, hostname.clone()).await
-                {
-                    Err(e) => {
-                        error!("Failed to send consumption metrics: {} ", e);
-                    },
-                    Ok(_) => { trace!("collect_metrics_iteration completed successfully") },
-                }
-            }
+        let res = collect_metrics_iteration(
+            &http_client,
+            &mut cached_metrics,
+            &config.endpoint,
+            &hostname,
+        )
+        .await;
+
+        match res {
+            Err(e) => error!("failed to send consumption metrics: {e} "),
+            Ok(_) => trace!("periodic metrics collection completed successfully"),
         }
     }
 }
@@ -102,7 +96,7 @@ async fn collect_metrics_iteration(
     client: &http::ClientWithMiddleware,
     cached_metrics: &mut HashMap<Ids, (u64, DateTime<Utc>)>,
     metric_collection_endpoint: &reqwest::Url,
-    hostname: String,
+    hostname: &str,
 ) -> anyhow::Result<()> {
     info!(
         "starting collect_metrics_iteration. metric_collection_endpoint: {}",
@@ -133,7 +127,7 @@ async fn collect_metrics_iteration(
                     stop_time: *curr_time,
                 },
                 metric: PROXY_IO_BYTES_PER_CLIENT,
-                idempotency_key: idempotency_key(hostname.clone()),
+                idempotency_key: idempotency_key(hostname.to_owned()),
                 value,
                 extra: Ids {
                     endpoint_id: curr_key.endpoint_id.clone(),
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index ca5288fa0a..59d616eb6f 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2529,15 +2529,17 @@ class NeonProxy(PgProtocol):
         tb: Optional[TracebackType],
     ):
         if self._popen is not None:
-            # NOTE the process will die when we're done with tests anyway, because
-            # it's a child process. This is mostly to clean up in between different tests.
-            self._popen.kill()
+            self._popen.terminate()
+            try:
+                self._popen.wait(timeout=5)
+            except subprocess.TimeoutExpired:
+                log.warn("failed to gracefully terminate proxy; killing")
+                self._popen.kill()
 
     @staticmethod
     async def activate_link_auth(
         local_vanilla_pg, proxy_with_metric_collector, psql_session_id, create_user=True
     ):
-
         pg_user = "proxy"
 
         if create_user:
diff --git a/workspace_hack/Cargo.toml b/workspace_hack/Cargo.toml
index 68138b3df4..c0cf3c5611 100644
--- a/workspace_hack/Cargo.toml
+++ b/workspace_hack/Cargo.toml
@@ -45,7 +45,7 @@ scopeguard = { version = "1" }
 serde = { version = "1", features = ["alloc", "derive"] }
 serde_json = { version = "1", features = ["raw_value"] }
 socket2 = { version = "0.4", default-features = false, features = ["all"] }
-tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "sync", "time"] }
+tokio = { version = "1", features = ["fs", "io-std", "io-util", "macros", "net", "process", "rt-multi-thread", "signal", "sync", "time"] }
 tokio-util = { version = "0.7", features = ["codec", "io"] }
 tonic = { version = "0.8", features = ["tls-roots"] }
 tower = { version = "0.4", features = ["balance", "buffer", "limit", "retry", "timeout", "util"] }

From d90cd36bcc014fef63e35964a42ffc4ed3e18424 Mon Sep 17 00:00:00 2001
From: Dmitry Ivanov <ivadmi5@gmail.com>
Date: Fri, 17 Feb 2023 13:32:45 +0300
Subject: [PATCH 24/42] [proxy] Improve tracing spans here and there.

---
 proxy/src/auth/backend.rs | 1 +
 proxy/src/proxy.rs        | 9 +++++----
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index 42b2304bb8..b8599adaeb 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -140,6 +140,7 @@ async fn auth_quirks(
 
 impl BackendType<'_, ClientCredentials<'_>> {
     /// Authenticate the client via the requested backend, possibly using credentials.
+    #[tracing::instrument(fields(allow_cleartext), skip_all)]
     pub async fn authenticate(
         &mut self,
         extra: &ConsoleReqExtra<'_>,
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 0dc48f1212..9642047812 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -17,7 +17,7 @@ use once_cell::sync::Lazy;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
 use std::sync::Arc;
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{error, info, warn};
 
 /// Number of times we should retry the `/proxy_wake_compute` http request.
 const NUM_RETRIES_WAKE_COMPUTE: usize = 1;
@@ -91,13 +91,13 @@ pub async fn task_main(
             .unwrap_or_else(|e| {
                 // Acknowledge that the task has finished with an error.
                 error!("per-client task finished with an error: {e:#}");
-            })
-            .instrument(info_span!("client", session = format_args!("{session_id}"))),
+            }),
         );
     }
 }
 
 // TODO(tech debt): unite this with its twin below.
+#[tracing::instrument(fields(session_id), skip_all)]
 pub async fn handle_ws_client(
     config: &'static ProxyConfig,
     cancel_map: &CancelMap,
@@ -139,6 +139,7 @@ pub async fn handle_ws_client(
         .await
 }
 
+#[tracing::instrument(fields(session_id), skip_all)]
 async fn handle_client(
     config: &'static ProxyConfig,
     cancel_map: &CancelMap,
@@ -423,9 +424,9 @@ impl<S: AsyncRead + AsyncWrite + Unpin> Client<'_, S> {
             let res = creds
                 .authenticate(&extra, &mut stream, allow_cleartext)
                 .await;
+
             async { res }.or_else(|e| stream.throw_error(e)).await
         }
-        .instrument(info_span!("auth"))
         .await?;
 
         let AuthSuccess {

From b242b0ad6787f23ee008527fdf690ccc2012694f Mon Sep 17 00:00:00 2001
From: Konstantin Knizhnik <knizhnik@garret.ru>
Date: Fri, 17 Feb 2023 15:56:56 +0200
Subject: [PATCH 25/42] Fix flaky tests (#3616)

## Describe your changes

test_on_demand_download is flaky because not waiting until created image
layer is transferred to S3.
test_tenants_with_remote_storage just leaves garbage at the end of
overwritten file.

Right solution for test_on_demand_download is to add some API call to
wait completion of synchronization with S3 (not just based on last
record LSN). But right now it is solved using sleep.

## Issue ticket number and link

#3209

## Checklist before requesting a review
- [ ] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 test_runner/regress/test_ondemand_download.py           | 3 +++
 test_runner/regress/test_tenants_with_remote_storage.py | 1 +
 2 files changed, 4 insertions(+)

diff --git a/test_runner/regress/test_ondemand_download.py b/test_runner/regress/test_ondemand_download.py
index 09657470b6..5ee94de32d 100644
--- a/test_runner/regress/test_ondemand_download.py
+++ b/test_runner/regress/test_ondemand_download.py
@@ -218,6 +218,9 @@ def test_ondemand_download_timetravel(
     log.info(filled_size)
     assert filled_current_physical == filled_size, "we don't yet do layer eviction"
 
+    # Wait until generated image layers are uploaded to S3
+    time.sleep(3)
+
     env.pageserver.stop()
 
     # remove all the layer files
diff --git a/test_runner/regress/test_tenants_with_remote_storage.py b/test_runner/regress/test_tenants_with_remote_storage.py
index 6da6a4d446..769bc10280 100644
--- a/test_runner/regress/test_tenants_with_remote_storage.py
+++ b/test_runner/regress/test_tenants_with_remote_storage.py
@@ -280,6 +280,7 @@ def test_tenant_upgrades_index_json_from_v0(
 
         timeline_file.seek(0)
         json.dump(v0_index_part, timeline_file)
+        timeline_file.truncate(timeline_file.tell())
 
     env.pageserver.start()
     pageserver_http = env.pageserver.http_client()

From 40799d8ae760e188b9e0770ee94075476bdfeb21 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Fri, 17 Feb 2023 16:49:40 +0200
Subject: [PATCH 26/42] Add debug messages to catch abnormal consumption metric
 values

---
 pageserver/src/consumption_metrics.rs | 8 +++++++-
 proxy/src/metrics.rs                  | 8 +++++++-
 2 files changed, 14 insertions(+), 2 deletions(-)

diff --git a/pageserver/src/consumption_metrics.rs b/pageserver/src/consumption_metrics.rs
index d1383b33cb..8916d4f1c9 100644
--- a/pageserver/src/consumption_metrics.rs
+++ b/pageserver/src/consumption_metrics.rs
@@ -25,7 +25,7 @@ const REMOTE_STORAGE_SIZE: &str = "remote_storage_size";
 const TIMELINE_LOGICAL_SIZE: &str = "timeline_logical_size";
 
 #[serde_as]
-#[derive(Serialize)]
+#[derive(Serialize, Debug)]
 struct Ids {
     #[serde_as(as = "DisplayFromStr")]
     tenant_id: TenantId,
@@ -287,6 +287,12 @@ pub async fn collect_metrics_iteration(
                     }
                 } else {
                     error!("metrics endpoint refused the sent metrics: {:?}", res);
+                    for metric in chunk_to_send.iter() {
+                        // Report if the metric value is suspiciously large
+                        if metric.value > (1u64 << 40) {
+                            error!("potentially abnormal metric value: {:?}", metric);
+                        }
+                    }
                 }
             }
             Err(err) => {
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
index 8bbae9638b..3b28346872 100644
--- a/proxy/src/metrics.rs
+++ b/proxy/src/metrics.rs
@@ -18,7 +18,7 @@ const PROXY_IO_BYTES_PER_CLIENT: &str = "proxy_io_bytes_per_client";
 /// so while the project-id is unique across regions the whole pipeline will work correctly
 /// because we enrich the event with project_id in the control-plane endpoint.
 ///
-#[derive(Eq, Hash, PartialEq, Serialize)]
+#[derive(Eq, Hash, PartialEq, Serialize, Debug)]
 pub struct Ids {
     pub endpoint_id: String,
 }
@@ -183,6 +183,12 @@ async fn collect_metrics_iteration(
             }
         } else {
             error!("metrics endpoint refused the sent metrics: {:?}", res);
+            for metric in chunk.iter() {
+                // Report if the metric value is suspiciously large
+                if metric.value > (1u64 << 40) {
+                    error!("potentially abnormal metric value: {:?}", metric);
+                }
+            }
         }
     }
     Ok(())

From 53128d56d9b3692873416e5d618091508d56e504 Mon Sep 17 00:00:00 2001
From: Anastasia Lubennikova <anastasia@neon.tech>
Date: Fri, 17 Feb 2023 16:57:37 +0200
Subject: [PATCH 27/42] Fix make clean: Use correct paths in neon-pg-ext-clean

---
 Makefile | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/Makefile b/Makefile
index 92a4532684..e04a82c7c9 100644
--- a/Makefile
+++ b/Makefile
@@ -136,9 +136,15 @@ neon-pg-ext-%: postgres-%
 
 .PHONY: neon-pg-ext-clean-%
 neon-pg-ext-clean-%:
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_walredo-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
-	$(MAKE) -C $(POSTGRES_INSTALL_DIR)/pgxn/neon_test_utils-$* -f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-walredo-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon_walredo/Makefile clean
+	$(MAKE) PG_CONFIG=$(POSTGRES_INSTALL_DIR)/$*/bin/pg_config \
+	-C $(POSTGRES_INSTALL_DIR)/build/neon-test-utils-$* \
+	-f $(ROOT_PROJECT_DIR)/pgxn/neon_test_utils/Makefile clean
 
 .PHONY: neon-pg-ext
 neon-pg-ext: \

From 8d28a24b26aeb5d7dc4ed3f13930971b6b381e49 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Fri, 17 Feb 2023 17:32:01 +0100
Subject: [PATCH 28/42] staging: enable automatic layer eviction at 20m
 threshold + period (#3636)

What it says on the tin.

Part of #2476
---
 .github/ansible/staging.eu-west-1.hosts.yaml | 5 +++++
 .github/ansible/staging.us-east-2.hosts.yaml | 5 +++++
 2 files changed, 10 insertions(+)

diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml
index f28dc8e07b..d19026b3f7 100644
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -8,6 +8,11 @@ storage:
       pg_distrib_dir: /usr/local
       metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
       metric_collection_interval: 10min
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "20m"
+          threshold: "20m"
       remote_storage:
         bucket_name: "{{ bucket_name }}"
         bucket_region: "{{ bucket_region }}"
diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
index 9a1a095282..a8b9d41a65 100644
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -8,6 +8,11 @@ storage:
       pg_distrib_dir: /usr/local
       metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
       metric_collection_interval: 10min
+      tenant_config:
+        eviction_policy:
+          kind: "LayerAccessThreshold"
+          period: "20m"
+          threshold: "20m"
       remote_storage:
         bucket_name: "{{ bucket_name }}"
         bucket_region: "{{ bucket_region }}"

From 564fa11244ef23889f1402af64c8cb7745153200 Mon Sep 17 00:00:00 2001
From: Alexander Bayandin <alexander@neon.tech>
Date: Fri, 17 Feb 2023 18:18:23 +0000
Subject: [PATCH 29/42] Update Postgres extensions (#3615)

- Update postgis from 3.3.1 from 3.3.2
- Update plv8 from 3.1.4 to 3.1.5
- Update h3-pg from 4.0.1 to 4.1.2 (and underlying h3 from 4.0.1 to 4.1.0)
---
 Dockerfile.compute-node | 62 +++++++++++++++--------------------------
 1 file changed, 22 insertions(+), 40 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index f4479c46cb..95d6237ecf 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -50,15 +50,15 @@ RUN apt update && \
     libcgal-dev libgdal-dev libgmp-dev libmpfr-dev libopenscenegraph-dev libprotobuf-c-dev \
     protobuf-c-compiler xsltproc
 
-RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz && \
-    tar zxvf SFCGAL-v1.3.10.tar.gz && \
-    cd SFCGAL-v1.3.10 && cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
+# SFCGAL > 1.3 requires CGAL > 5.2, Bullseye's libcgal-dev is 5.2
+RUN wget https://gitlab.com/Oslandia/SFCGAL/-/archive/v1.3.10/SFCGAL-v1.3.10.tar.gz -O SFCGAL.tar.gz && \
+    mkdir sfcgal-src && cd sfcgal-src && tar xvzf ../SFCGAL.tar.gz --strip-components=1 -C . && \
+    cmake . && make -j $(getconf _NPROCESSORS_ONLN) && \
     DESTDIR=/sfcgal make install -j $(getconf _NPROCESSORS_ONLN) && \
     make clean && cp -R /sfcgal/* /
 
-RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
-    tar xvzf postgis-3.3.1.tar.gz && \
-    cd postgis-3.3.1 && \
+RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.2.tar.gz -O postgis.tar.gz && \
+    mkdir postgis-src && cd postgis-src && tar xvzf ../postgis.tar.gz --strip-components=1 -C . && \
     ./autogen.sh && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
     ./configure --with-sfcgal=/usr/local/bin/sfcgal-config && \
@@ -83,29 +83,14 @@ RUN wget https://download.osgeo.org/postgis/source/postgis-3.3.1.tar.gz && \
 FROM build-deps AS plv8-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 RUN apt update && \
-    apt install -y ninja-build python3-dev libc++-dev libc++abi-dev libncurses5 binutils
+    apt install -y ninja-build python3-dev libncurses5 binutils clang
 
-# https://github.com/plv8/plv8/issues/475:
-#   v8 uses gold for linking and sets `--thread-count=4` which breaks
-#   gold version <= 1.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=23607)
-# Install newer gold version manually as debian-testing binutils version updates
-# libc version, which in turn breaks other extension built against non-testing libc.
-RUN wget https://ftp.gnu.org/gnu/binutils/binutils-2.38.tar.gz && \
-    tar xvzf binutils-2.38.tar.gz && \
-    cd binutils-2.38 && \
-    cd libiberty && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && \
-    cd ../bfd && ./configure && make bfdver.h && \
-    cd ../gold && ./configure && make -j $(getconf _NPROCESSORS_ONLN) && make install && \
-    cp /usr/local/bin/ld.gold /usr/bin/gold
-
-# Sed is used to patch for https://github.com/plv8/plv8/issues/503
-RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.4.tar.gz && \
-    tar xvzf v3.1.4.tar.gz && \
-    cd plv8-3.1.4 && \
+RUN wget https://github.com/plv8/plv8/archive/refs/tags/v3.1.5.tar.gz -O plv8.tar.gz && \
+    mkdir plv8-src && cd plv8-src && tar xvzf ../plv8.tar.gz --strip-components=1 -C . && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
-    sed -i 's/MemoryContextAlloc(/MemoryContextAllocZero(/' plv8.cc && \
     make DOCKER=1 -j $(getconf _NPROCESSORS_ONLN) install && \
     rm -rf /plv8-* && \
+    find /usr/local/pgsql/ -name "plv8-*.so" | xargs strip && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plv8.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plcoffee.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/plls.control
@@ -126,20 +111,17 @@ RUN wget https://github.com/Kitware/CMake/releases/download/v3.24.2/cmake-3.24.2
       && /tmp/cmake-install.sh --skip-license --prefix=/usr/local/ \
       && rm /tmp/cmake-install.sh
 
-RUN wget https://github.com/uber/h3/archive/refs/tags/v4.0.1.tar.gz -O h3.tgz && \
-    tar xvzf h3.tgz  && \
-    cd h3-4.0.1 && \
-    mkdir build && \
-    cd build && \
+RUN wget https://github.com/uber/h3/archive/refs/tags/v4.1.0.tar.gz -O h3.tar.gz && \
+    mkdir h3-src && cd h3-src && tar xvzf ../h3.tar.gz --strip-components=1 -C . && \
+    mkdir build && cd build && \
     cmake .. -DCMAKE_BUILD_TYPE=Release && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     DESTDIR=/h3 make install && \
     cp -R /h3/usr / && \
     rm -rf build
 
-RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3-pg.tgz && \
-    tar xvzf h3-pg.tgz && \
-    cd h3-pg-4.0.1 && \
+RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.1.2.tar.gz -O h3-pg.tar.gz && \
+    mkdir h3-pg-src && cd h3-pg-src && tar xvzf ../h3-pg.tar.gz --strip-components=1 -C . && \
     export PATH="/usr/local/pgsql/bin:$PATH" && \
     make -j $(getconf _NPROCESSORS_ONLN) && \
     make -j $(getconf _NPROCESSORS_ONLN) install && \
@@ -155,9 +137,8 @@ RUN wget https://github.com/zachasme/h3-pg/archive/refs/tags/v4.0.1.tar.gz -O h3
 FROM build-deps AS unit-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz && \
-    tar xvzf 7.7.tar.gz && \
-    cd postgresql-unit-7.7 && \
+RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz -O postgresql-unit.tar.gz && \
+    mkdir postgresql-unit-src && cd postgresql-unit-src && tar xvzf ../postgresql-unit.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     # unit extension's "create extension" script relies on absolute install path to fill some reference tables.
@@ -176,8 +157,8 @@ RUN wget https://github.com/df7cb/postgresql-unit/archive/refs/tags/7.7.tar.gz &
 FROM build-deps AS vector-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \
-    cd pgvector && \
+RUN wget https://github.com/pgvector/pgvector/archive/refs/tags/v0.4.0.tar.gz -O pgvector.tar.gz && \
+    mkdir pgvector-src && cd pgvector-src && tar xvzf ../pgvector.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/vector.control
@@ -191,8 +172,9 @@ RUN git clone --branch v0.4.0 https://github.com/pgvector/pgvector.git && \
 FROM build-deps AS pgjwt-pg-build
 COPY --from=pg-build /usr/local/pgsql/ /usr/local/pgsql/
 
-RUN git clone https://github.com/michelp/pgjwt.git && \
-    cd pgjwt && \
+# 9742dab1b2f297ad3811120db7b21451bca2d3c9 made on 13/11/2021
+RUN wget https://github.com/michelp/pgjwt/archive/9742dab1b2f297ad3811120db7b21451bca2d3c9.tar.gz -O pgjwt.tar.gz && \
+    mkdir pgjwt-src && cd pgjwt-src && tar xvzf ../pgjwt.tar.gz --strip-components=1 -C . && \
     make -j $(getconf _NPROCESSORS_ONLN) install PG_CONFIG=/usr/local/pgsql/bin/pg_config && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgjwt.control
 

From 2153d2e00a917b675b454ae8d222e2946900e4d1 Mon Sep 17 00:00:00 2001
From: sharnoff <sharnoff@neon.tech>
Date: Fri, 17 Feb 2023 14:14:41 -0800
Subject: [PATCH 30/42] Run compute_ctl in a cgroup in VMs (#3577)

---
 .dockerignore                        |  1 +
 .github/workflows/build_and_test.yml | 25 ++++++--------
 Dockerfile.vm-compute-node           | 32 ++++++++++++++++++
 compute_tools/src/bin/compute_ctl.rs |  3 --
 compute_tools/src/informant.rs       | 50 ----------------------------
 compute_tools/src/lib.rs             |  1 -
 vm-cgconfig.conf                     | 12 +++++++
 7 files changed, 56 insertions(+), 68 deletions(-)
 create mode 100644 Dockerfile.vm-compute-node
 delete mode 100644 compute_tools/src/informant.rs
 create mode 100644 vm-cgconfig.conf

diff --git a/.dockerignore b/.dockerignore
index d256b21af1..a6e11805e9 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -21,3 +21,4 @@
 !workspace_hack/
 !neon_local/
 !scripts/ninstall.sh
+!vm-cgconfig.conf
diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml
index 27b7f54856..d16d221cc4 100644
--- a/.github/workflows/build_and_test.yml
+++ b/.github/workflows/build_and_test.yml
@@ -611,34 +611,31 @@ jobs:
       run:
         shell: sh -eu {0}
     env:
-      VM_INFORMANT_VERSION: 0.1.1
+      VM_BUILDER_VERSION: v0.4.6
 
     steps:
-      - name: Downloading latest vm-builder
+      - name: Checkout
+        uses: actions/checkout@v1
+        with:
+          fetch-depth: 0
+
+      - name: Downloading vm-builder
         run: |
-          curl -L https://github.com/neondatabase/neonvm/releases/latest/download/vm-builder -o vm-builder
+          curl -L https://github.com/neondatabase/neonvm/releases/download/$VM_BUILDER_VERSION/vm-builder -o vm-builder
           chmod +x vm-builder
 
       - name: Pulling compute-node image
         run: |
           docker pull 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
-      - name: Downloading VM informant version ${{ env.VM_INFORMANT_VERSION }}
+      - name: Building VM compute-node rootfs
         run: |
-          curl -fL https://github.com/neondatabase/autoscaling/releases/download/${{ env.VM_INFORMANT_VERSION }}/vm-informant -o vm-informant
-          chmod +x vm-informant
-
-      - name: Adding VM informant to compute-node image
-        run: |
-          ID=$(docker create 369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}})
-          docker cp vm-informant $ID:/bin/vm-informant
-          docker commit $ID temp-vm-compute-node
-          docker rm -f $ID
+          docker build -t temp-vm-compute-node --build-arg SRC_IMAGE=369495373322.dkr.ecr.eu-central-1.amazonaws.com/compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}} -f Dockerfile.vm-compute-node .
 
       - name: Build vm image
         run: |
           # note: as of 2023-01-12, vm-builder requires a trailing ":latest" for local images
-          ./vm-builder -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
+          ./vm-builder -use-inittab -src=temp-vm-compute-node:latest -dst=369495373322.dkr.ecr.eu-central-1.amazonaws.com/vm-compute-node-${{ matrix.version }}:${{needs.tag.outputs.build-tag}}
 
       - name: Pushing vm-compute-node image
         run: |
diff --git a/Dockerfile.vm-compute-node b/Dockerfile.vm-compute-node
new file mode 100644
index 0000000000..af3bfb3590
--- /dev/null
+++ b/Dockerfile.vm-compute-node
@@ -0,0 +1,32 @@
+# Note: this file *mostly* just builds on Dockerfile.compute-node
+
+ARG SRC_IMAGE
+ARG VM_INFORMANT_VERSION=v0.1.6
+
+# Pull VM informant and set up inittab
+FROM neondatabase/vm-informant:$VM_INFORMANT_VERSION as informant
+
+RUN set -e \
+	&& rm -f /etc/inittab \
+	&& touch /etc/inittab
+
+ADD vm-cgconfig.conf /etc/cgconfig.conf
+RUN set -e \
+	&& echo "::sysinit:cgconfigparser -l /etc/cgconfig.conf -s 1664" >> /etc/inittab \
+	&& echo "::respawn:su vm-informant -c '/usr/local/bin/vm-informant --auto-restart --cgroup=neon-postgres'" >> /etc/inittab
+
+# Combine, starting from non-VM compute node image.
+FROM $SRC_IMAGE as base
+
+# Temporarily set user back to root so we can run apt update and adduser
+USER root
+RUN apt update && \
+	apt install --no-install-recommends -y \
+        cgroup-tools
+RUN adduser vm-informant --disabled-password --no-create-home
+USER postgres
+
+COPY --from=informant /etc/inittab /etc/inittab
+COPY --from=informant /usr/bin/vm-informant /usr/local/bin/vm-informant
+
+ENTRYPOINT ["/usr/sbin/cgexec", "-g", "*:neon-postgres", "/usr/local/bin/compute_ctl"]
diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs
index 2c42662020..49cf1cd347 100644
--- a/compute_tools/src/bin/compute_ctl.rs
+++ b/compute_tools/src/bin/compute_ctl.rs
@@ -44,7 +44,6 @@ use tracing::{error, info};
 
 use compute_tools::compute::{ComputeMetrics, ComputeNode, ComputeState, ComputeStatus};
 use compute_tools::http::api::launch_http_server;
-use compute_tools::informant::spawn_vm_informant_if_present;
 use compute_tools::logger::*;
 use compute_tools::monitor::launch_monitor;
 use compute_tools::params::*;
@@ -141,8 +140,6 @@ fn main() -> Result<()> {
     // requests, while configuration is still in progress.
     let _http_handle = launch_http_server(&compute).expect("cannot launch http endpoint thread");
     let _monitor_handle = launch_monitor(&compute).expect("cannot launch compute monitor thread");
-    // Also spawn the thread responsible for handling the VM informant -- if it's present
-    let _vm_informant_handle = spawn_vm_informant_if_present().expect("cannot launch VM informant");
 
     // Start Postgres
     let mut delay_exit = false;
diff --git a/compute_tools/src/informant.rs b/compute_tools/src/informant.rs
deleted file mode 100644
index 8a6e3ab43a..0000000000
--- a/compute_tools/src/informant.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-use std::path::Path;
-use std::process;
-use std::thread;
-use std::time::Duration;
-use tracing::{info, warn};
-
-use anyhow::{Context, Result};
-
-const VM_INFORMANT_PATH: &str = "/bin/vm-informant";
-const RESTART_INFORMANT_AFTER_MILLIS: u64 = 5000;
-
-/// Launch a thread to start the VM informant if it's present (and restart, on failure)
-pub fn spawn_vm_informant_if_present() -> Result<Option<thread::JoinHandle<()>>> {
-    let exists = Path::new(VM_INFORMANT_PATH)
-        .try_exists()
-        .context("could not check if path exists")?;
-
-    if !exists {
-        return Ok(None);
-    }
-
-    Ok(Some(
-        thread::Builder::new()
-            .name("run-vm-informant".into())
-            .spawn(move || run_informant())?,
-    ))
-}
-
-fn run_informant() -> ! {
-    let restart_wait = Duration::from_millis(RESTART_INFORMANT_AFTER_MILLIS);
-
-    info!("starting VM informant");
-
-    loop {
-        let mut cmd = process::Command::new(VM_INFORMANT_PATH);
-        // Block on subprocess:
-        let result = cmd.status();
-
-        match result {
-            Err(e) => warn!("failed to run VM informant at {VM_INFORMANT_PATH:?}: {e}"),
-            Ok(status) if !status.success() => {
-                warn!("{VM_INFORMANT_PATH} exited with code {status:?}, retrying")
-            }
-            Ok(_) => info!("{VM_INFORMANT_PATH} ended gracefully (unexpectedly). Retrying"),
-        }
-
-        // Wait before retrying
-        thread::sleep(restart_wait);
-    }
-}
diff --git a/compute_tools/src/lib.rs b/compute_tools/src/lib.rs
index a71b92f91a..aee6b53e6a 100644
--- a/compute_tools/src/lib.rs
+++ b/compute_tools/src/lib.rs
@@ -8,7 +8,6 @@ pub mod http;
 #[macro_use]
 pub mod logger;
 pub mod compute;
-pub mod informant;
 pub mod monitor;
 pub mod params;
 pub mod pg_helpers;
diff --git a/vm-cgconfig.conf b/vm-cgconfig.conf
new file mode 100644
index 0000000000..a2e201708e
--- /dev/null
+++ b/vm-cgconfig.conf
@@ -0,0 +1,12 @@
+# Configuration for cgroups in VM compute nodes
+group neon-postgres {
+    perm {
+        admin {
+            uid = vm-informant;
+        }
+        task {
+            gid = users;
+        }
+    }
+    memory {}
+}

From af210c8b42da3b57f4750d41cba0e7b4047aa92a Mon Sep 17 00:00:00 2001
From: Shany Pozin <shany@neon.tech>
Date: Mon, 20 Feb 2023 13:23:13 +0200
Subject: [PATCH 31/42] Allow running do_gc in non testing env (#3639)

## Describe your changes
Since the current default gc period is set to 1 hour, whenever there is
an immediate need to reduce PITR and run gc, the user has to wait 1 hour
for PITR change to take effect
By enabling this API the user can configure PITR and immediately call
the do_gc API to trigger gc
## Issue ticket number and link
#3590
## Checklist before requesting a review
- [X] I have performed a self-review of my code.
- [ ] If it is a core feature, I have added thorough tests.
- [ ] Do we need to implement analytics? if so did you add the relevant
metrics to the dashboard?
- [ ] If this PR requires public announcement, mark it with
/release-notes label and add several sentences in this section.
---
 pageserver/src/http/routes.rs | 7 +++----
 pageserver/src/tenant/mgr.rs  | 2 --
 2 files changed, 3 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index 71273159b7..d2d9f24efb 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -14,7 +14,7 @@ use utils::http::request::{get_request_param, must_get_query_param, parse_query_
 
 use super::models::{
     StatusResponse, TenantConfigRequest, TenantCreateRequest, TenantCreateResponse, TenantInfo,
-    TimelineCreateRequest, TimelineInfo,
+    TimelineCreateRequest, TimelineGcRequest, TimelineInfo,
 };
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::pgdatadir_mapping::LsnForTimestamp;
@@ -40,7 +40,7 @@ use utils::{
 
 // Imports only used for testing APIs
 #[cfg(feature = "testing")]
-use super::models::{ConfigureFailpointsRequest, TimelineGcRequest};
+use super::models::ConfigureFailpointsRequest;
 
 struct State {
     conf: &'static PageServerConf,
@@ -925,7 +925,6 @@ async fn failpoints_handler(mut request: Request<Body>) -> Result<Response<Body>
 }
 
 // Run GC immediately on given timeline.
-#[cfg(feature = "testing")]
 async fn timeline_gc_handler(mut request: Request<Body>) -> Result<Response<Body>, ApiError> {
     let tenant_id: TenantId = parse_request_param(&request, "tenant_id")?;
     let timeline_id: TimelineId = parse_request_param(&request, "timeline_id")?;
@@ -1124,7 +1123,7 @@ pub fn make_router(
         )
         .put(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/do_gc",
-            testing_api!("run timeline GC", timeline_gc_handler),
+            timeline_gc_handler,
         )
         .put(
             "/v1/tenant/:tenant_id/timeline/:timeline_id/compact",
diff --git a/pageserver/src/tenant/mgr.rs b/pageserver/src/tenant/mgr.rs
index a74dfdea04..a44cb02b4d 100644
--- a/pageserver/src/tenant/mgr.rs
+++ b/pageserver/src/tenant/mgr.rs
@@ -540,13 +540,11 @@ where
     }
 }
 
-#[cfg(feature = "testing")]
 use {
     crate::repository::GcResult, pageserver_api::models::TimelineGcRequest,
     utils::http::error::ApiError,
 };
 
-#[cfg(feature = "testing")]
 pub async fn immediate_gc(
     tenant_id: TenantId,
     timeline_id: TimelineId,

From 8f557477c612a181a22075946e89e180879d1d35 Mon Sep 17 00:00:00 2001
From: Arthur Petukhovsky <petuhovskiy@yandex.ru>
Date: Mon, 20 Feb 2023 17:51:27 +0300
Subject: [PATCH 32/42] Add new safekeeper to ap-southeast-1 prod (#3645)

---
 .github/ansible/prod.ap-southeast-1.hosts.yaml | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml
index 7c6d1db6d7..71fced23c2 100644
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -36,3 +36,5 @@ storage:
           ansible_host:  i-0e338adda8eb2d19f
         safekeeper-2.ap-southeast-1.aws.neon.tech:
           ansible_host:  i-04fb63634e4679eb9
+        safekeeper-3.ap-southeast-1.aws.neon.tech:
+          ansible_host:  i-05481f3bc88cfc2d4

From d5d690c0442d812d61a9d65dbf41411b987c3674 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Mon, 20 Feb 2023 16:05:21 +0100
Subject: [PATCH 33/42] Use fqdn for staging console management API (#3642)

`console-staging.local` is legacy manual CNAME to
`neon-internal-api.aws.neon.build` in r53
We could use `neon-internal-api.aws.neon.build` name directly
---
 .github/ansible/staging.eu-west-1.hosts.yaml                  | 4 ++--
 .github/ansible/staging.us-east-2.hosts.yaml                  | 4 ++--
 .github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml  | 4 ++--
 .github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml   | 2 +-
 .../dev-us-east-2-beta.neon-proxy-scram-legacy.yaml           | 4 ++--
 .github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml  | 4 ++--
 6 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/.github/ansible/staging.eu-west-1.hosts.yaml b/.github/ansible/staging.eu-west-1.hosts.yaml
index d19026b3f7..b537795704 100644
--- a/.github/ansible/staging.eu-west-1.hosts.yaml
+++ b/.github/ansible/staging.eu-west-1.hosts.yaml
@@ -2,11 +2,11 @@ storage:
   vars:
     bucket_name: neon-dev-storage-eu-west-1
     bucket_region: eu-west-1
-    console_mgmt_base_url: http://console-staging.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.build
     broker_endpoint: http://storage-broker-lb.zeta.eu-west-1.internal.aws.neon.build:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
       metric_collection_interval: 10min
       tenant_config:
         eviction_policy:
diff --git a/.github/ansible/staging.us-east-2.hosts.yaml b/.github/ansible/staging.us-east-2.hosts.yaml
index a8b9d41a65..cd8f832af0 100644
--- a/.github/ansible/staging.us-east-2.hosts.yaml
+++ b/.github/ansible/staging.us-east-2.hosts.yaml
@@ -2,11 +2,11 @@ storage:
   vars:
     bucket_name: neon-staging-storage-us-east-2
     bucket_region: us-east-2
-    console_mgmt_base_url: http://console-staging.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.build
     broker_endpoint: http://storage-broker-lb.beta.us-east-2.internal.aws.neon.build:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-staging.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events
       metric_collection_interval: 10min
       tenant_config:
         eviction_policy:
diff --git a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
index c49b8d2009..ecf57554d9 100644
--- a/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-eu-west-1-zeta.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:
 
 settings:
   authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
   domain: "*.eu-west-1.aws.neon.build"
   sentryEnvironment: "staging"
   wssPort: 8443
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
   metricCollectionInterval: "1min"
 
 # -- Additional labels for neon-proxy pods
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
index 157ae66ed1..91ddd07eae 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-link.yaml
@@ -10,7 +10,7 @@ settings:
   uri: "https://console.stage.neon.tech/psql_session/"
   domain: "pg.neon.build"
   sentryEnvironment: "staging"
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
   metricCollectionInterval: "1min"
 
 # -- Additional labels for neon-proxy-link pods
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
index 99b67d75c1..6ec18ff388 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram-legacy.yaml
@@ -6,11 +6,11 @@ image:
 
 settings:
   authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
   domain: "*.cloud.stage.neon.tech"
   sentryEnvironment: "staging"
   wssPort: 8443
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
   metricCollectionInterval: "1min"
 
 # -- Additional labels for neon-proxy pods
diff --git a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
index 764bb25b64..9b250fce6e 100644
--- a/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
+++ b/.github/helm-values/dev-us-east-2-beta.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:
 
 settings:
   authBackend: "console"
-  authEndpoint: "http://console-staging.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.build/management/api/v2"
   domain: "*.us-east-2.aws.neon.build"
   sentryEnvironment: "staging"
   wssPort: 8443
-  metricCollectionEndpoint: "http://console-staging.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.build/billing/api/v1/usage_events"
   metricCollectionInterval: "1min"
 
 # -- Additional labels for neon-proxy pods

From e363911c85c85faa515c8905b08e60c1c431c5e8 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 20 Feb 2023 16:18:13 +0100
Subject: [PATCH 34/42] timeline: propagate span to download_remote_layer
 (#3644)

fixes #3643
refs #3604
---
 pageserver/src/tenant/timeline.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 683c2cd2d3..f2b0a98509 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -3758,7 +3758,7 @@ impl Timeline {
                 drop(permit);
 
                 Ok(())
-            },
+            }.in_current_span(),
         );
 
         receiver.await.context("download task cancelled")?

From ee1eda99212636285f92745140c625c152f6b04a Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 20 Feb 2023 16:29:39 +0100
Subject: [PATCH 35/42] eviction: remove
 EvictionStats::not_considered_due_to_clock_skew

Rationale: see the block comment added in this patch.

fixes #3641
---
 .../src/tenant/timeline/eviction_task.rs      | 22 ++++++++++++++-----
 1 file changed, 16 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index e3e7ce4c9d..fc84517cc2 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -100,7 +100,6 @@ impl Timeline {
         #[allow(dead_code)]
         #[derive(Debug, Default)]
         struct EvictionStats {
-            not_considered_due_to_clock_skew: usize,
             candidates: usize,
             evicted: usize,
             errors: usize,
@@ -129,9 +128,21 @@ impl Timeline {
                 let no_activity_for = match now.duration_since(last_activity_ts) {
                     Ok(d) => d,
                     Err(_e) => {
-                        // NB: don't log the error. If there are many layers and the system clock
-                        // is skewed, we'd be flooding the log.
-                        stats.not_considered_due_to_clock_skew += 1;
+                        // We reach here if `now` < `last_activity_ts`, which can legitimately
+                        // happen if there is an access between us getting `now`, and us getting
+                        // the access stats from the layer.
+                        //
+                        // The other reason why it can happen is system clock skew because
+                        // SystemTime::now() is not monotonic, so, even if there is no access
+                        // to the layer after we get `now` at the beginning of this function,
+                        // it could be that `now`  < `last_activity_ts`.
+                        //
+                        // To distinguish the cases, we would need to record `Instant`s in the
+                        // access stats (i.e., monotonic timestamps), but then, the timestamps
+                        // values in the access stats would need to be `Instant`'s, and hence
+                        // they would be meaningless outside of the pageserver process.
+                        // At the time of writing, the trade-off is that access stats are more
+                        // valuable than detecting clock skew.
                         continue;
                     }
                 };
@@ -188,8 +199,7 @@ impl Timeline {
                 }
             }
         }
-        if stats.not_considered_due_to_clock_skew > 0 || stats.errors > 0 || stats.not_evictable > 0
-        {
+        if stats.errors > 0 || stats.not_evictable > 0 {
             warn!(stats=?stats, "eviction iteration complete");
         } else {
             info!(stats=?stats, "eviction iteration complete");

From 485b2696749a93d67ca58a3d54b9b659766903d1 Mon Sep 17 00:00:00 2001
From: Christian Schwarz <christian@neon.tech>
Date: Mon, 20 Feb 2023 16:35:23 +0100
Subject: [PATCH 36/42] eviction: tone down logs to debug!() level if there
 were no evictions

fixes #3647
---
 pageserver/src/tenant/timeline/eviction_task.rs | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index fc84517cc2..fe7e7a1654 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -199,7 +199,9 @@ impl Timeline {
                 }
             }
         }
-        if stats.errors > 0 || stats.not_evictable > 0 {
+        if stats.candidates == stats.not_evictable {
+            debug!(stats=?stats, "eviction iteration complete");
+        } else if stats.errors > 0 || stats.not_evictable > 0 {
             warn!(stats=?stats, "eviction iteration complete");
         } else {
             info!(stats=?stats, "eviction iteration complete");

From e3d75879c0af3e820c6bd9a74e19a9411d8afb52 Mon Sep 17 00:00:00 2001
From: Sergey Melnikov <sergey@neon.tech>
Date: Mon, 20 Feb 2023 18:11:06 +0100
Subject: [PATCH 37/42] Use fqdn to access console management API on production
 (#3651)

console-release.local is legacy manual CNAME to
neon-internal-api.aws.neon.tech in r53
We could use neon-internal-api.aws.neon.tech name directly

This already was deployed to staging in
https://github.com/neondatabase/neon/pull/3642
---
 .github/ansible/prod.ap-southeast-1.hosts.yaml                | 4 ++--
 .github/ansible/prod.eu-central-1.hosts.yaml                  | 4 ++--
 .github/ansible/prod.us-east-2.hosts.yaml                     | 4 ++--
 .github/ansible/prod.us-west-2.hosts.yaml                     | 4 ++--
 .../prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml         | 4 ++--
 .../helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml | 4 ++--
 .../helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml    | 4 ++--
 .../prod-us-west-2-eta.neon-proxy-scram-legacy.yaml           | 4 ++--
 .github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml  | 4 ++--
 9 files changed, 18 insertions(+), 18 deletions(-)

diff --git a/.github/ansible/prod.ap-southeast-1.hosts.yaml b/.github/ansible/prod.ap-southeast-1.hosts.yaml
index 71fced23c2..13b44f4052 100644
--- a/.github/ansible/prod.ap-southeast-1.hosts.yaml
+++ b/.github/ansible/prod.ap-southeast-1.hosts.yaml
@@ -2,11 +2,11 @@ storage:
   vars:
     bucket_name: neon-prod-storage-ap-southeast-1
     bucket_region: ap-southeast-1
-    console_mgmt_base_url: http://console-release.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
     broker_endpoint: http://storage-broker-lb.epsilon.ap-southeast-1.internal.aws.neon.tech:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
       metric_collection_interval: 10min
       remote_storage:
         bucket_name: "{{ bucket_name }}"
diff --git a/.github/ansible/prod.eu-central-1.hosts.yaml b/.github/ansible/prod.eu-central-1.hosts.yaml
index 83d4f6f37d..2236dcbc06 100644
--- a/.github/ansible/prod.eu-central-1.hosts.yaml
+++ b/.github/ansible/prod.eu-central-1.hosts.yaml
@@ -2,11 +2,11 @@ storage:
   vars:
     bucket_name: neon-prod-storage-eu-central-1
     bucket_region: eu-central-1
-    console_mgmt_base_url: http://console-release.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
     broker_endpoint: http://storage-broker-lb.gamma.eu-central-1.internal.aws.neon.tech:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
       metric_collection_interval: 10min
       remote_storage:
         bucket_name: "{{ bucket_name }}"
diff --git a/.github/ansible/prod.us-east-2.hosts.yaml b/.github/ansible/prod.us-east-2.hosts.yaml
index 7f7601cd39..56bece3e77 100644
--- a/.github/ansible/prod.us-east-2.hosts.yaml
+++ b/.github/ansible/prod.us-east-2.hosts.yaml
@@ -2,11 +2,11 @@ storage:
   vars:
     bucket_name: neon-prod-storage-us-east-2
     bucket_region: us-east-2
-    console_mgmt_base_url: http://console-release.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
     broker_endpoint: http://storage-broker-lb.delta.us-east-2.internal.aws.neon.tech:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
       metric_collection_interval: 10min
       remote_storage:
         bucket_name: "{{ bucket_name }}"
diff --git a/.github/ansible/prod.us-west-2.hosts.yaml b/.github/ansible/prod.us-west-2.hosts.yaml
index 9cad79b986..f03e2d9435 100644
--- a/.github/ansible/prod.us-west-2.hosts.yaml
+++ b/.github/ansible/prod.us-west-2.hosts.yaml
@@ -2,11 +2,11 @@ storage:
   vars:
     bucket_name: neon-prod-storage-us-west-2
     bucket_region: us-west-2
-    console_mgmt_base_url: http://console-release.local
+    console_mgmt_base_url: http://neon-internal-api.aws.neon.tech
     broker_endpoint: http://storage-broker-lb.eta.us-west-2.internal.aws.neon.tech:50051
     pageserver_config_stub:
       pg_distrib_dir: /usr/local
-      metric_collection_endpoint: http://console-release.local/billing/api/v1/usage_events
+      metric_collection_endpoint: http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events
       metric_collection_interval: 10min
       remote_storage:
         bucket_name: "{{ bucket_name }}"
diff --git a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
index a640d468b3..389da35463 100644
--- a/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-ap-southeast-1-epsilon.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:
 
 settings:
   authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
   domain: "*.ap-southeast-1.aws.neon.tech"
   sentryEnvironment: "production"
   wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
   metricCollectionInterval: "10min"
 
 # -- Additional labels for neon-proxy pods
diff --git a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
index c9430877de..7e16ac2d3d 100644
--- a/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-eu-central-1-gamma.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:
 
 settings:
   authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
   domain: "*.eu-central-1.aws.neon.tech"
   sentryEnvironment: "production"
   wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
   metricCollectionInterval: "10min"
 
 # -- Additional labels for neon-proxy pods
diff --git a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
index 677df6a5be..05e41e7a97 100644
--- a/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-east-2-delta.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:
 
 settings:
   authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
   domain: "*.us-east-2.aws.neon.tech"
   sentryEnvironment: "production"
   wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
   metricCollectionInterval: "10min"
 
 # -- Additional labels for neon-proxy pods
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
index 3a5cde4b01..e67a3e4461 100644
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram-legacy.yaml
@@ -6,11 +6,11 @@ image:
 
 settings:
   authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
   domain: "*.cloud.neon.tech"
   sentryEnvironment: "production"
   wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
   metricCollectionInterval: "10min"
 
 # -- Additional labels for neon-proxy pods
diff --git a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
index 919a0d503c..5dc23b282e 100644
--- a/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
+++ b/.github/helm-values/prod-us-west-2-eta.neon-proxy-scram.yaml
@@ -6,11 +6,11 @@ image:
 
 settings:
   authBackend: "console"
-  authEndpoint: "http://console-release.local/management/api/v2"
+  authEndpoint: "http://neon-internal-api.aws.neon.tech/management/api/v2"
   domain: "*.us-west-2.aws.neon.tech"
   sentryEnvironment: "production"
   wssPort: 8443
-  metricCollectionEndpoint: "http://console-release.local/billing/api/v1/usage_events"
+  metricCollectionEndpoint: "http://neon-internal-api.aws.neon.tech/billing/api/v1/usage_events"
   metricCollectionInterval: "10min"
 
 # -- Additional labels for neon-proxy pods

From bc7d3c647601d56332121403f1138d16317d2d98 Mon Sep 17 00:00:00 2001
From: Keanu Ashwell <supra4keanu@hotmail.com>
Date: Tue, 21 Feb 2023 05:51:54 +1000
Subject: [PATCH 38/42] docs: add dependency requirements for arch based
 systems (#3588)

This pull request adds information on building neon on Arch based system
such as Artix, Manjaro, Antergos, etc.
---
 README.md | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/README.md b/README.md
index f8bc1b7736..819693f1f3 100644
--- a/README.md
+++ b/README.md
@@ -34,6 +34,11 @@ dnf install flex bison readline-devel zlib-devel openssl-devel \
   libseccomp-devel perl clang cmake postgresql postgresql-contrib protobuf-compiler \
   protobuf-devel
 ```
+* On Arch based systems, these packages are needed:
+```bash
+pacman -S base-devel readline zlib libseccomp openssl clang \
+postgresql-libs cmake postgresql protobuf
+```
 
 2. [Install Rust](https://www.rust-lang.org/tools/install)
 ```

From d7d3f451f0fe6a037c8c4add276da8dcb2ad11f8 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 21 Feb 2023 10:03:55 +0200
Subject: [PATCH 39/42] Use tracing panic hook in all binaries (#3634)

Enables tracing panic hook in addition to pageserver introduced in
#3475:

- proxy
- safekeeper
- storage_broker

For proxy, a drop guard which resets the original std panic hook was
added on the first commit. Other binaries don't need it so they never
reset anything by `disarm`ing the drop guard.

The aim of the change is to make sure all panics a) have span
information b) are logged similar to other messages, not interleaved
with other messages as happens right now. Interleaving happens right now
because std prints panics to stderr, and other logging happens in
stdout. If this was handled gracefully by some utility, the log message
splitter would treat panics as belonging to the previous message because
it expects a message to start with a timestamp.

Cc: #3468
---
 libs/utils/src/logging.rs                | 112 +++++++++++++++++++++++
 pageserver/src/bin/pageserver.rs         |  50 +---------
 proxy/src/main.rs                        |   1 +
 safekeeper/src/bin/safekeeper.rs         |   5 +
 storage_broker/src/bin/storage_broker.rs |  10 +-
 5 files changed, 128 insertions(+), 50 deletions(-)

diff --git a/libs/utils/src/logging.rs b/libs/utils/src/logging.rs
index 02684d3d16..f770622a60 100644
--- a/libs/utils/src/logging.rs
+++ b/libs/utils/src/logging.rs
@@ -45,3 +45,115 @@ pub fn init(log_format: LogFormat) -> anyhow::Result<()> {
 
     Ok(())
 }
+
+/// Disable the default rust panic hook by using `set_hook`.
+///
+/// For neon binaries, the assumption is that tracing is configured before with [`init`], after
+/// that sentry is configured (if needed). sentry will install it's own on top of this, always
+/// processing the panic before we log it.
+///
+/// When the return value is dropped, the hook is reverted to std default hook (prints to stderr).
+/// If the assumptions about the initialization order are not held, use
+/// [`TracingPanicHookGuard::disarm`] but keep in mind, if tracing is stopped, then panics will be
+/// lost.
+#[must_use]
+pub fn replace_panic_hook_with_tracing_panic_hook() -> TracingPanicHookGuard {
+    std::panic::set_hook(Box::new(tracing_panic_hook));
+    TracingPanicHookGuard::new()
+}
+
+/// Drop guard which restores the std panic hook on drop.
+///
+/// Tracing should not be used when it's not configured, but we cannot really latch on to any
+/// imaginary lifetime of tracing.
+pub struct TracingPanicHookGuard {
+    act: bool,
+}
+
+impl TracingPanicHookGuard {
+    fn new() -> Self {
+        TracingPanicHookGuard { act: true }
+    }
+
+    /// Make this hook guard not do anything when dropped.
+    pub fn forget(&mut self) {
+        self.act = false;
+    }
+}
+
+impl Drop for TracingPanicHookGuard {
+    fn drop(&mut self) {
+        if self.act {
+            let _ = std::panic::take_hook();
+        }
+    }
+}
+
+/// Named symbol for our panic hook, which logs the panic.
+fn tracing_panic_hook(info: &std::panic::PanicInfo) {
+    // following rust 1.66.1 std implementation:
+    // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
+    let location = info.location();
+
+    let msg = match info.payload().downcast_ref::<&'static str>() {
+        Some(s) => *s,
+        None => match info.payload().downcast_ref::<String>() {
+            Some(s) => &s[..],
+            None => "Box<dyn Any>",
+        },
+    };
+
+    let thread = std::thread::current();
+    let thread = thread.name().unwrap_or("<unnamed>");
+    let backtrace = std::backtrace::Backtrace::capture();
+
+    let _entered = if let Some(location) = location {
+        tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
+    } else {
+        // very unlikely to hit here, but the guarantees of std could change
+        tracing::error_span!("panic", %thread)
+    }
+    .entered();
+
+    if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
+        // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
+        // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
+        // string, maybe even to a TLS one but tracing already does that.
+        tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
+    } else {
+        tracing::error!("{msg}");
+    }
+
+    // ensure that we log something on the panic if this hook is left after tracing has been
+    // unconfigured. worst case when teardown is racing the panic is to log the panic twice.
+    tracing::dispatcher::get_default(|d| {
+        if let Some(_none) = d.downcast_ref::<tracing::subscriber::NoSubscriber>() {
+            let location = location.map(PrettyLocation);
+            log_panic_to_stderr(thread, msg, location, &backtrace);
+        }
+    });
+}
+
+#[cold]
+fn log_panic_to_stderr(
+    thread: &str,
+    msg: &str,
+    location: Option<PrettyLocation<'_, '_>>,
+    backtrace: &std::backtrace::Backtrace,
+) {
+    eprintln!("panic while tracing is unconfigured: thread '{thread}' panicked at '{msg}', {location:?}\nStack backtrace:\n{backtrace}");
+}
+
+struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
+
+impl std::fmt::Display for PrettyLocation<'_, '_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
+    }
+}
+
+impl std::fmt::Debug for PrettyLocation<'_, '_> {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        <Self as std::fmt::Display>::fmt(self, f)
+    }
+}
diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs
index c499fd8d74..01a2c85d74 100644
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -91,9 +91,9 @@ fn main() -> anyhow::Result<()> {
     // Initialize logging, which must be initialized before the custom panic hook is installed.
     logging::init(conf.log_format)?;
 
-    // disable the default rust panic hook by using `set_hook`. sentry will install it's own on top
-    // of this, always processing the panic before we log it.
-    std::panic::set_hook(Box::new(tracing_panic_hook));
+    // mind the order required here: 1. logging, 2. panic_hook, 3. sentry.
+    // disarming this hook on pageserver, because we never tear down tracing.
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
 
     // initialize sentry if SENTRY_DSN is provided
     let _sentry_guard = init_sentry(
@@ -499,50 +499,6 @@ fn cli() -> Command {
         )
 }
 
-/// Named symbol for our panic hook, which logs the panic.
-fn tracing_panic_hook(info: &std::panic::PanicInfo) {
-    // following rust 1.66.1 std implementation:
-    // https://github.com/rust-lang/rust/blob/90743e7298aca107ddaa0c202a4d3604e29bfeb6/library/std/src/panicking.rs#L235-L288
-    let location = info.location();
-
-    let msg = match info.payload().downcast_ref::<&'static str>() {
-        Some(s) => *s,
-        None => match info.payload().downcast_ref::<String>() {
-            Some(s) => &s[..],
-            None => "Box<dyn Any>",
-        },
-    };
-
-    let thread = std::thread::current();
-    let thread = thread.name().unwrap_or("<unnamed>");
-    let backtrace = std::backtrace::Backtrace::capture();
-
-    struct PrettyLocation<'a, 'b>(&'a std::panic::Location<'b>);
-
-    impl std::fmt::Display for PrettyLocation<'_, '_> {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            write!(f, "{}:{}:{}", self.0.file(), self.0.line(), self.0.column())
-        }
-    }
-
-    let _entered = if let Some(location) = location {
-        tracing::error_span!("panic", %thread, location = %PrettyLocation(location))
-    } else {
-        // very unlikely to hit here, but the guarantees of std could change
-        tracing::error_span!("panic", %thread)
-    }
-    .entered();
-
-    if backtrace.status() == std::backtrace::BacktraceStatus::Captured {
-        // this has an annoying extra '\n' in the end which anyhow doesn't do, but we cannot really
-        // get rid of it as we cannot get in between of std::fmt::Formatter<'_>; we could format to
-        // string, maybe even to a TLS one but tracing already does that.
-        tracing::error!("{msg}\n\nStack backtrace:\n{backtrace}");
-    } else {
-        tracing::error!("{msg}");
-    }
-}
-
 #[test]
 fn verify_cli() {
     cli().debug_assert();
diff --git a/proxy/src/main.rs b/proxy/src/main.rs
index c319cb9cfc..85478da3bc 100644
--- a/proxy/src/main.rs
+++ b/proxy/src/main.rs
@@ -43,6 +43,7 @@ async fn flatten_err(
 #[tokio::main]
 async fn main() -> anyhow::Result<()> {
     let _logging_guard = logging::init().await?;
+    let _panic_hook_guard = utils::logging::replace_panic_hook_with_tracing_panic_hook();
     let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
 
     info!("Version: {GIT_VERSION}");
diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs
index 1a068412c8..683050e9cd 100644
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -126,7 +126,12 @@ fn main() -> anyhow::Result<()> {
         return Ok(());
     }
 
+    // important to keep the order of:
+    // 1. init logging
+    // 2. tracing panic hook
+    // 3. sentry
     logging::init(LogFormat::from_config(&args.log_format)?)?;
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
     info!("version: {GIT_VERSION}");
 
     let args_workdir = &args.datadir;
diff --git a/storage_broker/src/bin/storage_broker.rs b/storage_broker/src/bin/storage_broker.rs
index c73206b7dc..1a0d261184 100644
--- a/storage_broker/src/bin/storage_broker.rs
+++ b/storage_broker/src/bin/storage_broker.rs
@@ -424,12 +424,16 @@ async fn http1_handler(
 
 #[tokio::main]
 async fn main() -> Result<(), Box<dyn std::error::Error>> {
-    // initialize sentry if SENTRY_DSN is provided
-    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
-
     let args = Args::parse();
 
+    // important to keep the order of:
+    // 1. init logging
+    // 2. tracing panic hook
+    // 3. sentry
     logging::init(LogFormat::from_config(&args.log_format)?)?;
+    logging::replace_panic_hook_with_tracing_panic_hook().forget();
+    // initialize sentry if SENTRY_DSN is provided
+    let _sentry_guard = init_sentry(Some(GIT_VERSION.into()), &[]);
     info!("version: {GIT_VERSION}");
     ::metrics::set_build_info_metric(GIT_VERSION);
 

From 5c5b03ce08f930480592aeb0a473a82279590efb Mon Sep 17 00:00:00 2001
From: Vadim Kharitonov <vadim@neon.tech>
Date: Mon, 20 Feb 2023 17:22:52 +0100
Subject: [PATCH 40/42] Compile xml2 extension

---
 Dockerfile.compute-node | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/Dockerfile.compute-node b/Dockerfile.compute-node
index 95d6237ecf..1d6c2f354f 100644
--- a/Dockerfile.compute-node
+++ b/Dockerfile.compute-node
@@ -11,7 +11,7 @@ FROM debian:bullseye-slim AS build-deps
 RUN apt update &&  \
     apt install -y git autoconf automake libtool build-essential bison flex libreadline-dev \
     zlib1g-dev libxml2-dev libcurl4-openssl-dev libossp-uuid-dev wget pkg-config libssl-dev \
-    libicu-dev
+    libicu-dev libxslt1-dev
 
 #########################################################################################
 #
@@ -23,7 +23,8 @@ FROM build-deps AS pg-build
 ARG PG_VERSION
 COPY vendor/postgres-${PG_VERSION} postgres
 RUN cd postgres && \
-    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu && \
+    ./configure CFLAGS='-O2 -g3' --enable-debug --with-openssl --with-uuid=ossp --with-icu \
+    --with-libxml --with-libxslt && \
     make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s install && \
     make MAKELEVEL=0 -j $(getconf _NPROCESSORS_ONLN) -s -C contrib/ install && \
     # Install headers
@@ -34,7 +35,8 @@ RUN cd postgres && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgrowlocks.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/intagg.control && \
     echo 'trusted = true' >> /usr/local/pgsql/share/extension/pgstattuple.control && \
-    echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/earthdistance.control && \
+    echo 'trusted = true' >> /usr/local/pgsql/share/extension/xml2.control
 
 #########################################################################################
 #
@@ -255,6 +257,7 @@ COPY --from=compute-tools --chown=postgres /home/nonroot/target/release-line-deb
 # libicu67, locales for collations (including ICU)
 # libossp-uuid16 for extension ossp-uuid
 # libgeos, libgdal, libsfcgal1, libproj and libprotobuf-c1 for PostGIS
+# libxml2, libxslt1.1 for xml2
 RUN apt update &&  \
     apt install --no-install-recommends -y \
         locales \
@@ -266,6 +269,8 @@ RUN apt update &&  \
         libproj19 \
         libprotobuf-c1 \
         libsfcgal1 \
+        libxml2 \
+        libxslt1.1 \
         gdb && \
     rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* && \
     localedef -i en_US -c -f UTF-8 -A /usr/share/locale/locale.alias en_US.UTF-8

From 7de373210df708d874d36bf335c669b12f685da9 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 21 Feb 2023 13:02:19 +0200
Subject: [PATCH 41/42] Warn when background tasks exceed their configured
 period (#3654)

Fixes #3648.
---
 pageserver/src/tenant/tasks.rs                | 64 +++++++++++++------
 .../src/tenant/timeline/eviction_task.rs      |  9 +--
 test_runner/fixtures/neon_fixtures.py         |  1 +
 3 files changed, 47 insertions(+), 27 deletions(-)

diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index b126545ee4..db269a1745 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -3,7 +3,7 @@
 
 use std::ops::ControlFlow;
 use std::sync::Arc;
-use std::time::Duration;
+use std::time::{Duration, Instant};
 
 use crate::context::{DownloadBehavior, RequestContext};
 use crate::metrics::TENANT_TASK_EVENTS;
@@ -60,26 +60,32 @@ async fn compaction_loop(tenant_id: TenantId) {
             let tenant = tokio::select! {
                 _ = task_mgr::shutdown_watcher() => {
                     info!("received cancellation request");
-                return;
+                    return;
                 },
                 tenant_wait_result = wait_for_active_tenant(tenant_id, wait_duration) => match tenant_wait_result {
                     ControlFlow::Break(()) => return,
                     ControlFlow::Continue(tenant) => tenant,
                 },
-        };
+            };
 
-            let mut sleep_duration = tenant.get_compaction_period();
-            if sleep_duration == Duration::ZERO {
+            let started_at = Instant::now();
+
+            let period = tenant.get_compaction_period();
+            let sleep_duration = if period == Duration::ZERO {
                 info!("automatic compaction is disabled");
                 // check again in 10 seconds, in case it's been enabled again.
-                sleep_duration = Duration::from_secs(10);
+                Duration::from_secs(10)
             } else {
                 // Run compaction
                 if let Err(e) = tenant.compaction_iteration(&ctx).await {
-                    sleep_duration = wait_duration;
-                    error!("Compaction failed, retrying in {:?}: {e:?}", sleep_duration);
+                    error!("Compaction failed, retrying in {:?}: {e:?}", wait_duration);
+                    wait_duration
+                } else {
+                    period
                 }
-            }
+            };
+
+            warn_when_period_overrun(started_at.elapsed(), period, "compaction");
 
             // Sleep
             tokio::select! {
@@ -122,23 +128,26 @@ async fn gc_loop(tenant_id: TenantId) {
                 },
             };
 
-            let gc_period = tenant.get_gc_period();
+            let started_at = Instant::now();
+
+            let period = tenant.get_gc_period();
             let gc_horizon = tenant.get_gc_horizon();
-            let mut sleep_duration = gc_period;
-            if sleep_duration == Duration::ZERO {
+            let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
                 info!("automatic GC is disabled");
                 // check again in 10 seconds, in case it's been enabled again.
-                sleep_duration = Duration::from_secs(10);
+                Duration::from_secs(10)
             } else {
                 // Run gc
-                if gc_horizon > 0 {
-                    if let Err(e) = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await
-                    {
-                        sleep_duration = wait_duration;
-                        error!("Gc failed, retrying in {:?}: {e:?}", sleep_duration);
-                    }
+                let res = tenant.gc_iteration(None, gc_horizon, tenant.get_pitr_interval(), &ctx).await;
+                if let Err(e) = res {
+                    error!("Gc failed, retrying in {:?}: {e:?}", wait_duration);
+                    wait_duration
+                } else {
+                    period
                 }
-            }
+            };
+
+            warn_when_period_overrun(started_at.elapsed(), period, "gc");
 
             // Sleep
             tokio::select! {
@@ -197,3 +206,18 @@ async fn wait_for_active_tenant(
         }
     }
 }
+
+pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
+    // Duration::ZERO will happen because it's the "disable [bgtask]" value.
+    if elapsed >= period && period != Duration::ZERO {
+        // humantime does no significant digits clamping whereas Duration's debug is a bit more
+        // intelligent. however it makes sense to keep the "configuration format" for period, even
+        // though there's no way to output the actual config value.
+        warn!(
+            ?elapsed,
+            period = %humantime::format_duration(period),
+            task,
+            "task iteration took longer than the configured period"
+        );
+    }
+}
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index fe7e7a1654..0dd169363e 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -44,6 +44,7 @@ impl Timeline {
         loop {
             let policy = self.get_eviction_policy();
             let cf = self.eviction_iteration(&policy, cancel.clone()).await;
+
             match cf {
                 ControlFlow::Break(()) => break,
                 ControlFlow::Continue(sleep_until) => {
@@ -78,13 +79,7 @@ impl Timeline {
                     ControlFlow::Continue(()) => (),
                 }
                 let elapsed = start.elapsed();
-                if elapsed > p.period {
-                    warn!(
-                        configured_period = %humantime::format_duration(p.period),
-                        last_period = %humantime::format_duration(elapsed),
-                        "this eviction period took longer than the configured period"
-                    );
-                }
+                crate::tenant::tasks::warn_when_period_overrun(elapsed, p.period, "eviction");
                 ControlFlow::Continue(start + p.period)
             }
         }
diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py
index 59d616eb6f..63196609cc 100644
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -2079,6 +2079,7 @@ class NeonPageserver(PgProtocol):
             ".*compaction_loop.*Compaction failed, retrying in.*timeline is Stopping",  # When compaction checks timeline state after acquiring layer_removal_cs
             ".*query handler for 'pagestream.*failed: Timeline .* was not found",  # postgres reconnects while timeline_delete doesn't hold the tenant's timelines.lock()
             ".*query handler for 'pagestream.*failed: Timeline .* is not active",  # timeline delete in progress
+            ".*task iteration took longer than the configured period.*",
         ]
 
     def start(

From b220ba6cd1973cdbe4d886904f2c217c6df72c77 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen <joonas@neon.tech>
Date: Tue, 21 Feb 2023 13:42:11 +0200
Subject: [PATCH 42/42] add random init delay for background tasks (#3655)

Fixes #3649.
---
 pageserver/src/tenant/tasks.rs                | 66 +++++++++++++++++--
 .../src/tenant/timeline/eviction_task.rs      | 13 ++++
 2 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/pageserver/src/tenant/tasks.rs b/pageserver/src/tenant/tasks.rs
index db269a1745..e9ce52d1ab 100644
--- a/pageserver/src/tenant/tasks.rs
+++ b/pageserver/src/tenant/tasks.rs
@@ -11,6 +11,7 @@ use crate::task_mgr;
 use crate::task_mgr::{TaskKind, BACKGROUND_RUNTIME};
 use crate::tenant::mgr;
 use crate::tenant::{Tenant, TenantState};
+use tokio_util::sync::CancellationToken;
 use tracing::*;
 use utils::id::TenantId;
 
@@ -53,12 +54,14 @@ async fn compaction_loop(tenant_id: TenantId) {
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
+        let cancel = task_mgr::shutdown_token();
         let ctx = RequestContext::todo_child(TaskKind::Compaction, DownloadBehavior::Download);
+        let mut first = true;
         loop {
             trace!("waking up");
 
             let tenant = tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     info!("received cancellation request");
                     return;
                 },
@@ -68,9 +71,19 @@ async fn compaction_loop(tenant_id: TenantId) {
                 },
             };
 
+            let period = tenant.get_compaction_period();
+
+            // TODO: we shouldn't need to await to find tenant and this could be moved outside of
+            // loop
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
+                }
+            }
+
             let started_at = Instant::now();
 
-            let period = tenant.get_compaction_period();
             let sleep_duration = if period == Duration::ZERO {
                 info!("automatic compaction is disabled");
                 // check again in 10 seconds, in case it's been enabled again.
@@ -89,7 +102,7 @@ async fn compaction_loop(tenant_id: TenantId) {
 
             // Sleep
             tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     info!("received cancellation request during idling");
                     break;
                 },
@@ -111,14 +124,16 @@ async fn gc_loop(tenant_id: TenantId) {
     info!("starting");
     TENANT_TASK_EVENTS.with_label_values(&["start"]).inc();
     async {
+        let cancel = task_mgr::shutdown_token();
         // GC might require downloading, to find the cutoff LSN that corresponds to the
         // cutoff specified as time.
         let ctx = RequestContext::todo_child(TaskKind::GarbageCollector, DownloadBehavior::Download);
+        let mut first = true;
         loop {
             trace!("waking up");
 
             let tenant = tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     info!("received cancellation request");
                     return;
                 },
@@ -128,9 +143,17 @@ async fn gc_loop(tenant_id: TenantId) {
                 },
             };
 
+            let period = tenant.get_gc_period();
+
+            if first {
+                first = false;
+                if random_init_delay(period, &cancel).await.is_err() {
+                    break;
+                }
+            }
+
             let started_at = Instant::now();
 
-            let period = tenant.get_gc_period();
             let gc_horizon = tenant.get_gc_horizon();
             let sleep_duration = if period == Duration::ZERO || gc_horizon == 0 {
                 info!("automatic GC is disabled");
@@ -151,7 +174,7 @@ async fn gc_loop(tenant_id: TenantId) {
 
             // Sleep
             tokio::select! {
-                _ = task_mgr::shutdown_watcher() => {
+                _ = cancel.cancelled() => {
                     info!("received cancellation request during idling");
                     break;
                 },
@@ -207,6 +230,37 @@ async fn wait_for_active_tenant(
     }
 }
 
+#[derive(thiserror::Error, Debug)]
+#[error("cancelled")]
+pub(crate) struct Cancelled;
+
+/// Provide a random delay for background task initialization.
+///
+/// This delay prevents a thundering herd of background tasks and will likely keep them running on
+/// different periods for more stable load.
+pub(crate) async fn random_init_delay(
+    period: Duration,
+    cancel: &CancellationToken,
+) -> Result<(), Cancelled> {
+    use rand::Rng;
+
+    let d = {
+        let mut rng = rand::thread_rng();
+
+        // gen_range asserts that the range cannot be empty, which it could be because period can
+        // be set to zero to disable gc or compaction, so lets set it to be at least 10s.
+        let period = std::cmp::max(period, Duration::from_secs(10));
+
+        // semi-ok default as the source of jitter
+        rng.gen_range(Duration::ZERO..=period)
+    };
+
+    tokio::select! {
+        _ = cancel.cancelled() => Err(Cancelled),
+        _ = tokio::time::sleep(d) => Ok(()),
+    }
+}
+
 pub(crate) fn warn_when_period_overrun(elapsed: Duration, period: Duration, task: &str) {
     // Duration::ZERO will happen because it's the "disable [bgtask]" value.
     if elapsed >= period && period != Duration::ZERO {
diff --git a/pageserver/src/tenant/timeline/eviction_task.rs b/pageserver/src/tenant/timeline/eviction_task.rs
index 0dd169363e..2aad0ef0f3 100644
--- a/pageserver/src/tenant/timeline/eviction_task.rs
+++ b/pageserver/src/tenant/timeline/eviction_task.rs
@@ -41,6 +41,19 @@ impl Timeline {
 
     #[instrument(skip_all, fields(tenant_id = %self.tenant_id, timeline_id = %self.timeline_id))]
     async fn eviction_task(self: Arc<Self>, cancel: CancellationToken) {
+        use crate::tenant::tasks::random_init_delay;
+        {
+            let policy = self.get_eviction_policy();
+            let period = match policy {
+                EvictionPolicy::LayerAccessThreshold(lat) => lat.period,
+                EvictionPolicy::NoEviction => Duration::from_secs(10),
+            };
+            if random_init_delay(period, &cancel).await.is_err() {
+                info!("shutting down");
+                return;
+            }
+        }
+
         loop {
             let policy = self.get_eviction_policy();
             let cf = self.eviction_iteration(&policy, cancel.clone()).await;