From d7bca9fcdb5b98e76b0f6ea60387fcdfb6040e10 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Sat, 29 Jun 2024 20:54:49 +0100
Subject: [PATCH] pageserver: implementation of update_layer_visibility

---
 pageserver/src/metrics.rs                        |  15 ++
 pageserver/src/tenant/layer_map.rs               | 162 +++++++++++++++++-
 .../layer_map/historic_layer_coverage.rs         |   4 +
 pageserver/src/tenant/storage_layer.rs           |   4 +
 pageserver/src/tenant/timeline.rs                |   4 +
 pageserver/src/tenant/timeline/compaction.rs     |  84 ++++++++-
 6 files changed, 270 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 59b7293631..0845a0a684 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -500,6 +500,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_visible_physical_size",
+        "The size of the layer files needed to service reads, i.e. layers whose visibility is Visible.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
     register_uint_gauge!(
         "pageserver_resident_physical_size_global",
@@ -2130,6 +2139,7 @@ pub(crate) struct TimelineMetrics {
     pub archival_size: UIntGauge,
     pub standby_horizon_gauge: IntGauge,
     pub resident_physical_size_gauge: UIntGauge,
+    pub visible_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub aux_file_size_gauge: IntGauge,
@@ -2216,6 +2226,9 @@ impl TimelineMetrics {
         let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
         // TODO: we shouldn't expose this metric
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2266,6 +2279,7 @@ impl TimelineMetrics {
             archival_size,
             standby_horizon_gauge,
             resident_physical_size_gauge,
+            visible_physical_size_gauge,
             current_logical_size_gauge,
             aux_file_size_gauge,
             directory_entries_count_gauge,
@@ -2317,6 +2331,7 @@ impl TimelineMetrics {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
             let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
+        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
             let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 2724a5cc07..8c526462de 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,7 +51,7 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use pageserver_api::keyspace::KeySpaceAccum;
+use pageserver_api::keyspace::{KeySpace, KeySpaceAccum, KeySpaceRandomAccum};
 use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
@@ -61,7 +61,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;
 
-use super::storage_layer::PersistentLayerDesc;
+use super::storage_layer::{LayerVisibility, PersistentLayerDesc};
 
 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -870,6 +870,164 @@ impl LayerMap {
         println!("End dump LayerMap");
         Ok(())
     }
+
+    /// `read_points` represents the tip of a timeline and any branch points, i.e. the places
+    /// where we expect to serve reads.
+    ///
+    /// This function is O(N) and should be called infrequently. The caller is responsible for
+    /// looking up and updating the Layer objects for the returned layer descriptors.
+    pub(crate) fn get_visibility(
+        &self,
+        mut read_points: Vec<(Lsn, KeySpace)>,
+    ) -> (Vec<(Arc<PersistentLayerDesc>, LayerVisibility)>, KeySpace) {
+        // This is like a KeySpace, but written for efficient subtraction of layers and unions with KeySpaces
+        struct KeyShadow {
+            // FIXME: consider efficiency. KeySpace is a flat vector, so repeated contains() calls are in
+            // principle fairly inefficient, BUT as we iterate through the layer map we expect the shadow to
+            // shrink to something quite small, and for small collections a flat vector with worse asymptotic
+            // complexity is often faster in practice than an asymptotically cheaper data structure.
+            inner: KeySpace,
+        }
+
+        impl KeyShadow {
+            fn new(keyspace: KeySpace) -> Self {
+                Self { inner: keyspace }
+            }
+
+            fn contains(&self, range: Range<Key>) -> bool {
+                self.inner.overlaps(&range)
+            }
+
+            /// Return true if anything was removed.
+            fn subtract(&mut self, range: Range<Key>) -> bool {
+                let removed = self.inner.remove_overlapping_with(&KeySpace {
+                    ranges: vec![range],
+                });
+                !removed.ranges.is_empty()
+            }
+
+            fn union_with(&mut self, keyspace: KeySpace) {
+                let mut accum = KeySpaceRandomAccum::new();
+                let prev = std::mem::take(&mut self.inner);
+                accum.add_keyspace(prev);
+                accum.add_keyspace(keyspace);
+                self.inner = accum.to_keyspace();
+            }
+        }
+
+        // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
+        // and a ReadPoint unions its keyspace back into it.
+        read_points.sort_by_key(|rp| rp.0);
+        let mut shadow = KeyShadow::new(
+            read_points
+                .pop()
+                .expect("Every timeline has at least one read point")
+                .1,
+        );
+
+        // We will interleave all our read points and layers into a sorted collection
+        enum Item {
+            ReadPoint { lsn: Lsn, keyspace: KeySpace },
+            Layer(Arc<PersistentLayerDesc>),
+        }
+
+        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
+        items.extend(self.iter_historic_layers().map(Item::Layer));
+        items.extend(read_points.into_iter().map(|rp| Item::ReadPoint {
+            lsn: rp.0,
+            keyspace: rp.1,
+        }));
+
+        // Ordering: we want to iterate like this:
+        // 1. Highest LSNs first
+        // 2. At the same LSN, consider ReadPoints before image layers: a read at LSN X may be served by an
+        //    image layer at exactly X, so the read point's keyspace must be in the shadow before the image
+        //    layer is classified.
+        // 3. At the same LSN, consider image layers before delta layers: an image at X covers deltas ending at X.
+        items.sort_by_key(|item| {
+            std::cmp::Reverse(match item {
+                Item::ReadPoint {
+                    lsn,
+                    keyspace: _keyspace,
+                } => (*lsn, 3),
+                Item::Layer(layer) => {
+                    if layer.is_delta() {
+                        (layer.get_lsn_range().end, 1)
+                    } else {
+                        (layer.image_layer_lsn(), 2)
+                    }
+                }
+            })
+        });
+
+        let mut results = Vec::with_capacity(self.historic.len());
+
+        // Delta layers need special handling when there are multiple read points: a delta that does not
+        // overlap the shadow when we reach its end LSN might still be needed by a branch point that lies
+        // within its LSN range. We therefore keep track of the delta layers we are currently within, and
+        // only decide their visibility once the sweep has passed them or hit a ReadPoint inside them.
+        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
+
+        for item in items {
+            let (reached_lsn, is_readpoint) = match &item {
+                Item::ReadPoint {
+                    lsn,
+                    keyspace: _keyspace,
+                } => (lsn, true),
+                Item::Layer(layer) => (&layer.lsn_range.start, false),
+            };
+            maybe_covered_deltas.retain(|d| {
+                if *reached_lsn >= d.lsn_range.start && is_readpoint {
+                    // We encountered a read point within the delta layer: it is visible
+                    results.push((d.clone(), LayerVisibility::Visible));
+                    false
+                } else if *reached_lsn < d.lsn_range.start {
+                    // We passed the layer's range without encountering a read point: it is not visible
+                    results.push((d.clone(), LayerVisibility::Covered));
+                    false
+                } else {
+                    // We're still within the delta layer: continue iterating
+                    true
+                }
+            });
+
+            match item {
+                Item::ReadPoint {
+                    lsn: _lsn,
+                    keyspace,
+                } => {
+                    shadow.union_with(keyspace);
+                }
+                Item::Layer(layer) => {
+                    let visibility = if layer.is_delta() {
+                        if shadow.contains(layer.get_key_range()) {
+                            LayerVisibility::Visible
+                        } else {
+                            // If a delta layer isn't visible based on current state, we must defer deciding
+                            // whether it is truly not visible until we have advanced past its LSN range: we
+                            // might encounter another branch point within this delta layer's LSN range.
+                            maybe_covered_deltas.push(layer);
+                            continue;
+                        }
+                    } else if shadow.subtract(layer.get_key_range()) {
+                        // An image layer which overlapped with the shadow
+                        LayerVisibility::Visible
+                    } else {
+                        // An image layer which did not overlap with the shadow
+                        LayerVisibility::Covered
+                    };
+
+                    results.push((layer, visibility));
+                }
+            }
+        }
+
+        // Drain any remaining maybe-covered deltas: no read point fell within their LSN range
+        results.extend(
+            maybe_covered_deltas
+                .into_iter()
+                .map(|d| (d, LayerVisibility::Covered)),
+        );
+
+        (results, shadow.inner)
+    }
 }
 
 #[cfg(test)]
diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
index 347490c1ba..136f68bc36 100644
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -521,6 +521,10 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
 
         Ok(&self.historic_coverage)
     }
+
+    pub(crate) fn len(&self) -> usize {
+        self.layers.len()
+    }
 }
 
 #[test]
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 94a64a8644..fa042700da 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -700,6 +700,10 @@ impl LayerAccessStats {
             },
         }
     }
+
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibility) {
+        self.0.lock().unwrap().visibility = visibility;
+    }
 }
 
 /// Get a layer descriptor from a layer.
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 3312d05ea6..f78a8be45b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1817,6 +1817,10 @@ impl Timeline {
             self.compact_shard_ancestors(rewrite_max, ctx).await?;
         }
 
+        // TODO: be more selective: call this once at startup, and thereafter only when branch points change
+        // or when image layers are generated.
+        self.update_layer_visibility(ctx).await?;
+
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index f8dc87787a..720d96815b 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -26,7 +26,7 @@ use utils::id::TimelineId;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
+use crate::tenant::storage_layer::{AsLayerDesc, LayerVisibility, PersistentLayerDesc};
 use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
@@ -347,6 +347,88 @@ impl Timeline {
         Ok(())
     }
 
+    /// A post-compaction step to update the LayerVisibility of layers covered by image layers. This
+    /// should also be called when new branches are created.
+    ///
+    /// Sweep through the layer map, identifying layers which are covered by image layers
+    /// such that they do not need to be available to service reads. The resulting LayerVisibility
+    /// may be used as an input to eviction and secondary downloads, to de-prioritize layers
+    /// that we know will not be needed for reads.
+    pub(super) async fn update_layer_visibility(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        // Start with a keyspace representing all the keys we need to read at the tip of the branch
+        let head_lsn = self.get_last_record_lsn();
+        let (mut head_keyspace, sparse_ks) = self.collect_keyspace(head_lsn, ctx).await?;
+
+        // Converting the sparse part of the keyspace into the dense keyspace is safe in this context
+        // because we will never iterate through the keys.
+        head_keyspace.merge(&sparse_ks.0);
+
+        // We will sweep through layers in reverse-LSN order. We only process historic layers: L0 deltas
+        // are implicitly visible, because LayerVisibility's default is Visible, and we never modify it here.
+        let layer_manager = self.layers.read().await;
+        let layer_map = layer_manager.layer_map();
+
+        let mut visible_size: u64 = 0;
+
+        // FIXME: we only get accurate keyspaces from children if they have already run update_layer_visibility
+        // themselves. At startup all the timelines initialize this in arbitrary order (at the end of
+        // initial_logical_size_calculation); we should coordinate these. Perhaps at the very start of the
+        // tenant compaction task we should do all the timelines' layer visibility calculations in leaf-first order?
+        let readable_points = {
+            let children = self.gc_info.read().unwrap().retain_lsns.clone();
+
+            let mut readable_points = Vec::with_capacity(children.len() + 1);
+            for (child_lsn, _child_timeline_id, child_keyspace) in &children {
+                let keyspace = match child_keyspace {
+                    Some(ks) => ks.clone(),
+                    None => {
+                        // The child has not posted information about which parts of the keyspace it depends
+                        // on: presume that it depends on all of it.
+                        let (mut keyspace, sparse_keyspace) =
+                            self.collect_keyspace(*child_lsn, ctx).await?;
+                        keyspace.merge(&sparse_keyspace.0);
+                        keyspace
+                    }
+                };
+                readable_points.push((*child_lsn, keyspace));
+            }
+            readable_points.push((head_lsn, head_keyspace));
+            readable_points
+        };
+
+        let (layer_visibility, shadow) = layer_map.get_visibility(readable_points);
+        for (layer_desc, visibility) in layer_visibility {
+            // FIXME: use a more efficient bulk zip() through the layers rather than an N log N lookup of each one
+            let layer = layer_manager.get_from_desc(&layer_desc);
+            if matches!(visibility, LayerVisibility::Visible) {
+                visible_size += layer.metadata().file_size;
+            }
+
+            layer.access_stats().set_visibility(visibility);
+        }
+
+        if let Some(ancestor) = &self.ancestor_timeline {
+            // Having calculated the readable keyspace after walking back through all this timeline's layers,
+            // the resulting keyspace is the set of keys for which reads may still fall through to the parent
+            // branch. Notify the parent branch of this, so that it may GC layers which do not overlap with
+            // this keyspace, and so that it may use it as an input to its own visibility updates.
+            ancestor
+                .gc_info
+                .write()
+                .unwrap()
+                .notify_child_keyspace(self.timeline_id, shadow);
+        }
+
+        // Also include in the visible size all the layers which we would never update visibility on
+        // TODO: add a getter that doesn't spuriously construct a Vec<>
+        for layer in layer_map.get_level0_deltas().unwrap() {
+            visible_size += layer.file_size;
+        }
+        self.metrics.visible_physical_size_gauge.set(visible_size);
+
+        Ok(())
+    }
+
     /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
     /// as Level 1 files.
     async fn compact_level0(
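For intuition about the sweep that get_visibility performs, the following is a minimal, self-contained sketch (not pageserver code): it stands in a plain BTreeSet<u32> for KeySpace/KeyShadow, uses simplified Image/Delta layer types, and omits the maybe_covered_deltas deferral that the patch needs because a branch point can fall inside a delta's LSN range. The names and values here are illustrative assumptions; only the ordering and shadow-subtraction technique mirror the patch. The leftover shadow (empty in this example) corresponds to the keyspace the patch hands to the ancestor timeline.

// Self-contained sketch of the visibility sweep; simplified stand-ins, not pageserver types.
use std::collections::BTreeSet;
use std::ops::Range;

#[derive(Debug)]
enum Visibility {
    Visible,
    Covered,
}

enum Layer {
    // An image layer: complete values for `keys` as of `lsn`.
    Image { name: &'static str, lsn: u64, keys: BTreeSet<u32> },
    // A delta layer: incremental changes to `keys`, ending (exclusive) at `lsn_end`.
    Delta { name: &'static str, lsn_end: u64, keys: BTreeSet<u32> },
}

fn keys(r: Range<u32>) -> BTreeSet<u32> {
    r.collect()
}

fn main() {
    // Read points: the branch tip at LSN 100 plus a branch point at LSN 40, each reading keys 0..10.
    let mut read_points: Vec<(u64, BTreeSet<u32>)> = vec![(100, keys(0..10)), (40, keys(0..10))];

    let layers = vec![
        Layer::Image { name: "image@80", lsn: 80, keys: keys(0..10) },
        Layer::Delta { name: "delta 50..70", lsn_end: 70, keys: keys(0..10) },
        Layer::Image { name: "image@30", lsn: 30, keys: keys(0..10) },
        Layer::Delta { name: "delta 20..25", lsn_end: 25, keys: keys(0..10) },
    ];

    // The shadow starts as the keyspace of the highest read point; lower read points are swept like layers.
    read_points.sort_by_key(|rp| rp.0);
    let mut shadow: BTreeSet<u32> = read_points.pop().unwrap().1;

    enum Item {
        ReadPoint(u64, BTreeSet<u32>),
        Layer(Layer),
    }
    let mut items: Vec<Item> = layers.into_iter().map(Item::Layer).collect();
    items.extend(read_points.into_iter().map(|(lsn, ks)| Item::ReadPoint(lsn, ks)));

    // Same ordering idea as the patch: highest LSN first; on a tie, read points before
    // images, and images before deltas (the second tuple element is the tie-breaker).
    items.sort_by_key(|item| {
        std::cmp::Reverse(match item {
            Item::ReadPoint(lsn, _) => (*lsn, 2u8),
            Item::Layer(Layer::Image { lsn, .. }) => (*lsn, 1),
            Item::Layer(Layer::Delta { lsn_end, .. }) => (*lsn_end, 0),
        })
    });

    let mut results: Vec<(&'static str, Visibility)> = Vec::new();
    for item in items {
        match item {
            Item::ReadPoint(_lsn, ks) => {
                // A branch point: these keys become readable again below this LSN.
                shadow.extend(ks);
            }
            Item::Layer(Layer::Image { name, keys, .. }) => {
                if keys.iter().any(|k| shadow.contains(k)) {
                    // Needed by some read above it; it also shadows everything older for these keys.
                    for k in &keys {
                        shadow.remove(k);
                    }
                    results.push((name, Visibility::Visible));
                } else {
                    results.push((name, Visibility::Covered));
                }
            }
            Item::Layer(Layer::Delta { name, keys, .. }) => {
                // Simplification: the patch defers this decision (maybe_covered_deltas) until the sweep
                // has passed the delta's whole LSN range, since a branch point may fall inside it.
                let visible = keys.iter().any(|k| shadow.contains(k));
                results.push((name, if visible { Visibility::Visible } else { Visibility::Covered }));
            }
        }
    }

    for (name, vis) in results {
        println!("{name}: {vis:?}");
    }
    // Prints: image@80 Visible, delta 50..70 Covered, image@30 Visible (kept alive by the
    // branch point at LSN 40), delta 20..25 Covered.
}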