From d7bca9fcdb5b98e76b0f6ea60387fcdfb6040e10 Mon Sep 17 00:00:00 2001
From: John Spray
Date: Sat, 29 Jun 2024 20:54:49 +0100
Subject: [PATCH] pageserver: implementation of update_layer_visibility

---
 pageserver/src/metrics.rs                        |  15 ++
 pageserver/src/tenant/layer_map.rs               | 162 +++++++++++++++++-
 .../layer_map/historic_layer_coverage.rs         |   4 +
 pageserver/src/tenant/storage_layer.rs           |   4 +
 pageserver/src/tenant/timeline.rs                |   4 +
 pageserver/src/tenant/timeline/compaction.rs     |  84 ++++++++-
 6 files changed, 270 insertions(+), 3 deletions(-)

diff --git a/pageserver/src/metrics.rs b/pageserver/src/metrics.rs
index 59b7293631..0845a0a684 100644
--- a/pageserver/src/metrics.rs
+++ b/pageserver/src/metrics.rs
@@ -500,6 +500,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_visible_physical_size",
+        "The size of the layer files needed to service reads, i.e. layers whose visibility is Visible.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
     register_uint_gauge!(
         "pageserver_resident_physical_size_global",
@@ -2130,6 +2139,7 @@ pub(crate) struct TimelineMetrics {
     pub archival_size: UIntGauge,
     pub standby_horizon_gauge: IntGauge,
     pub resident_physical_size_gauge: UIntGauge,
+    pub visible_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub aux_file_size_gauge: IntGauge,
@@ -2216,6 +2226,9 @@ impl TimelineMetrics {
         let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
         // TODO: we shouldn't expose this metric
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
@@ -2266,6 +2279,7 @@ impl TimelineMetrics {
             archival_size,
             standby_horizon_gauge,
             resident_physical_size_gauge,
+            visible_physical_size_gauge,
             current_logical_size_gauge,
             aux_file_size_gauge,
             directory_entries_count_gauge,
@@ -2317,6 +2331,7 @@ impl TimelineMetrics {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
             let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
+        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
             let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
diff --git a/pageserver/src/tenant/layer_map.rs b/pageserver/src/tenant/layer_map.rs
index 2724a5cc07..8c526462de 100644
--- a/pageserver/src/tenant/layer_map.rs
+++ b/pageserver/src/tenant/layer_map.rs
@@ -51,7 +51,7 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use pageserver_api::keyspace::KeySpaceAccum;
+use pageserver_api::keyspace::{KeySpace, KeySpaceAccum, KeySpaceRandomAccum};
 use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;
@@ -61,7 +61,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;
 
-use super::storage_layer::PersistentLayerDesc;
+use super::storage_layer::{LayerVisibility, PersistentLayerDesc};
 
 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -870,6 +870,164 @@ impl LayerMap {
         println!("End dump LayerMap");
         Ok(())
     }
+
+    /// `read_points` represents the tip of a timeline and any branch points, i.e. the places
+    /// where we expect to serve reads.
+    ///
+    /// This function is O(N) and should be called infrequently. The caller is responsible for
+    /// looking up and updating the Layer objects for the returned layer descriptors.
+    pub(crate) fn get_visibility(
+        &self,
+        mut read_points: Vec<(Lsn, KeySpace)>,
+    ) -> (Vec<(Arc<PersistentLayerDesc>, LayerVisibility)>, KeySpace) {
+        // This is like a KeySpace, but written for efficient subtraction of layers and unions with KeySpaces
+        struct KeyShadow {
+            // FIXME: consider efficiency. KeySpace is a flat vector, so repeated contains() calls are in
+            // principle fairly inefficient, BUT as we iterate through the layer map we expect the shadow to
+            // shrink to something quite small, and for small collections a flat vector with worse asymptotic
+            // complexity is often faster in practice than an asymptotically cheaper data structure.
+            inner: KeySpace,
+        }
+
+        impl KeyShadow {
+            fn new(keyspace: KeySpace) -> Self {
+                Self { inner: keyspace }
+            }
+
+            fn contains(&self, range: Range<Key>) -> bool {
+                self.inner.overlaps(&range)
+            }
+
+            /// Return true if anything was removed.
+            fn subtract(&mut self, range: Range<Key>) -> bool {
+                let removed = self.inner.remove_overlapping_with(&KeySpace {
+                    ranges: vec![range],
+                });
+                !removed.ranges.is_empty()
+            }
+
+            fn union_with(&mut self, keyspace: KeySpace) {
+                let mut accum = KeySpaceRandomAccum::new();
+                let prev = std::mem::take(&mut self.inner);
+                accum.add_keyspace(prev);
+                accum.add_keyspace(keyspace);
+                self.inner = accum.to_keyspace();
+            }
+        }
+
+        // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts from the shadow,
+        // and a ReadPoint unions its keyspace back into it.
+        read_points.sort_by_key(|rp| rp.0);
+        let mut shadow = KeyShadow::new(
+            read_points
+                .pop()
+                .expect("Every timeline has at least one read point")
+                .1,
+        );
+
+        // We will interleave all our read points and layers into a sorted collection
+        enum Item {
+            ReadPoint { lsn: Lsn, keyspace: KeySpace },
+            Layer(Arc<PersistentLayerDesc>),
+        }
+
+        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
+        items.extend(self.iter_historic_layers().map(Item::Layer));
+        items.extend(read_points.into_iter().map(|rp| Item::ReadPoint {
+            lsn: rp.0,
+            keyspace: rp.1,
+        }));
+
+        // Ordering: we want to iterate like this:
+        // 1. Highest LSNs first
+        // 2. At the same LSN, consider ReadPoints before image layers: a read at LSN X may be served by an
+        //    image layer at exactly X, so the read point's keyspace must be in the shadow before the image
+        //    layer is classified.
+        // 3. At the same LSN, consider image layers before delta layers: an image at X covers deltas ending at X.
+        items.sort_by_key(|item| {
+            std::cmp::Reverse(match item {
+                Item::ReadPoint {
+                    lsn,
+                    keyspace: _keyspace,
+                } => (*lsn, 3),
+                Item::Layer(layer) => {
+                    if layer.is_delta() {
+                        (layer.get_lsn_range().end, 1)
+                    } else {
+                        (layer.image_layer_lsn(), 2)
+                    }
+                }
+            })
+        });
+
+        let mut results = Vec::with_capacity(self.historic.len());
+
+        // Delta layers need special handling when there are multiple read points: a delta that does not
+        // overlap the shadow when we reach its end LSN might still be needed by a branch point that lies
+        // within its LSN range. We therefore keep track of the delta layers we are currently within, and
+        // only decide their visibility once the sweep has passed them or hit a ReadPoint inside them.
+        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
+
+        for item in items {
+            let (reached_lsn, is_readpoint) = match &item {
+                Item::ReadPoint {
+                    lsn,
+                    keyspace: _keyspace,
+                } => (lsn, true),
+                Item::Layer(layer) => (&layer.lsn_range.start, false),
+            };
+            maybe_covered_deltas.retain(|d| {
+                if *reached_lsn >= d.lsn_range.start && is_readpoint {
+                    // We encountered a read point within the delta layer: it is visible
+                    results.push((d.clone(), LayerVisibility::Visible));
+                    false
+                } else if *reached_lsn < d.lsn_range.start {
+                    // We passed the layer's range without encountering a read point: it is not visible
+                    results.push((d.clone(), LayerVisibility::Covered));
+                    false
+                } else {
+                    // We're still within the delta layer: continue iterating
+                    true
+                }
+            });
+
+            match item {
+                Item::ReadPoint {
+                    lsn: _lsn,
+                    keyspace,
+                } => {
+                    shadow.union_with(keyspace);
+                }
+                Item::Layer(layer) => {
+                    let visibility = if layer.is_delta() {
+                        if shadow.contains(layer.get_key_range()) {
+                            LayerVisibility::Visible
+                        } else {
+                            // If a delta layer isn't visible based on current state, we must defer deciding
+                            // whether it is truly not visible until we have advanced past its LSN range: we
+                            // might encounter another branch point within this delta layer's LSN range.
+                            maybe_covered_deltas.push(layer);
+                            continue;
+                        }
+                    } else if shadow.subtract(layer.get_key_range()) {
+                        // An image layer which overlapped with the shadow
+                        LayerVisibility::Visible
+                    } else {
+                        // An image layer which did not overlap with the shadow
+                        LayerVisibility::Covered
+                    };
+
+                    results.push((layer, visibility));
+                }
+            }
+        }
+
+        // Drain any remaining maybe-covered deltas: no read point fell within their LSN range
+        results.extend(
+            maybe_covered_deltas
+                .into_iter()
+                .map(|d| (d, LayerVisibility::Covered)),
+        );
+
+        (results, shadow.inner)
+    }
 }
 
 #[cfg(test)]
diff --git a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
index 347490c1ba..136f68bc36 100644
--- a/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
+++ b/pageserver/src/tenant/layer_map/historic_layer_coverage.rs
@@ -521,6 +521,10 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
 
         Ok(&self.historic_coverage)
     }
+
+    pub(crate) fn len(&self) -> usize {
+        self.layers.len()
+    }
 }
 
 #[test]
diff --git a/pageserver/src/tenant/storage_layer.rs b/pageserver/src/tenant/storage_layer.rs
index 94a64a8644..fa042700da 100644
--- a/pageserver/src/tenant/storage_layer.rs
+++ b/pageserver/src/tenant/storage_layer.rs
@@ -700,6 +700,10 @@ impl LayerAccessStats {
             },
         }
     }
+
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibility) {
+        self.0.lock().unwrap().visibility = visibility;
+    }
 }
 
 /// Get a layer descriptor from a layer.
diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs
index 3312d05ea6..f78a8be45b 100644
--- a/pageserver/src/tenant/timeline.rs
+++ b/pageserver/src/tenant/timeline.rs
@@ -1817,6 +1817,10 @@ impl Timeline {
             self.compact_shard_ancestors(rewrite_max, ctx).await?;
         }
 
+        // TODO: be more selective: call this once at startup, and thereafter only when branch points change
+        // or when image layers are generated.
+        self.update_layer_visibility(ctx).await?;
+
         Ok(())
     }
 
diff --git a/pageserver/src/tenant/timeline/compaction.rs b/pageserver/src/tenant/timeline/compaction.rs
index f8dc87787a..720d96815b 100644
--- a/pageserver/src/tenant/timeline/compaction.rs
+++ b/pageserver/src/tenant/timeline/compaction.rs
@@ -26,7 +26,7 @@ use utils::id::TimelineId;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
+use crate::tenant::storage_layer::{AsLayerDesc, LayerVisibility, PersistentLayerDesc};
 use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
@@ -347,6 +347,88 @@ impl Timeline {
         Ok(())
     }
 
+    /// A post-compaction step to update the LayerVisibility of layers covered by image layers. This
+    /// should also be called when new branches are created.
+    ///
+    /// Sweep through the layer map, identifying layers which are covered by image layers
+    /// such that they do not need to be available to service reads. The resulting LayerVisibility
+    /// may be used as an input to eviction and secondary downloads, to de-prioritize layers
+    /// that we know will not be needed for reads.
+    pub(super) async fn update_layer_visibility(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        // Start with a keyspace representing all the keys we need to read at the tip of the branch
+        let head_lsn = self.get_last_record_lsn();
+        let (mut head_keyspace, sparse_ks) = self.collect_keyspace(head_lsn, ctx).await?;
+
+        // Converting the sparse part of the keyspace into the dense keyspace is safe in this context
+        // because we will never iterate through the keys.
+        head_keyspace.merge(&sparse_ks.0);
+
+        // We will sweep through layers in reverse-LSN order. We only process historic layers: L0 deltas
+        // are implicitly visible, because LayerVisibility's default is Visible, and we never modify it here.
+        let layer_manager = self.layers.read().await;
+        let layer_map = layer_manager.layer_map();
+
+        let mut visible_size: u64 = 0;
+
+        // FIXME: we only get accurate keyspaces from children if they have already run update_layer_visibility
+        // themselves. At startup all the timelines initialize this in arbitrary order (at the end of
+        // initial_logical_size_calculation); we should coordinate these. Perhaps at the very start of the
+        // tenant compaction task we should do all the timelines' layer visibility calculations in leaf-first order?
+        let readable_points = {
+            let children = self.gc_info.read().unwrap().retain_lsns.clone();
+
+            let mut readable_points = Vec::with_capacity(children.len() + 1);
+            for (child_lsn, _child_timeline_id, child_keyspace) in &children {
+                let keyspace = match child_keyspace {
+                    Some(ks) => ks.clone(),
+                    None => {
+                        // The child has not posted information about which parts of the keyspace it depends
+                        // on: presume that it depends on all of it.
+                        let (mut keyspace, sparse_keyspace) =
+                            self.collect_keyspace(*child_lsn, ctx).await?;
+                        keyspace.merge(&sparse_keyspace.0);
+                        keyspace
+                    }
+                };
+                readable_points.push((*child_lsn, keyspace));
+            }
+            readable_points.push((head_lsn, head_keyspace));
+            readable_points
+        };
+
+        let (layer_visibility, shadow) = layer_map.get_visibility(readable_points);
+        for (layer_desc, visibility) in layer_visibility {
+            // FIXME: use a more efficient bulk zip() through the layers rather than an N log N lookup of each one
+            let layer = layer_manager.get_from_desc(&layer_desc);
+            if matches!(visibility, LayerVisibility::Visible) {
+                visible_size += layer.metadata().file_size;
+            }
+
+            layer.access_stats().set_visibility(visibility);
+        }
+
+        if let Some(ancestor) = &self.ancestor_timeline {
+            // Having calculated the readable keyspace after walking back through all this timeline's layers,
+            // the resulting keyspace is the set of keys for which reads may still fall through to the parent
+            // branch. Notify the parent branch of this, so that it may GC layers which do not overlap with
+            // this keyspace, and so that it may use it as an input to its own visibility updates.
+            ancestor
+                .gc_info
+                .write()
+                .unwrap()
+                .notify_child_keyspace(self.timeline_id, shadow);
+        }
+
+        // Also include in the visible size all the layers which we would never update visibility on
+        // TODO: add a getter that doesn't spuriously construct a Vec<>
+        for layer in layer_map.get_level0_deltas().unwrap() {
+            visible_size += layer.file_size;
+        }
+        self.metrics.visible_physical_size_gauge.set(visible_size);
+
+        Ok(())
+    }
+
     /// Collect a bunch of Level 0 layer files, and compact and reshuffle them as
     /// as Level 1 files.
     async fn compact_level0(
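For intuition about the sweep that get_visibility performs, the following is a minimal, self-contained sketch (not pageserver code): it stands in a plain BTreeSet<u32> for KeySpace/KeyShadow, uses simplified Image/Delta layer types, and omits the maybe_covered_deltas deferral that the patch needs because a branch point can fall inside a delta's LSN range. The names and values here are illustrative assumptions; only the ordering and shadow-subtraction technique mirror the patch. The leftover shadow (empty in this example) corresponds to the keyspace the patch hands to the ancestor timeline.

// Self-contained sketch of the visibility sweep; simplified stand-ins, not pageserver types.
use std::collections::BTreeSet;
use std::ops::Range;

#[derive(Debug)]
enum Visibility {
    Visible,
    Covered,
}

enum Layer {
    // An image layer: complete values for `keys` as of `lsn`.
    Image { name: &'static str, lsn: u64, keys: BTreeSet<u32> },
    // A delta layer: incremental changes to `keys`, ending (exclusive) at `lsn_end`.
    Delta { name: &'static str, lsn_end: u64, keys: BTreeSet<u32> },
}

fn keys(r: Range<u32>) -> BTreeSet<u32> {
    r.collect()
}

fn main() {
    // Read points: the branch tip at LSN 100 plus a branch point at LSN 40, each reading keys 0..10.
    let mut read_points: Vec<(u64, BTreeSet<u32>)> = vec![(100, keys(0..10)), (40, keys(0..10))];

    let layers = vec![
        Layer::Image { name: "image@80", lsn: 80, keys: keys(0..10) },
        Layer::Delta { name: "delta 50..70", lsn_end: 70, keys: keys(0..10) },
        Layer::Image { name: "image@30", lsn: 30, keys: keys(0..10) },
        Layer::Delta { name: "delta 20..25", lsn_end: 25, keys: keys(0..10) },
    ];

    // The shadow starts as the keyspace of the highest read point; lower read points are swept like layers.
    read_points.sort_by_key(|rp| rp.0);
    let mut shadow: BTreeSet<u32> = read_points.pop().unwrap().1;

    enum Item {
        ReadPoint(u64, BTreeSet<u32>),
        Layer(Layer),
    }
    let mut items: Vec<Item> = layers.into_iter().map(Item::Layer).collect();
    items.extend(read_points.into_iter().map(|(lsn, ks)| Item::ReadPoint(lsn, ks)));

    // Same ordering idea as the patch: highest LSN first; on a tie, read points before
    // images, and images before deltas (the second tuple element is the tie-breaker).
    items.sort_by_key(|item| {
        std::cmp::Reverse(match item {
            Item::ReadPoint(lsn, _) => (*lsn, 2u8),
            Item::Layer(Layer::Image { lsn, .. }) => (*lsn, 1),
            Item::Layer(Layer::Delta { lsn_end, .. }) => (*lsn_end, 0),
        })
    });

    let mut results: Vec<(&'static str, Visibility)> = Vec::new();
    for item in items {
        match item {
            Item::ReadPoint(_lsn, ks) => {
                // A branch point: these keys become readable again below this LSN.
                shadow.extend(ks);
            }
            Item::Layer(Layer::Image { name, keys, .. }) => {
                if keys.iter().any(|k| shadow.contains(k)) {
                    // Needed by some read above it; it also shadows everything older for these keys.
                    for k in &keys {
                        shadow.remove(k);
                    }
                    results.push((name, Visibility::Visible));
                } else {
                    results.push((name, Visibility::Covered));
                }
            }
            Item::Layer(Layer::Delta { name, keys, .. }) => {
                // Simplification: the patch defers this decision (maybe_covered_deltas) until the sweep
                // has passed the delta's whole LSN range, since a branch point may fall inside it.
                let visible = keys.iter().any(|k| shadow.contains(k));
                results.push((name, if visible { Visibility::Visible } else { Visibility::Covered }));
            }
        }
    }

    for (name, vis) in results {
        println!("{name}: {vis:?}");
    }
    // Prints: image@80 Visible, delta 50..70 Covered, image@30 Visible (kept alive by the
    // branch point at LSN 40), delta 20..25 Covered.
}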