pageserver: implementation of update_layer_visibility
@@ -500,6 +500,15 @@ static RESIDENT_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
     .expect("failed to define a metric")
 });
 
+static VISIBLE_PHYSICAL_SIZE: Lazy<UIntGaugeVec> = Lazy::new(|| {
+    register_uint_gauge_vec!(
+        "pageserver_visible_physical_size",
+        "The size of the layer files present in the pageserver's filesystem.",
+        &["tenant_id", "shard_id", "timeline_id"]
+    )
+    .expect("failed to define a metric")
+});
+
 pub(crate) static RESIDENT_PHYSICAL_SIZE_GLOBAL: Lazy<UIntGauge> = Lazy::new(|| {
     register_uint_gauge!(
         "pageserver_resident_physical_size_global",

@@ -2130,6 +2139,7 @@ pub(crate) struct TimelineMetrics {
     pub archival_size: UIntGauge,
     pub standby_horizon_gauge: IntGauge,
     pub resident_physical_size_gauge: UIntGauge,
+    pub visible_physical_size_gauge: UIntGauge,
     /// copy of LayeredTimeline.current_logical_size
     pub current_logical_size_gauge: UIntGauge,
     pub aux_file_size_gauge: IntGauge,

@@ -2216,6 +2226,9 @@ impl TimelineMetrics {
         let resident_physical_size_gauge = RESIDENT_PHYSICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
             .unwrap();
+        let visible_physical_size_gauge = VISIBLE_PHYSICAL_SIZE
+            .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])
+            .unwrap();
         // TODO: we shouldn't expose this metric
         let current_logical_size_gauge = CURRENT_LOGICAL_SIZE
             .get_metric_with_label_values(&[&tenant_id, &shard_id, &timeline_id])

@@ -2266,6 +2279,7 @@ impl TimelineMetrics {
             archival_size,
             standby_horizon_gauge,
             resident_physical_size_gauge,
+            visible_physical_size_gauge,
             current_logical_size_gauge,
             aux_file_size_gauge,
             directory_entries_count_gauge,

@@ -2317,6 +2331,7 @@ impl TimelineMetrics {
             RESIDENT_PHYSICAL_SIZE_GLOBAL.sub(self.resident_physical_size_get());
             let _ = RESIDENT_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         }
+        let _ = VISIBLE_PHYSICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         let _ = CURRENT_LOGICAL_SIZE.remove_label_values(&[tenant_id, shard_id, timeline_id]);
         if let Some(metric) = Lazy::get(&DIRECTORY_ENTRIES_COUNT) {
             let _ = metric.remove_label_values(&[tenant_id, shard_id, timeline_id]);
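For readers unfamiliar with the metric pattern above — a lazily registered, labelled gauge vector from which each timeline resolves its own child gauge, and from which it removes its label set on shutdown — here is a minimal standalone sketch. It uses the plain prometheus and once_cell crates; the pageserver has its own metrics wrapper, so the exact types and macro names there differ.

use once_cell::sync::Lazy;
use prometheus::{register_int_gauge_vec, IntGaugeVec};

// Registered once, process-wide, on first access.
static VISIBLE_SIZE: Lazy<IntGaugeVec> = Lazy::new(|| {
    register_int_gauge_vec!(
        "example_visible_physical_size",
        "Bytes of layer files currently visible to reads.",
        &["tenant_id", "timeline_id"]
    )
    .expect("failed to define a metric")
});

fn main() {
    // Each timeline resolves its own child gauge by label values and holds on to it...
    let gauge = VISIBLE_SIZE.with_label_values(&["tenant-a", "tl-1"]);
    gauge.set(4096);

    // ...and removes its label set on shutdown, so stale series don't linger in scrapes.
    let _ = VISIBLE_SIZE.remove_label_values(&["tenant-a", "tl-1"]);
}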
@@ -51,7 +51,7 @@ use crate::keyspace::KeyPartitioning;
 use crate::repository::Key;
 use crate::tenant::storage_layer::InMemoryLayer;
 use anyhow::Result;
-use pageserver_api::keyspace::KeySpaceAccum;
+use pageserver_api::keyspace::{KeySpace, KeySpaceAccum, KeySpaceRandomAccum};
 use std::collections::{HashMap, VecDeque};
 use std::iter::Peekable;
 use std::ops::Range;

@@ -61,7 +61,7 @@ use utils::lsn::Lsn;
 use historic_layer_coverage::BufferedHistoricLayerCoverage;
 pub use historic_layer_coverage::LayerKey;
 
-use super::storage_layer::PersistentLayerDesc;
+use super::storage_layer::{LayerVisibility, PersistentLayerDesc};
 
 ///
 /// LayerMap tracks what layers exist on a timeline.
@@ -870,6 +870,164 @@ impl LayerMap {
         println!("End dump LayerMap");
         Ok(())
     }
+
+    /// `read_points` represent the tip of a timeline and any branch points, i.e. the places
+    /// where we expect to serve reads.
+    ///
+    /// This function is O(N) and should be called infrequently. The caller is responsible for
+    /// looking up and updating the Layer objects for these layer descriptors.
+    pub(crate) fn get_visibility(
+        &self,
+        mut read_points: Vec<(Lsn, KeySpace)>,
+    ) -> (Vec<(Arc<PersistentLayerDesc>, LayerVisibility)>, KeySpace) {
+        // This is like a KeySpace, but written for efficient subtraction of layers and unions with KeySpaces
+        struct KeyShadow {
+            // FIXME: consider efficiency. KeySpace is a flat vector, so in principle fairly inefficient for
+            // repeatedly calling contains(), BUT as we iterate through the layer map we expect the shadow to
+            // shrink to something quite small, and for small collections an asymptotically expensive flat
+            // vector often performs better in practice than an asymptotically cheaper data structure.
+            inner: KeySpace,
+        }
+
+        impl KeyShadow {
+            fn new(keyspace: KeySpace) -> Self {
+                Self { inner: keyspace }
+            }
+
+            fn contains(&self, range: Range<Key>) -> bool {
+                self.inner.overlaps(&range)
+            }
+
+            /// Return true if anything was removed.
+            fn subtract(&mut self, range: Range<Key>) -> bool {
+                let removed = self.inner.remove_overlapping_with(&KeySpace {
+                    ranges: vec![range],
+                });
+                !removed.ranges.is_empty()
+            }
+
+            fn union_with(&mut self, keyspace: KeySpace) {
+                let mut accum = KeySpaceRandomAccum::new();
+                let prev = std::mem::take(&mut self.inner);
+                accum.add_keyspace(prev);
+                accum.add_keyspace(keyspace);
+                self.inner = accum.to_keyspace();
+            }
+        }
+
+        // The 'shadow' will be updated as we sweep through the layers: an image layer subtracts
+        // from the shadow, and a ReadPoint unions its keyspace back into the shadow.
+        read_points.sort_by_key(|rp| rp.0);
+        let mut shadow = KeyShadow::new(
+            read_points
+                .pop()
+                .expect("Every timeline has at least one read point")
+                .1,
+        );
+
+        // We will interleave all our read points and layers into a sorted collection
+        enum Item {
+            ReadPoint { lsn: Lsn, keyspace: KeySpace },
+            Layer(Arc<PersistentLayerDesc>),
+        }
+
+        let mut items = Vec::with_capacity(self.historic.len() + read_points.len());
+        items.extend(self.iter_historic_layers().map(Item::Layer));
+        items.extend(read_points.into_iter().map(|rp| Item::ReadPoint {
+            lsn: rp.0,
+            keyspace: rp.1,
+        }));
+
+        // Ordering: we want to iterate like this:
+        // 1. Highest LSNs first
+        // 2. Consider ReadPoints before image layers if they're at the same LSN
+        items.sort_by_key(|item| {
+            std::cmp::Reverse(match item {
+                Item::ReadPoint {
+                    lsn,
+                    keyspace: _keyspace,
+                } => (*lsn, 0),
+                Item::Layer(layer) => {
+                    if layer.is_delta() {
+                        (layer.get_lsn_range().end, 1)
+                    } else {
+                        (layer.image_layer_lsn(), 2)
+                    }
+                }
+            })
+        });
+
+        let mut results = Vec::with_capacity(self.historic.len());
+
+        // TODO: handle delta layers properly with multiple read points: if a read point intersects a delta layer, we might already
+        // have encountered it and marked it as not-visible. We need to keep track of which delta layers we are currently within, and
+        // when we encounter a ReadPoint, update the delta layer's visibility as needed.
+        // let mut pending_delta : Vec= ...
+        let mut maybe_covered_deltas: Vec<Arc<PersistentLayerDesc>> = Vec::new();
+
+        for item in items {
+            let (reached_lsn, is_readpoint) = match &item {
+                Item::ReadPoint {
+                    lsn,
+                    keyspace: _keyspace,
+                } => (lsn, true),
+                Item::Layer(layer) => (&layer.lsn_range.start, false),
+            };
+            maybe_covered_deltas.retain(|d| {
+                if *reached_lsn >= d.lsn_range.start && is_readpoint {
+                    // We encountered a readpoint within the delta layer: it is visible
+                    results.push((d.clone(), LayerVisibility::Visible));
+                    false
+                } else if *reached_lsn < d.lsn_range.start {
+                    // We passed the layer's range without encountering a read point: it is not visible
+                    results.push((d.clone(), LayerVisibility::Covered));
+                    false
+                } else {
+                    // We're still in the delta layer: continue iterating
+                    true
+                }
+            });
+
+            match item {
+                Item::ReadPoint {
+                    lsn: _lsn,
+                    keyspace,
+                } => {
+                    shadow.union_with(keyspace);
+                }
+                Item::Layer(layer) => {
+                    let visibility = if layer.is_delta() {
+                        if shadow.contains(layer.get_key_range()) {
+                            LayerVisibility::Visible
+                        } else {
+                            // If a layer isn't visible based on current state, we must defer deciding whether
+                            // it is truly not visible until we have advanced past the delta's range: we might
+                            // encounter another branch point within this delta layer's LSN range.
+                            maybe_covered_deltas.push(layer);
+                            continue;
+                        }
+                    } else if shadow.subtract(layer.get_key_range()) {
+                        // An image layer, which overlapped with the shadow
+                        LayerVisibility::Visible
+                    } else {
+                        // An image layer, which did not overlap with the shadow
+                        LayerVisibility::Covered
+                    };
+
+                    results.push((layer, visibility));
+                }
+            }
+        }
+
+        // Drain any remaining maybe_covered deltas
+        results.extend(
+            maybe_covered_deltas
+                .into_iter()
+                .map(|d| (d, LayerVisibility::Covered)),
+        );
+
+        (results, shadow.inner)
+    }
 }
 
 #[cfg(test)]
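The sweep implemented by get_visibility is easier to see on a toy input. The sketch below is illustrative only: plain integer keys, a single read point, and image layers only, so it omits the deferred maybe_covered_deltas handling and the multi-read-point ordering above; none of these names come from the neon codebase. Walking from the highest LSN downward, an image layer that still overlaps the shadow is visible and subtracts its key range from it; a layer whose range no longer intersects the shadow is covered.

use std::ops::Range;

#[derive(Debug)]
enum Visibility {
    Visible,
    Covered,
}

struct ToyLayer {
    lsn: u64,
    keys: Range<u32>,
}

// Sweep image layers from highest LSN to lowest. The "shadow" holds the keys
// that reads at the tip still need to resolve below the current sweep position.
fn toy_visibility(mut layers: Vec<ToyLayer>, tip_keys: Range<u32>) -> Vec<(u64, Visibility)> {
    layers.sort_by_key(|l| std::cmp::Reverse(l.lsn));
    let mut shadow = vec![tip_keys]; // disjoint ranges, like a flat KeySpace
    let mut out = Vec::new();
    for layer in layers {
        let overlaps = shadow
            .iter()
            .any(|s| s.start < layer.keys.end && layer.keys.start < s.end);
        if overlaps {
            // This image satisfies reads for its range: subtract it from the shadow.
            shadow = shadow
                .into_iter()
                .flat_map(|s| {
                    let mut parts = Vec::new();
                    if s.start < layer.keys.start {
                        parts.push(s.start..layer.keys.start.min(s.end));
                    }
                    if layer.keys.end < s.end {
                        parts.push(layer.keys.end.max(s.start)..s.end);
                    }
                    parts
                })
                .collect();
        }
        out.push((layer.lsn, if overlaps { Visibility::Visible } else { Visibility::Covered }));
    }
    out
}

fn main() {
    let layers = vec![
        ToyLayer { lsn: 30, keys: 0..100 }, // recent full image: visible, empties the shadow
        ToyLayer { lsn: 20, keys: 10..50 }, // older image underneath it: covered
        ToyLayer { lsn: 10, keys: 0..100 }, // oldest image: covered
    ];
    for (lsn, vis) in toy_visibility(layers, 0..100) {
        println!("image at lsn {lsn}: {vis:?}");
    }
}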
@@ -521,6 +521,10 @@ impl<Value: Clone> BufferedHistoricLayerCoverage<Value> {
 
         Ok(&self.historic_coverage)
     }
+
+    pub(crate) fn len(&self) -> usize {
+        self.layers.len()
+    }
 }
 
 #[test]
@@ -700,6 +700,10 @@ impl LayerAccessStats {
             },
         }
     }
+
+    pub(crate) fn set_visibility(&self, visibility: LayerVisibility) {
+        self.0.lock().unwrap().visibility = visibility;
+    }
 }
 
 /// Get a layer descriptor from a layer.
@@ -1817,6 +1817,10 @@ impl Timeline {
             self.compact_shard_ancestors(rewrite_max, ctx).await?;
         }
 
+        // TODO: be more selective: call this once at startup, and thereafter only when some
+        // branching changes or when image layers are generated.
+        self.update_layer_visibility(ctx).await?;
+
         Ok(())
     }
@@ -26,7 +26,7 @@ use utils::id::TimelineId;
 
 use crate::context::{AccessStatsBehavior, RequestContext, RequestContextBuilder};
 use crate::page_cache;
-use crate::tenant::storage_layer::{AsLayerDesc, PersistentLayerDesc};
+use crate::tenant::storage_layer::{AsLayerDesc, LayerVisibility, PersistentLayerDesc};
 use crate::tenant::timeline::{drop_rlock, Hole, ImageLayerCreationOutcome};
 use crate::tenant::timeline::{DeltaLayerWriter, ImageLayerWriter};
 use crate::tenant::timeline::{Layer, ResidentLayer};
@@ -347,6 +347,88 @@ impl Timeline {
         Ok(())
     }
+
+    /// A post-compaction step to update the LayerVisibility of layers covered by image layers. This
+    /// should also be called when new branches are created.
+    ///
+    /// Sweep through the layer map, identifying layers which are covered by image layers
+    /// such that they do not need to be available to service reads. The resulting LayerVisibility
+    /// may be used as an input to eviction and secondary downloads to de-prioritize layers
+    /// that we know won't be needed for reads.
+    pub(super) async fn update_layer_visibility(
+        &self,
+        ctx: &RequestContext,
+    ) -> Result<(), CompactionError> {
+        // Start with a keyspace representing all the keys we need to read from the tip of the branch
+        let head_lsn = self.get_last_record_lsn();
+        let (mut head_keyspace, sparse_ks) = self.collect_keyspace(head_lsn, ctx).await?;
+
+        // Converting the sparse part of the keyspace into the dense keyspace is safe in this context
+        // because we will never iterate through the keys.
+        head_keyspace.merge(&sparse_ks.0);
+
+        // We will sweep through layers in reverse-LSN order. We only do historic layers. L0 deltas
+        // are implicitly visible, because LayerVisibility's default is Visible, and we never modify it here.
+        let layer_manager = self.layers.read().await;
+        let layer_map = layer_manager.layer_map();
+
+        let mut visible_size: u64 = 0;
+
+        // FIXME: we only get accurate keyspaces from children if they've already run update_layer_visibility themselves. At startup all the timelines
+        // initialize this in arbitrary order (at the end of initial_logical_size_calculation). We should coordinate these. Perhaps at the very start
+        // of the tenant compaction task we should do all the timelines' layer visibility calculations in a leaf-first order?
+        let readable_points = {
+            let children = self.gc_info.read().unwrap().retain_lsns.clone();
+
+            let mut readable_points = Vec::with_capacity(children.len() + 1);
+            for (child_lsn, _child_timeline_id, child_keyspace) in &children {
+                let keyspace = match child_keyspace {
+                    Some(ks) => ks.clone(),
+                    None => {
+                        // The child has not posted information about which parts of the keyspace they depend on: presume they depend on all of it.
+                        let (mut keyspace, sparse_keyspace) =
+                            self.collect_keyspace(*child_lsn, ctx).await?;
+                        keyspace.merge(&sparse_keyspace.0);
+                        keyspace
+                    }
+                };
+                readable_points.push((*child_lsn, keyspace));
+            }
+            readable_points.push((head_lsn, head_keyspace));
+            readable_points
+        };
+
+        let (layer_visibility, shadow) = layer_map.get_visibility(readable_points);
+        for (layer_desc, visibility) in layer_visibility {
+            // FIXME: do a more efficient bulk zip() through the layers rather than N log N lookups fetching each one
+            let layer = layer_manager.get_from_desc(&layer_desc);
+            if matches!(visibility, LayerVisibility::Visible) {
+                visible_size += layer.metadata().file_size;
+            }
+
+            layer.access_stats().set_visibility(visibility);
+        }
+
+        if let Some(ancestor) = &self.ancestor_timeline {
+            // Having calculated the readable keyspace after walking back through all this timeline's layers, the resulting keyspace is the remaining
+            // keys for which reads may still fall through to the parent branch. Notify the parent branch of this, so that they may GC layers which
+            // do not overlap with this keyspace, and so that they may use this as an input to their own visibility updates.
+            ancestor
+                .gc_info
+                .write()
+                .unwrap()
+                .notify_child_keyspace(self.timeline_id, shadow);
+        }
+
+        // Also include in the visible size all the layers which we would never update visibility on
+        // TODO: getter that doesn't spuriously construct a Vec<>
+        for layer in layer_map.get_level0_deltas().unwrap() {
+            visible_size += layer.file_size;
+        }
+        self.metrics.visible_physical_size_gauge.set(visible_size);
+
+        Ok(())
+    }
+
     /// Collect a bunch of Level 0 layer files, and compact and reshuffle them
     /// as Level 1 files.
     async fn compact_level0(
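Per the doc comment on update_layer_visibility, the computed visibility is intended as an input for eviction and secondary-download prioritization. Below is a hypothetical sketch (all names invented; the pageserver's actual eviction policy is separate code, untouched by this diff) of how a consumer might order candidates so that Covered layers are considered before Visible ones:

use std::cmp::Reverse;

// Hypothetical, simplified types for illustration only.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
enum LayerVisibility {
    Visible,
    Covered, // derives Ord, so Covered > Visible
}

#[derive(Debug)]
struct Candidate {
    name: &'static str,
    visibility: LayerVisibility,
    last_access_ns: u64,
}

fn main() {
    let mut candidates = vec![
        Candidate { name: "delta-a", visibility: LayerVisibility::Visible, last_access_ns: 10 },
        Candidate { name: "image-b", visibility: LayerVisibility::Covered, last_access_ns: 99 },
        Candidate { name: "delta-c", visibility: LayerVisibility::Covered, last_access_ns: 5 },
    ];
    // Evict covered layers before visible ones; within each class, oldest access first.
    candidates.sort_by_key(|c| (Reverse(c.visibility), c.last_access_ns));
    for c in &candidates {
        println!("{c:?}"); // delta-c, image-b, delta-a
    }
}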