mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-08 14:02:55 +00:00
pageserver: prioritize secondary downloads to get most recent layers first, except l0s (#8729)
## Problem

When a secondary location is trying to catch up while a tenant is receiving new writes, it can become quite wasteful:
- Downloading L0s which are soon destroyed by compaction to L1s
- Downloading older layer files which are soon made irrelevant when covered by image layers.

## Summary of changes

Sort the layer files in the heatmap:
- L0 layers are the lowest priority
- Other layers are sorted to download the highest LSNs first.
This commit is contained in:
@@ -29,16 +29,16 @@ pub(super) struct HeatMapTenant {
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapTimeline {
|
||||
#[serde_as(as = "DisplayFromStr")]
|
||||
pub(super) timeline_id: TimelineId,
|
||||
pub(crate) timeline_id: TimelineId,
|
||||
|
||||
pub(super) layers: Vec<HeatMapLayer>,
|
||||
pub(crate) layers: Vec<HeatMapLayer>,
|
||||
}
|
||||
|
||||
#[serde_as]
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub(crate) struct HeatMapLayer {
|
||||
pub(super) name: LayerName,
|
||||
pub(super) metadata: LayerFileMetadata,
|
||||
pub(crate) name: LayerName,
|
||||
pub(crate) metadata: LayerFileMetadata,
|
||||
|
||||
#[serde_as(as = "TimestampSeconds<i64>")]
|
||||
pub(super) access_time: SystemTime,
|
||||
|
||||
@@ -2977,11 +2977,7 @@ impl Timeline {
|
||||
LayerVisibilityHint::Visible => {
|
||||
// Layer is visible to one or more read LSNs: eligible for inclusion in layer map
|
||||
let last_activity_ts = layer.latest_activity();
|
||||
Some(HeatMapLayer::new(
|
||||
layer.layer_desc().layer_name(),
|
||||
layer.metadata(),
|
||||
last_activity_ts,
|
||||
))
|
||||
Some((layer.layer_desc(), layer.metadata(), last_activity_ts))
|
||||
}
|
||||
LayerVisibilityHint::Covered => {
|
||||
// Layer is resident but unlikely to be read: not eligible for inclusion in heatmap.
|
||||
@@ -2990,7 +2986,23 @@ impl Timeline {
|
||||
}
|
||||
});
|
||||
|
||||
let layers = resident.collect();
|
||||
let mut layers = resident.collect::<Vec<_>>();
|
||||
|
||||
// Sort layers in order of which to download first. For a large set of layers to download, we
|
||||
// want to prioritize those layers which are most likely to still be in the resident set many minutes
|
||||
// or hours later:
|
||||
// - Download L0s last, because they churn the fastest: L0s on a fast-writing tenant might
|
||||
// only exist for a few minutes before being compacted into L1s.
|
||||
// - For L1 & image layers, download most recent LSNs first: the older the LSN, the sooner
|
||||
// the layer is likely to be covered by an image layer during compaction.
|
||||
layers.sort_by_key(|(desc, _meta, _atime)| {
|
||||
std::cmp::Reverse((!LayerMap::is_l0(&desc.key_range), desc.lsn_range.end))
|
||||
});
|
||||
|
||||
let layers = layers
|
||||
.into_iter()
|
||||
.map(|(desc, meta, atime)| HeatMapLayer::new(desc.layer_name(), meta, atime))
|
||||
.collect();
|
||||
|
||||
Some(HeatMapTimeline::new(self.timeline_id, layers))
|
||||
}
|
||||
@@ -4516,6 +4528,7 @@ impl DurationRecorder {
|
||||
/// the layer descriptor requires the user to provide the ranges, which should cover all
|
||||
/// keys specified in the `data` field.
|
||||
#[cfg(test)]
|
||||
#[derive(Clone)]
|
||||
pub struct DeltaLayerTestDesc {
|
||||
pub lsn_range: Range<Lsn>,
|
||||
pub key_range: Range<Key>,
|
||||
@@ -4545,6 +4558,13 @@ impl DeltaLayerTestDesc {
|
||||
data,
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns the `LayerName::Delta` name for this test descriptor, built from clones of its
/// `key_range` and `lsn_range` (the descriptor itself is left untouched).
pub(crate) fn layer_name(&self) -> LayerName {
|
||||
LayerName::Delta(super::storage_layer::DeltaLayerName {
|
||||
key_range: self.key_range.clone(),
|
||||
|
||||
lsn_range: self.lsn_range.clone(),
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
impl Timeline {
|
||||
@@ -5768,12 +5788,110 @@ fn is_send() {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use pageserver_api::key::Key;
|
||||
use utils::{id::TimelineId, lsn::Lsn};
|
||||
|
||||
use crate::tenant::{
|
||||
harness::TenantHarness, storage_layer::Layer, timeline::EvictionError, Timeline,
|
||||
use crate::{
|
||||
repository::Value,
|
||||
tenant::{
|
||||
harness::{test_img, TenantHarness},
|
||||
layer_map::LayerMap,
|
||||
storage_layer::{Layer, LayerName},
|
||||
timeline::{DeltaLayerTestDesc, EvictionError},
|
||||
Timeline,
|
||||
},
|
||||
};
|
||||
|
||||
/// Checks the ordering of layers in a generated heatmap: the L0 delta must be the last entry,
/// the covered delta must not appear at all, and the remaining (non-L0) layers must be listed
/// in non-increasing LSN order.
#[tokio::test]
|
||||
async fn test_heatmap_generation() {
|
||||
let harness = TenantHarness::create("heatmap_generation").await.unwrap();
|
||||
|
||||
// Delta over LSNs 0x10..0x20 whose single key is the same key written by the image layer
// at LSN 0x40 below; the assertions expect this layer to be absent from the heatmap.
let covered_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x20),
|
||||
vec![(
|
||||
Key::from_hex("620000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x11),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
// Same LSN range as above, but for a key that no image layer in this test writes.
let visible_delta = DeltaLayerTestDesc::new_with_inferred_key_range(
|
||||
Lsn(0x10)..Lsn(0x20),
|
||||
vec![(
|
||||
Key::from_hex("720000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x11),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
// Delta with an explicit key range running from the all-zero key to the all-F key; the
// assertions below treat this one as the L0 layer.
let l0_delta = DeltaLayerTestDesc::new(
|
||||
Lsn(0x20)..Lsn(0x30),
|
||||
Key::from_hex("000000000000000000000000000000000000").unwrap()
|
||||
..Key::from_hex("FFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF").unwrap(),
|
||||
vec![(
|
||||
Key::from_hex("720000000033333333444444445500000000").unwrap(),
|
||||
Lsn(0x25),
|
||||
Value::Image(test_img("foo")),
|
||||
)],
|
||||
);
|
||||
// Clones are handed to the timeline so the originals stay available for the name
// comparisons in the assertions.
let delta_layers = vec![
|
||||
covered_delta.clone(),
|
||||
visible_delta.clone(),
|
||||
l0_delta.clone(),
|
||||
];
|
||||
|
||||
// Image layer at LSN 0x40 holding the same key as `covered_delta`.
let image_layer = (
|
||||
Lsn(0x40),
|
||||
vec![(
|
||||
Key::from_hex("620000000033333333444444445500000000").unwrap(),
|
||||
test_img("bar"),
|
||||
)],
|
||||
);
|
||||
let image_layers = vec![image_layer];
|
||||
|
||||
let (tenant, ctx) = harness.load().await;
|
||||
let timeline = tenant
|
||||
.create_test_timeline_with_layers(
|
||||
TimelineId::generate(),
|
||||
Lsn(0x10),
|
||||
14,
|
||||
&ctx,
|
||||
delta_layers,
|
||||
image_layers,
|
||||
Lsn(0x100),
|
||||
)
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
// Layer visibility is an input to heatmap generation, so refresh it first
|
||||
timeline.update_layer_visibility().await.unwrap();
|
||||
|
||||
let heatmap = timeline
|
||||
.generate_heatmap()
|
||||
.await
|
||||
.expect("Infallible while timeline is not shut down");
|
||||
|
||||
assert_eq!(heatmap.timeline_id, timeline.timeline_id);
|
||||
|
||||
// L0 should come last
|
||||
assert_eq!(heatmap.layers.last().unwrap().name, l0_delta.layer_name());
|
||||
|
||||
// LSN of the previously seen non-L0 layer; starting at the maximum lets the first
// non-L0 layer pass the ordering check unconditionally.
let mut last_lsn = Lsn::MAX;
|
||||
for layer in heatmap.layers {
|
||||
// Covered layer should be omitted
|
||||
assert!(layer.name != covered_delta.layer_name());
|
||||
|
||||
// Ordering key: the end of the LSN range for a delta layer, the layer's LSN for an image.
let layer_lsn = match &layer.name {
|
||||
LayerName::Delta(d) => d.lsn_range.end,
|
||||
LayerName::Image(i) => i.lsn,
|
||||
};
|
||||
|
||||
// Apart from L0s, newest Layers should come first
|
||||
if !LayerMap::is_l0(layer.name.key_range()) {
|
||||
assert!(layer_lsn <= last_lsn);
|
||||
last_lsn = layer_lsn;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn two_layer_eviction_attempts_at_the_same_time() {
|
||||
let harness = TenantHarness::create("two_layer_eviction_attempts_at_the_same_time")
|
||||
|
||||
Reference in New Issue
Block a user