pageserver: evict covered layers earlier (#8679)

## Problem

When pageservers do compaction, they frequently create image layers that
make earlier layers un-needed for reads, but then keep those earlier
layers around for 24 hours waiting for time-based eviction to expire
them.

Now that we track layer visibility, we can use it as an input to
eviction, and avoid the 24 hour "disk bump" that happens around
pageserver restarts.

## Summary of changes

- During time-based eviction, if a layer is marked Covered, use the
eviction period as the threshold: i.e. these layers get to remain
resident for at least one iteration of the eviction loop, but then get
evicted. With current settings this means they get evicted after 1h
instead of 24h.
- During disk usage eviction, prioritize evicting covered layers above
all other layers.


Caveats:
- Using the period as the threshold for time-based eviction in this case
is a bit of a hack, but it avoids adding yet another configuration
property, and in any case the value of a new property would be somewhat
arbitrary: there's no "right" length of time to keep covered layers
around just in case.
- We had previously planned on removing time-based eviction: this change
would motivate us to keep it around, but we can still simplify the code
later to just do the eviction of covered layers, rather than applying a
TTL policy to all layers.
This commit is contained in:
John Spray
2024-08-14 12:10:15 +01:00
committed by GitHub
parent 485d76ac62
commit 19d69d515c
4 changed files with 62 additions and 18 deletions

View File

@@ -64,7 +64,7 @@ use crate::{
mgr::TenantManager,
remote_timeline_client::LayerFileMetadata,
secondary::SecondaryTenant,
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName},
storage_layer::{AsLayerDesc, EvictionError, Layer, LayerName, LayerVisibilityHint},
},
CancellableTask, DiskUsageEvictionTask,
};
@@ -114,7 +114,7 @@ fn default_highest_layer_count_loses_first() -> bool {
}
impl EvictionOrder {
fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
fn sort(&self, candidates: &mut [(EvictionPartition, EvictionCandidate)]) {
use EvictionOrder::*;
match self {
@@ -644,6 +644,7 @@ pub(crate) struct EvictionCandidate {
pub(crate) layer: EvictionLayer,
pub(crate) last_activity_ts: SystemTime,
pub(crate) relative_last_activity: finite_f32::FiniteF32,
pub(crate) visibility: LayerVisibilityHint,
}
impl std::fmt::Display for EvictionLayer {
@@ -685,14 +686,22 @@ impl std::fmt::Debug for EvictionCandidate {
}
/// Eviction priority class assigned to each eviction candidate.
///
/// The variant declaration order is significant: the derived `Ord` orders variants by
/// declaration, so `EvictNow < Above < Below`. `collect_eviction_candidates` checks this
/// with debug assertions before sorting the candidates, so do not reorder the variants.
#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
enum MinResidentSizePartition {
enum EvictionPartition {
// A layer that is un-wanted by the tenant: evict all these first, before considering
// any other layers. Assigned to layers whose visibility hint is `Covered`.
EvictNow,
// Above the minimum size threshold: this layer is a candidate for eviction.
Above,
// Below the minimum size threshold: this layer should only be evicted if all the
// tenants' layers above the minimum size threshold have already been considered.
Below,
}
enum EvictionCandidates {
Cancelled,
Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>),
Finished(Vec<(EvictionPartition, EvictionCandidate)>),
}
/// Gather the eviction candidates.
@@ -890,8 +899,10 @@ async fn collect_eviction_candidates(
max_layer_size
};
// Sort layers most-recently-used first, then partition by
// cumsum above/below min_resident_size.
// Sort layers most-recently-used first, then calculate [`EvictionPartition`] for each layer,
// where the inputs are:
// - whether the layer is visible
// - whether the layer is above/below the min_resident_size cutline
tenant_candidates
.sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
let mut cumsum: i128 = 0;
@@ -908,12 +919,23 @@ async fn collect_eviction_candidates(
candidate.relative_last_activity =
eviction_order.relative_last_activity(total, i);
let partition = if cumsum > min_resident_size as i128 {
MinResidentSizePartition::Above
} else {
MinResidentSizePartition::Below
let partition = match candidate.visibility {
LayerVisibilityHint::Covered => {
// Covered layers are evicted first
EvictionPartition::EvictNow
}
LayerVisibilityHint::Visible => {
cumsum += i128::from(candidate.layer.get_file_size());
if cumsum > min_resident_size as i128 {
EvictionPartition::Above
} else {
// The most recent layers below the min_resident_size threshold
// are the last to be evicted.
EvictionPartition::Below
}
}
};
cumsum += i128::from(candidate.layer.get_file_size());
(partition, candidate)
});
@@ -981,7 +1003,7 @@ async fn collect_eviction_candidates(
// Secondary locations' layers are always considered above the min resident size,
// i.e. secondary locations are permitted to be trimmed to zero layers if all
// the layers have sufficiently old access times.
MinResidentSizePartition::Above,
EvictionPartition::Above,
candidate,
)
});
@@ -1009,7 +1031,9 @@ async fn collect_eviction_candidates(
}
}
debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
debug_assert!(EvictionPartition::Above < EvictionPartition::Below,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
debug_assert!(EvictionPartition::EvictNow < EvictionPartition::Above,
"as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
eviction_order.sort(&mut candidates);
@@ -1022,7 +1046,7 @@ async fn collect_eviction_candidates(
///
/// Returns the amount of candidates selected, with the planned usage.
fn select_victims<U: Usage>(
candidates: &[(MinResidentSizePartition, EvictionCandidate)],
candidates: &[(EvictionPartition, EvictionCandidate)],
usage_pre: U,
) -> VictimSelection<U> {
let mut usage_when_switched = None;
@@ -1034,7 +1058,7 @@ fn select_victims<U: Usage>(
break;
}
if partition == &MinResidentSizePartition::Below && usage_when_switched.is_none() {
if partition == &EvictionPartition::Below && usage_when_switched.is_none() {
usage_when_switched = Some((usage_planned, i));
}

View File

@@ -22,7 +22,7 @@ use crate::{
FAILED_REMOTE_OP_RETRIES,
},
span::debug_assert_current_span_has_tenant_id,
storage_layer::{layer::local_layer_path, LayerName},
storage_layer::{layer::local_layer_path, LayerName, LayerVisibilityHint},
tasks::{warn_when_period_overrun, BackgroundLoopKind},
},
virtual_file::{on_fatal_io_error, MaybeFatalIo, VirtualFile},
@@ -296,6 +296,9 @@ impl SecondaryDetail {
}),
last_activity_ts: ods.access_time,
relative_last_activity: finite_f32::FiniteF32::ZERO,
// Secondary location layers are presumed visible, because Covered layers
// are excluded from the heatmap
visibility: LayerVisibilityHint::Visible,
}
}));

View File

@@ -5261,6 +5261,7 @@ impl Timeline {
layer: layer.to_owned().into(),
last_activity_ts,
relative_last_activity: finite_f32::FiniteF32::ZERO,
visibility: layer.visibility(),
}
})
.collect();

View File

@@ -30,7 +30,8 @@ use crate::{
pgdatadir_mapping::CollectKeySpaceError,
task_mgr::{self, TaskKind, BACKGROUND_RUNTIME},
tenant::{
tasks::BackgroundLoopKind, timeline::EvictionError, LogicalSizeCalculationCause, Tenant,
storage_layer::LayerVisibilityHint, tasks::BackgroundLoopKind, timeline::EvictionError,
LogicalSizeCalculationCause, Tenant,
},
};
@@ -241,7 +242,22 @@ impl Timeline {
}
};
no_activity_for > p.threshold
match layer.visibility() {
LayerVisibilityHint::Visible => {
// Usual case: a visible layer might be read any time, and we will keep it
// resident until it hits our configured TTL threshold.
no_activity_for > p.threshold
}
LayerVisibilityHint::Covered => {
// Covered layers: this is probably a layer that was recently covered by
// an image layer during compaction. We don't evict it immediately, but
// it doesn't stay resident for the full `threshold`: we just keep it
// for a shorter time in case
// - it is used for Timestamp->LSN lookups
// - a new branch is created in recent history which will read this layer
no_activity_for > p.period
}
}
})
.cloned()
.for_each(|layer| {