diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 6611338491..932413ed8e 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -289,6 +289,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( usage_pre: U, cancel: &CancellationToken, ) -> anyhow::Result> { + // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) let _g = state .mutex .try_lock() @@ -561,7 +562,13 @@ async fn collect_eviction_candidates( }; // collect layers from all timelines in this tenant + // + // If one of the timelines becomes `!is_active()` during the iteration, + // for example because we're shutting down, then `max_layer_size` can be too small. + // That's OK. This code only runs under a disk pressure situation, and being + // a little unfair to tenants during shutdown in such a situation is tolerable. let mut tenant_candidates = Vec::new(); + let mut max_layer_size = 0; for tl in tenant.list_timelines() { if !tl.is_active() { continue; @@ -573,12 +580,24 @@ async fn collect_eviction_candidates( .into_iter() .map(|layer_infos| (tl.clone(), layer_infos)), ); + max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); if cancel.is_cancelled() { return Ok(EvictionCandidates::Cancelled); } } + // `min_resident_size` defaults to maximum layer file size of the tenant. + // This ensures that each tenant can have at least one layer resident at a given time, + // ensuring forward progress for a single Timeline::get in that tenant. + // It's a questionable heuristic since there are many Timeline::get + // requests going on and multiple layers are needed, and, at least in Neon prod, + // the median layer file size is much smaller than the compaction target size. + // We could be better here, e.g., sum of all L0 layers + most recent L1 layer. + // That's what's typically used by the various background loops. + // + // The default can be overriden with a fixed value in the tenant conf. + // A default override can be put in the default tenant conf in the pageserver.toml. let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { info!( "using overridden min resident size {} for tenant {}", @@ -587,18 +606,12 @@ async fn collect_eviction_candidates( ); s } else { - // By default, use the size of the largest resident layer - let s = tenant_candidates - .iter() - .map(|(_, layer_info)| layer_info.file_size()) - .max() - .unwrap_or(0); info!( "using max layer size {} for tenant {}", - s, + max_layer_size, tenant.tenant_id() ); - s + max_layer_size }; // Sort layers most-recently-used first diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b03d64712a..91efde8f22 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4038,6 +4038,8 @@ impl Timeline { } pub struct DiskUsageEvictionInfo { + /// Timeline's largest layer (remote or resident) + pub max_layer_size: Option, /// Timeline's resident layers pub resident_layers: Vec, } @@ -4070,9 +4072,13 @@ impl Timeline { pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let layers = self.layers.read().unwrap(); + let mut max_layer_size: Option = None; let mut resident_layers = Vec::new(); for l in layers.iter_historic_layers() { + let file_size = l.file_size(); + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + if l.is_remote_layer() { continue; } @@ -4085,7 +4091,10 @@ impl Timeline { }); } - DiskUsageEvictionInfo { resident_layers } + DiskUsageEvictionInfo { + max_layer_size, + resident_layers, + } } }