diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 1f0525b045..d5f5a20683 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -623,6 +623,7 @@ impl std::fmt::Display for EvictionLayer { } } +#[derive(Default)] pub(crate) struct DiskUsageEvictionInfo { /// Timeline's largest layer (remote or resident) pub max_layer_size: Option, @@ -854,19 +855,27 @@ async fn collect_eviction_candidates( let total = tenant_candidates.len(); - for (i, mut candidate) in tenant_candidates.into_iter().enumerate() { - // as we iterate this reverse sorted list, the most recently accessed layer will always - // be 1.0; this is for us to evict it last. - candidate.relative_last_activity = eviction_order.relative_last_activity(total, i); + let tenant_candidates = + tenant_candidates + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + // as we iterate this reverse sorted list, the most recently accessed layer will always + // be 1.0; this is for us to evict it last. + candidate.relative_last_activity = + eviction_order.relative_last_activity(total, i); - let partition = if cumsum > min_resident_size as i128 { - MinResidentSizePartition::Above - } else { - MinResidentSizePartition::Below - }; - cumsum += i128::from(candidate.layer.get_file_size()); - candidates.push((partition, candidate)); - } + let partition = if cumsum > min_resident_size as i128 { + MinResidentSizePartition::Above + } else { + MinResidentSizePartition::Below + }; + cumsum += i128::from(candidate.layer.get_file_size()); + + (partition, candidate) + }); + + candidates.extend(tenant_candidates); } // Note: the same tenant ID might be hit twice, if it transitions from attached to @@ -882,21 +891,41 @@ async fn collect_eviction_candidates( ); for secondary_tenant in secondary_tenants { - let mut layer_info = secondary_tenant.get_layers_for_eviction(); + // for secondary tenants we use a sum of on_disk layers and already evicted layers. this is + // to prevent repeated disk usage based evictions from completely draining less often + // updating secondaries. + let (mut layer_info, total_layers) = secondary_tenant.get_layers_for_eviction(); + + debug_assert!( + total_layers >= layer_info.resident_layers.len(), + "total_layers ({total_layers}) must be at least the resident_layers.len() ({})", + layer_info.resident_layers.len() + ); layer_info .resident_layers .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts)); - candidates.extend(layer_info.resident_layers.into_iter().map(|candidate| { - ( - // Secondary locations' layers are always considered above the min resident size, - // i.e. secondary locations are permitted to be trimmed to zero layers if all - // the layers have sufficiently old access times. - MinResidentSizePartition::Above, - candidate, - ) - })); + let tenant_candidates = + layer_info + .resident_layers + .into_iter() + .enumerate() + .map(|(i, mut candidate)| { + candidate.relative_last_activity = + eviction_order.relative_last_activity(total_layers, i); + ( + // Secondary locations' layers are always considered above the min resident size, + // i.e. secondary locations are permitted to be trimmed to zero layers if all + // the layers have sufficiently old access times. + MinResidentSizePartition::Above, + candidate, + ) + }); + + candidates.extend(tenant_candidates); + + tokio::task::yield_now().await; } debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, diff --git a/pageserver/src/tenant/secondary.rs b/pageserver/src/tenant/secondary.rs index 4269e1dec1..926cd0302b 100644 --- a/pageserver/src/tenant/secondary.rs +++ b/pageserver/src/tenant/secondary.rs @@ -160,7 +160,7 @@ impl SecondaryTenant { &self.tenant_shard_id } - pub(crate) fn get_layers_for_eviction(self: &Arc) -> DiskUsageEvictionInfo { + pub(crate) fn get_layers_for_eviction(self: &Arc) -> (DiskUsageEvictionInfo, usize) { self.detail.lock().unwrap().get_layers_for_eviction(self) } diff --git a/pageserver/src/tenant/secondary/downloader.rs b/pageserver/src/tenant/secondary/downloader.rs index 55af4f9f2b..9330edf946 100644 --- a/pageserver/src/tenant/secondary/downloader.rs +++ b/pageserver/src/tenant/secondary/downloader.rs @@ -146,14 +146,15 @@ impl SecondaryDetail { } } + /// Additionally returns the total number of layers, used for more stable relative access time + /// based eviction. pub(super) fn get_layers_for_eviction( &self, parent: &Arc, - ) -> DiskUsageEvictionInfo { - let mut result = DiskUsageEvictionInfo { - max_layer_size: None, - resident_layers: Vec::new(), - }; + ) -> (DiskUsageEvictionInfo, usize) { + let mut result = DiskUsageEvictionInfo::default(); + let mut total_layers = 0; + for (timeline_id, timeline_detail) in &self.timelines { result .resident_layers @@ -169,6 +170,10 @@ impl SecondaryDetail { relative_last_activity: finite_f32::FiniteF32::ZERO, } })); + + // total might be missing currently downloading layers, but as a lower than actual + // value it is good enough approximation. + total_layers += timeline_detail.on_disk_layers.len() + timeline_detail.evicted_at.len(); } result.max_layer_size = result .resident_layers @@ -183,7 +188,7 @@ impl SecondaryDetail { result.resident_layers.len() ); - result + (result, total_layers) } } @@ -312,9 +317,7 @@ impl JobGenerator