From 1e9a50bca8ee5887998d59c80a91516508985797 Mon Sep 17 00:00:00 2001
From: Joonas Koivunen
Date: Mon, 29 Jan 2024 10:38:40 +0200
Subject: [PATCH] disk_usage_eviction_task: cleanup summaries (#6490)

This is the "partial revert" of #6384. The summaries turned out to be
expensive due to naive vec usage, but also inconclusive because of the
additional context required.

In addition to removing the summary traces, some small refactoring is done.
---
 pageserver/src/disk_usage_eviction_task.rs | 352 +++++++--------------
 1 file changed, 120 insertions(+), 232 deletions(-)

diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 800e52bb51..1f0525b045 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -97,23 +97,86 @@ pub enum EvictionOrder {
 
     /// Order the layers to be evicted by how recently they have been accessed relatively within
     /// the set of resident layers of a tenant.
-    ///
-    /// This strategy will evict layers more fairly but is untested.
     RelativeAccessed {
-        #[serde(default)]
+        /// Determines if the tenant with the most layers should lose first.
+        ///
+        /// Having this enabled is currently the only reasonable option, because the order in which
+        /// we read tenants is deterministic. If we find the need to use this as `false`, we need
+        /// to ensure nondeterminism by adding in a random number to break the
+        /// `relative_last_activity==0.0` ties.
+        #[serde(default = "default_highest_layer_count_loses_first")]
        highest_layer_count_loses_first: bool,
     },
 }
 
+fn default_highest_layer_count_loses_first() -> bool {
+    true
+}
+
 impl EvictionOrder {
-    /// Return true, if with [`Self::RelativeAccessed`] order the tenants with the highest layer
-    /// counts should be the first ones to have their layers evicted.
-    fn highest_layer_count_loses_first(&self) -> bool {
+    fn sort(&self, candidates: &mut [(MinResidentSizePartition, EvictionCandidate)]) {
+        use EvictionOrder::*;
+
         match self {
-            EvictionOrder::AbsoluteAccessed => false,
-            EvictionOrder::RelativeAccessed {
+            AbsoluteAccessed => {
+                candidates.sort_unstable_by_key(|(partition, candidate)| {
+                    (*partition, candidate.last_activity_ts)
+                });
+            }
+            RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.relative_last_activity)
+            }),
+        }
+    }
+
+    /// Called to fill in the [`EvictionCandidate::relative_last_activity`] while iterating a tenant's
+    /// layers in **most** recently used order.
+    fn relative_last_activity(&self, total: usize, index: usize) -> finite_f32::FiniteF32 {
+        use EvictionOrder::*;
+
+        match self {
+            AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
+            RelativeAccessed {
                 highest_layer_count_loses_first,
-            } => *highest_layer_count_loses_first,
+            } => {
+                // keeping the -1 or not decides if every tenant should lose their least recently accessed
+                // layer OR if this should happen in the order of having highest layer count:
+                let fudge = if *highest_layer_count_loses_first {
+                    // relative_last_activity vs. tenant layer count:
+                    // - 0.1..=1.0 (10 layers)
+                    // - 0.01..=1.0 (100 layers)
+                    // - 0.001..=1.0 (1000 layers)
+                    //
+                    // leading to evicting less of the smallest tenants.
+                    0
+                } else {
+                    // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
+                    // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
+                    // be that less than 10k layer evictions is enough, so we would not need to evict from
+                    // all tenants.
+                    //
+                    // as the tenant ordering is now deterministic this could hit the same tenants
+                    // disproportionately on multiple invocations. an alternative could be to remember how many
+                    // layers we evicted last time from this tenant, and inject that as an additional
+                    // fudge here.
+                    1
+                };
+
+                let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
+                let divider = total as f32;
+
+                // most recently used is always (total - 0) / divider == 1.0
+                // least recently used depends on the fudge:
+                // - (total - 1) - (total - 1) / total => 0 / total
+                // - total - (total - 1) / total => 1 / total
+                let distance = (total - index) as f32;
+
+                finite_f32::FiniteF32::try_from_normalized(distance / divider)
+                    .unwrap_or_else(|val| {
+                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={index}, total={total}: {val}");
+                        finite_f32::FiniteF32::ZERO
+                    })
+            }
         }
     }
 }
@@ -389,52 +452,6 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl(
 
     let selection = select_victims(&candidates, usage_pre);
 
-    let mut candidates = candidates;
-
-    let selection = if matches!(eviction_order, EvictionOrder::RelativeAccessed { .. }) {
-        // we currently have the layers ordered by AbsoluteAccessed so that we can get the summary
-        // for comparison here. this is a temporary measure to develop alternatives.
-        use std::fmt::Write;
-
-        let mut summary_buf = String::with_capacity(256);
-
-        {
-            let absolute_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{absolute_summary}").expect("string grows");
-
-            info!("absolute accessed selection summary: {summary_buf}");
-        }
-
-        candidates.sort_unstable_by_key(|(partition, candidate)| {
-            (*partition, candidate.relative_last_activity)
-        });
-
-        let selection = select_victims(&candidates, usage_pre);
-
-        {
-            summary_buf.clear();
-
-            let relative_summary = candidates
-                .iter()
-                .take(selection.amount)
-                .map(|(_, candidate)| candidate)
-                .collect::<summary::EvictionSummary>();
-
-            write!(summary_buf, "{relative_summary}").expect("string grows");
-
-            info!("relative accessed selection summary: {summary_buf}");
-        }
-
-        selection
-    } else {
-        selection
-    };
-
     let (evicted_amount, usage_planned) = selection.into_amount_and_planned();
 
     // phase2: evict layers
@@ -835,54 +852,12 @@ async fn collect_eviction_candidates(
             .sort_unstable_by_key(|layer_info| std::cmp::Reverse(layer_info.last_activity_ts));
 
         let mut cumsum: i128 = 0;
-        // keeping the -1 or not decides if every tenant should lose their least recently accessed
-        // layer OR if this should happen in the order of having highest layer count:
-        let fudge = if eviction_order.highest_layer_count_loses_first() {
-            // relative_age vs. tenant layer count:
-            // - 0.1..=1.0 (10 layers)
-            // - 0.01..=1.0 (100 layers)
-            // - 0.001..=1.0 (1000 layers)
-            //
-            // leading to evicting less of the smallest tenants.
-            0
-        } else {
-            // use full 0.0..=1.0 range, which means even the smallest tenants could always lose a
-            // layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
-            // be that less than 10k layer evictions is enough, so we would not need to evict from
-            // all tenants.
-            //
-            // as the tenant ordering is now deterministic this could hit the same tenants
-            // disproportionetly on multiple invocations. alternative could be to remember how many
-            // layers did we evict last time from this tenant, and inject that as an additional
-            // fudge here.
-            1
-        };
-
-        let total = tenant_candidates
-            .len()
-            .checked_sub(fudge)
-            .filter(|&x| x > 0)
-            // support 0 or 1 resident layer tenants as well
-            .unwrap_or(1);
-        let divider = total as f32;
+        let total = tenant_candidates.len();
 
         for (i, mut candidate) in tenant_candidates.into_iter().enumerate() {
             // as we iterate this reverse sorted list, the most recently accessed layer will always
             // be 1.0; this is for us to evict it last.
-            candidate.relative_last_activity = if matches!(
-                eviction_order,
-                EvictionOrder::RelativeAccessed { .. }
-            ) {
-                // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
-                // similarly for u16. unsure how it would help.
-                finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
-                    .unwrap_or_else(|val| {
-                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
-                        finite_f32::FiniteF32::ZERO
-                    })
-            } else {
-                finite_f32::FiniteF32::ZERO
-            };
+            candidate.relative_last_activity = eviction_order.relative_last_activity(total, i);
 
             let partition = if cumsum > min_resident_size as i128 {
                 MinResidentSizePartition::Above
@@ -927,10 +902,7 @@ async fn collect_eviction_candidates(
     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
 
-    // always behave as if AbsoluteAccessed was selected. if RelativeAccessed is in use, we
-    // will sort later by candidate.relative_last_activity to get compare evictions.
-    candidates
-        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
+    eviction_order.sort(&mut candidates);
 
     Ok(EvictionCandidates::Finished(candidates))
 }
@@ -1070,6 +1042,12 @@ pub(crate) mod finite_f32 {
         }
     }
 
+    impl From<FiniteF32> for f32 {
+        fn from(value: FiniteF32) -> f32 {
+            value.0
+        }
+    }
+
     impl FiniteF32 {
         pub const ZERO: FiniteF32 = FiniteF32(0.0);
 
@@ -1082,136 +1060,9 @@ pub(crate) mod finite_f32 {
                 Err(value)
             }
         }
-    }
-}
-
-mod summary {
-    use super::finite_f32::FiniteF32;
-    use super::{EvictionCandidate, LayerCount};
-    use pageserver_api::shard::TenantShardId;
-    use std::collections::{BTreeMap, HashMap};
-    use std::time::SystemTime;
-
-    #[derive(Debug, Default)]
-    pub(super) struct EvictionSummary {
-        evicted_per_tenant: HashMap<TenantShardId, LayerCount>,
-        total: LayerCount,
-
-        last_absolute: Option<SystemTime>,
-        last_relative: Option<FiniteF32>,
-    }
-
-    impl<'a> FromIterator<&'a EvictionCandidate> for EvictionSummary {
-        fn from_iter<T: IntoIterator<Item = &'a EvictionCandidate>>(iter: T) -> Self {
-            let mut summary = EvictionSummary::default();
-            for item in iter {
-                let counts = summary
-                    .evicted_per_tenant
-                    .entry(*item.layer.get_tenant_shard_id())
-                    .or_default();
-
-                let sz = item.layer.get_file_size();
-
-                counts.file_sizes += sz;
-                counts.count += 1;
-
-                summary.total.file_sizes += sz;
-                summary.total.count += 1;
-
-                summary.last_absolute = Some(item.last_activity_ts);
-                summary.last_relative = Some(item.relative_last_activity);
-            }
-
-            summary
-        }
-    }
-
-    struct SiBytesAmount(u64);
-
-    impl std::fmt::Display for SiBytesAmount {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            if self.0 < 1024 {
-                return write!(f, "{}B", self.0);
-            }
-
-            let mut tmp = self.0;
-            let mut ch = 0;
-            let suffixes = b"KMGTPE";
-
-            while tmp > 1024 * 1024 && ch < suffixes.len() - 1 {
-                tmp /= 1024;
-                ch += 1;
-            }
-
-            let ch = suffixes[ch] as char;
-
-            write!(f, "{:.1}{ch}iB", tmp as f64 / 1024.0)
-        }
-    }
-
-    impl std::fmt::Display for EvictionSummary {
-        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
-            // wasteful, but it's for testing
-
-            let mut sorted: BTreeMap<usize, Vec<(TenantShardId, u64)>> = BTreeMap::new();
-
-            for (tenant_shard_id, count) in &self.evicted_per_tenant {
-                sorted
-                    .entry(count.count)
-                    .or_default()
-                    .push((*tenant_shard_id, count.file_sizes));
-            }
-
-            let total_file_sizes = SiBytesAmount(self.total.file_sizes);
-
-            writeln!(
-                f,
-                "selected {} layers of {total_file_sizes} up to ({:?}, {:.2?}):",
-                self.total.count, self.last_absolute, self.last_relative,
-            )?;
-
-            for (count, per_tenant) in sorted.iter().rev().take(10) {
-                write!(f, "- {count} layers: ")?;
-
-                if per_tenant.len() < 3 {
-                    for (i, (tenant_shard_id, bytes)) in per_tenant.iter().enumerate() {
-                        if i > 0 {
-                            write!(f, ", ")?;
-                        }
-                        let bytes = SiBytesAmount(*bytes);
-                        write!(f, "{tenant_shard_id} ({bytes})")?;
-                    }
-                } else {
-                    let num_tenants = per_tenant.len();
-                    let total_bytes = per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>();
-                    let total_bytes = SiBytesAmount(total_bytes);
-                    let layers = num_tenants * count;
-
-                    write!(
-                        f,
-                        "{num_tenants} tenants {total_bytes} in total {layers} layers",
-                    )?;
-                }
-
-                writeln!(f)?;
-            }
-
-            if sorted.len() > 10 {
-                let (rem_count, rem_bytes) = sorted
-                    .iter()
-                    .rev()
-                    .map(|(count, per_tenant)| {
-                        (
-                            count,
-                            per_tenant.iter().map(|(_id, bytes)| bytes).sum::<u64>(),
-                        )
-                    })
-                    .fold((0, 0), |acc, next| (acc.0 + next.0, acc.1 + next.1));
-                let rem_bytes = SiBytesAmount(rem_bytes);
-                writeln!(f, "- rest of tenants ({}) not shown ({rem_count} layers or {:.1}%, {rem_bytes} or {:.1}% bytes)", sorted.len() - 10, 100.0 * rem_count as f64 / self.total.count as f64, 100.0 * rem_bytes.0 as f64 / self.total.file_sizes as f64)?;
-            }
-
-            Ok(())
+
+        pub fn into_inner(self) -> f32 {
+            self.into()
         }
     }
 }
@@ -1336,3 +1187,40 @@ mod filesystem_level_usage {
         assert!(!usage.has_pressure());
     }
 }
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn relative_equal_bounds() {
+        let order = EvictionOrder::RelativeAccessed {
+            highest_layer_count_loses_first: false,
+        };
+
+        let len = 10;
+        let v = (0..len)
+            .map(|i| order.relative_last_activity(len, i).into_inner())
+            .collect::<Vec<_>>();
+
+        assert_eq!(v.first(), Some(&1.0));
+        assert_eq!(v.last(), Some(&0.0));
+        assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
+    }
+
+    #[test]
+    fn relative_spare_bounds() {
+        let order = EvictionOrder::RelativeAccessed {
+            highest_layer_count_loses_first: true,
+        };
+
+        let len = 10;
+        let v = (0..len)
+            .map(|i| order.relative_last_activity(len, i).into_inner())
+            .collect::<Vec<_>>();
+
+        assert_eq!(v.first(), Some(&1.0));
+        assert_eq!(v.last(), Some(&0.1));
+        assert!(v.windows(2).all(|slice| slice[0] > slice[1]));
+    }
+}
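
For reference, the fudge arithmetic introduced in `EvictionOrder::relative_last_activity` above works out as follows. This is a standalone sketch using a hypothetical free function that mirrors the patched formula (it is not code from the patch); `fudge = 0` corresponds to `highest_layer_count_loses_first: true` and `fudge = 1` to `false`, matching the bounds asserted by the `relative_spare_bounds` and `relative_equal_bounds` tests.

fn relative_last_activity(total: usize, index: usize, fudge: usize) -> f32 {
    // same divider shrinking as the patched code: fudge = 1 maps every tenant's
    // least recently used layer to 0.0, fudge = 0 keeps it at 1/total.
    let total = total.checked_sub(fudge).filter(|&x| x > 1).unwrap_or(1);
    let divider = total as f32;
    let distance = (total - index) as f32;
    distance / divider
}

fn main() {
    // highest_layer_count_loses_first = true (fudge = 0): 1.0, 0.9, ..., 0.1
    let spare: Vec<f32> = (0..10).map(|i| relative_last_activity(10, i, 0)).collect();
    // highest_layer_count_loses_first = false (fudge = 1): 1.0, ~0.89, ..., 0.0
    let equal: Vec<f32> = (0..10).map(|i| relative_last_activity(10, i, 1)).collect();
    println!("{spare:?}");
    println!("{equal:?}");
}

With fudge = 0 a 10-layer tenant never drops below 0.1, so the tails of larger tenants sort ahead of it and are evicted first; with fudge = 1 every tenant's least recently used layer ties at 0.0.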