From b6b8265450d6a7f369e8c658033510ba2f212b0a Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 28 Mar 2023 01:13:22 +0300 Subject: [PATCH 01/22] Rewrite to make the algorithm more understandable (I hope). The algorithm is the same (with two small exceptions), but rewrite the way it's implemented to make it easier to follow. The exceptions: 1. 'min_resident_size' now protects at least that much data in the first "respectful" phase of the algorithm. Previously, it would evict layers until the resident size fell below min_resident_size. In other words, we know protect one more layer of each tenant, so that the resident size stays just above min_resident_size, while previously we would evict enough to bring the resident size just under min_resident_size. 2. Previously, the "max layer size" that's used as the default min_resident_size was calculated from *all* layers in the tenant, including remote layers. Now it's only calculated across all locally-present layers. I don't know if that was a deliberate choice, but this is slightly simpler. --- libs/utils/src/approx_accurate.rs | 100 ------ libs/utils/src/lib.rs | 1 - pageserver/src/disk_usage_eviction_task.rs | 356 ++++++++++----------- pageserver/src/tenant/timeline.rs | 7 - 4 files changed, 173 insertions(+), 291 deletions(-) delete mode 100644 libs/utils/src/approx_accurate.rs diff --git a/libs/utils/src/approx_accurate.rs b/libs/utils/src/approx_accurate.rs deleted file mode 100644 index e7af36eec4..0000000000 --- a/libs/utils/src/approx_accurate.rs +++ /dev/null @@ -1,100 +0,0 @@ -/// Three-state `max` accumulator. -/// -/// If it accumulates over 0 or many `Some(T)` values, it is `Accurate` maximum of those values. -/// If a single `None` value is merged, it becomes `Approximate` variant. -/// -/// Remove when `Layer::file_size` is no longer an `Option`. -#[derive(Default, Debug, Clone, Copy)] -pub enum ApproxAccurate { - Approximate(T), - Accurate(T), - #[default] - Empty, -} - -impl ApproxAccurate { - /// `max(a, b)` where the approximate is inflicted receiving a `None`, or infected onwards. - #[must_use] - pub fn max(self, next: Option) -> ApproxAccurate { - use ApproxAccurate::*; - match (self, next) { - (Accurate(a) | Approximate(a), None) => Approximate(a), - (Empty, None) => Approximate(T::default()), - (Accurate(a), Some(b)) => Accurate(a.max(b)), - (Approximate(a), Some(b)) => Approximate(a.max(b)), - (Empty, Some(b)) => Accurate(b), - } - } - - pub fn is_approximate(&self) -> bool { - matches!(self, ApproxAccurate::Approximate(_)) - } - - pub fn accurate(self) -> Option { - use ApproxAccurate::*; - match self { - Accurate(a) => Some(a), - Empty => Some(T::default()), - Approximate(_) => None, - } - } - - pub fn unwrap_accurate_or(self, default: T) -> T { - use ApproxAccurate::*; - match self { - Accurate(a) => a, - Approximate(_) => default, - // Empty is still accurate, just special case for above `max` - Empty => T::default(), - } - } -} - -#[cfg(test)] -mod tests { - use super::ApproxAccurate; - - #[test] - fn accumulate_only_some() { - let acc = (0..=5) - .into_iter() - .map(Some) - .fold(ApproxAccurate::default(), |acc, next| acc.max(next)); - - assert_eq!(acc.accurate(), Some(5)); - assert!(!acc.is_approximate()); - assert_eq!(acc.unwrap_accurate_or(42), 5); - } - - #[test] - fn accumulate_some_and_none() { - let acc = [Some(0), None, Some(2)] - .into_iter() - .fold(ApproxAccurate::default(), |acc, next| acc.max(next)); - - assert_eq!(acc.accurate(), None); - assert!(acc.is_approximate()); - assert_eq!(acc.unwrap_accurate_or(42), 42); - } - - #[test] - fn accumulate_none_and_some() { - let acc = [None, Some(1), None] - .into_iter() - .fold(ApproxAccurate::default(), |acc, next| acc.max(next)); - - assert_eq!(acc.accurate(), None); - assert!(acc.is_approximate()); - assert_eq!(acc.unwrap_accurate_or(42), 42); - } - - #[test] - fn accumulate_none() { - let acc = ApproxAccurate::::default(); - - // it is accurate empty - assert_eq!(acc.accurate(), Some(0)); - assert!(!acc.is_approximate()); - assert_eq!(acc.unwrap_accurate_or(42), 0); - } -} diff --git a/libs/utils/src/lib.rs b/libs/utils/src/lib.rs index 6435b18d4b..8f572f0ec3 100644 --- a/libs/utils/src/lib.rs +++ b/libs/utils/src/lib.rs @@ -33,7 +33,6 @@ pub mod pid_file; // Misc pub mod accum; -pub mod approx_accurate; pub mod shutdown; // Utility for binding TcpListeners with proper socket options. diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index a7ab5b8db7..1ceb897e7c 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -41,9 +41,8 @@ use std::{ collections::HashMap, - ops::ControlFlow, - sync::{Arc, Mutex}, - time::Duration, + sync::Arc, + time::{Duration, SystemTime}, }; use anyhow::Context; @@ -54,12 +53,12 @@ use sync_wrapper::SyncWrapper; use tokio::time::Instant; use tokio_util::sync::CancellationToken; use tracing::{debug, error, info, instrument, warn, Instrument}; -use utils::{approx_accurate::ApproxAccurate, id::TenantId, serde_percent::Percent}; +use utils::serde_percent::Percent; use crate::{ config::PageServerConf, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - tenant::{self, LocalLayerInfoForDiskUsageEviction, Timeline}, + tenant::{self, Timeline, storage_layer::PersistentLayer}, }; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -284,13 +283,6 @@ pub async fn disk_usage_eviction_task_iteration_impl( .try_lock() .map_err(|_| anyhow::anyhow!("iteration is already executing"))?; - // planned post-eviction usage - let mut usage_planned_min_resident_size_respecting = usage_pre; - let mut usage_planned_global_lru = None; - // achieved post-eviction usage according to internal accounting - let mut usage_assumed = usage_pre; - // actual usage read after batched evictions - debug!(?usage_pre, "disk usage"); if !usage_pre.has_pressure() { @@ -302,42 +294,42 @@ pub async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - let mut lru_candidates: Vec<(_, LocalLayerInfoForDiskUsageEviction)> = Vec::new(); - - // get a snapshot of the list of tenants - let tenants = tenant::mgr::list_tenants() - .await - .context("get list of tenants")?; - - { - let mut tmp = Vec::new(); - for (tenant_id, _state) in &tenants { - let flow = extend_lru_candidates( - Mode::RespectTenantMinResidentSize, - *tenant_id, - &mut lru_candidates, - &mut tmp, - cancel, - ) - .await; - - if let ControlFlow::Break(()) = flow { - return Ok(IterationOutcome::Cancelled); - } - - assert!(tmp.is_empty(), "tmp has to be fully drained each iteration"); - } - } - + // Collect list of all layers in the system, sorted in the order that they should + // be evicted. + let all_candidates = collect_eviction_candidates(cancel).await?; if cancel.is_cancelled() { return Ok(IterationOutcome::Cancelled); } + // XXX: Print the whole list, for debbugging + let now: SystemTime = SystemTime::now(); + for (i, candidate) in all_candidates.iter().enumerate() { + debug!("cand {}/{}: {}, size {}, at {}, overage {}", + i, + all_candidates.len(), + candidate.layer.local_path().unwrap().display(), + candidate.layer.file_size(), + now.duration_since(candidate.last_activity_ts).unwrap().as_micros(), + candidate.tenant_resident_size_overage + ); + } + // phase1: select victims to relieve pressure - lru_candidates.sort_unstable_by_key(|(_, layer)| layer.last_activity_ts); - let mut batched: HashMap<_, Vec> = HashMap::new(); - for (i, (timeline, layer)) in lru_candidates.into_iter().enumerate() { - if !usage_planned_min_resident_size_respecting.has_pressure() { + // + // Walk through the list of candidates, until we have accumulated enough layers to get + // us back under the pressure threshold. 'usage_planned' is updated so that it tracks + // how much disk space would be used after evicting all the layers up to the current + // point in the list. The layers are collected in 'batched', grouped per timeline. + // + // If we get far enough in the list that we start to evict layers that are below + // the tenant's min-resident-size threshold, print a warning, and memorize the disk + // usage at that point, in 'usage_planned_min_resident_size_respecting'. + let mut batched: HashMap<_, Vec>> = HashMap::new(); + let mut min_resident_size_violated = false; + let mut usage_planned = usage_pre; + let mut usage_planned_min_resident_size_respecting = None; + for (i, candidate) in all_candidates.into_iter().enumerate() { + if !usage_planned.has_pressure() { debug!( no_candidates_evicted = i, "took enough candidates for pressure to be relieved" @@ -345,66 +337,46 @@ pub async fn disk_usage_eviction_task_iteration_impl( break; } - usage_planned_min_resident_size_respecting.add_available_bytes(layer.file_size()); + if !min_resident_size_violated { + if candidate.tenant_resident_size_overage < 0 { + warn!(?usage_pre, ?usage_planned, "tenant_min_resident_size-respecting LRU would not relieve pressure, falling back to global LRU at {}", i); + min_resident_size_violated = true; + usage_planned_min_resident_size_respecting = Some(usage_planned.clone()); + } + } else { + // all layers with overage >= 0 should come first + assert!(candidate.tenant_resident_size_overage < 0); + } + usage_planned.add_available_bytes(candidate.layer.file_size()); batched - .entry(TimelineKey(timeline.clone())) + .entry(TimelineKey(candidate.timeline.clone())) .or_default() - .push(layer); + .push(candidate.layer); } - // If we can't relieve pressure while respecting tenant_min_resident_size, fall back to global LRU. - if usage_planned_min_resident_size_respecting.has_pressure() { - // NB: tests depend on parts of this log message - warn!(?usage_pre, ?usage_planned_min_resident_size_respecting, "tenant_min_resident_size-respecting LRU would not relieve pressure, falling back to global LRU"); - batched.clear(); - let mut usage_planned = usage_pre; - let mut global_lru_candidates = Vec::new(); - let mut tmp = Vec::new(); - for (tenant_id, _state) in &tenants { - let flow = extend_lru_candidates( - Mode::GlobalLru, - *tenant_id, - &mut global_lru_candidates, - &mut tmp, - cancel, - ) - .await; - if let ControlFlow::Break(()) = flow { - return Ok(IterationOutcome::Cancelled); - } - - assert!(tmp.is_empty(), "tmp has to be fully drained each iteration"); + let usage_planned = if min_resident_size_violated { + PlannedUsage { + respecting_tenant_min_resident_size: usage_planned_min_resident_size_respecting.unwrap(), + fallback_to_global_lru: Some(usage_planned), } - global_lru_candidates.sort_unstable_by_key(|(_, layer)| layer.last_activity_ts); - for (timeline, layer) in global_lru_candidates { - usage_planned.add_available_bytes(layer.file_size()); - batched - .entry(TimelineKey(timeline.clone())) - .or_default() - .push(layer); - if cancel.is_cancelled() { - return Ok(IterationOutcome::Cancelled); - } + } else { + PlannedUsage { + respecting_tenant_min_resident_size: usage_planned, + fallback_to_global_lru: None, } - usage_planned_global_lru = Some(usage_planned); - } - let usage_planned = PlannedUsage { - respecting_tenant_min_resident_size: usage_planned_min_resident_size_respecting, - fallback_to_global_lru: usage_planned_global_lru, }; - debug!(?usage_planned, "usage planned"); // phase2: evict victims batched by timeline - let mut batch = Vec::new(); + + // achieved post-eviction usage according to internal accounting + let mut usage_assumed = usage_pre; + let mut evictions_failed = LayerCount::default(); - for (timeline, layers) in batched { + for (timeline, batch) in batched { let tenant_id = timeline.tenant_id; let timeline_id = timeline.timeline_id; - - batch.clear(); - batch.extend(layers.iter().map(|x| &x.layer).cloned()); let batch_size = batch.len(); debug!(%timeline_id, "evicting batch for timeline"); @@ -417,8 +389,8 @@ pub async fn disk_usage_eviction_task_iteration_impl( warn!("failed to evict batch: {:#}", e); } Ok(results) => { - assert_eq!(results.len(), layers.len()); - for (result, layer) in results.into_iter().zip(layers.iter()) { + assert_eq!(results.len(), batch.len()); + for (result, layer) in results.into_iter().zip(batch.iter()) { match result { Some(Ok(true)) => { usage_assumed.add_available_bytes(layer.file_size()); @@ -461,110 +433,128 @@ pub async fn disk_usage_eviction_task_iteration_impl( })) } -/// Different modes of gathering tenant's least recently used layers. -#[derive(Debug)] -enum Mode { - /// Add all but the most recently used `min_resident_size` worth of layers to the candidates - /// list. - /// - /// `min_resident_size` defaults to maximum layer file size of the tenant. This ensures that - /// the tenant will always have one layer resident. If we cannot compute `min_resident_size` - /// accurately because metadata is missing we use hardcoded constant. `min_resident_size` can - /// be overridden per tenant for important tenants. - RespectTenantMinResidentSize, - /// Consider all layer files from all tenants in LRU order. - /// - /// This is done if the `min_resident_size` respecting does not relieve pressure. - GlobalLru, +// Result type of `collect_eviction_candidates` +// +// `collect_eviction_candidates' returns a vector of these, in the preference order +// that they should be evicted. +struct EvictionCandidate { + timeline: Arc, + layer: Arc, + last_activity_ts: SystemTime, + tenant_resident_size_overage: i64, } -#[instrument(skip_all, fields(?mode, %tenant_id))] -async fn extend_lru_candidates( - mode: Mode, - tenant_id: TenantId, - lru_candidates: &mut Vec<(Arc, LocalLayerInfoForDiskUsageEviction)>, - scratch: &mut Vec<(Arc, LocalLayerInfoForDiskUsageEviction)>, - cancel: &CancellationToken, -) -> ControlFlow<()> { - debug!("begin"); - - let tenant = match tenant::mgr::get_tenant(tenant_id, true).await { - Ok(tenant) => tenant, - Err(e) => { - // this can happen if tenant has lifecycle transition after we fetched it - debug!("failed to get tenant: {e:#}"); - return ControlFlow::Continue(()); - } - }; - - if cancel.is_cancelled() { - return ControlFlow::Break(()); - } - - let mut max_layer_size = ApproxAccurate::default(); - for tl in tenant.list_timelines() { - if !tl.is_active() { - continue; - } - let info = tl.get_local_layers_for_disk_usage_eviction(); - debug!(timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); - scratch.extend( - info.resident_layers - .into_iter() - .map(|layer_infos| (tl.clone(), layer_infos)), - ); - max_layer_size = max_layer_size.max(info.max_layer_size.accurate()); +/// Collect a list of all non-remote layers in the system, from all timelines in all tenants. +/// +/// Returns all layers in the order that they should be evicted. The current policy is to +/// first evict layers in global LRU order, but retain at least min_resident_size bytes of +/// data for each tenant. After that, if necessary, we evict the remaining layers, also in +/// global LRU order. A different policy could be implemented by changing the returned order +/// here. +/// +/// For each layer, we return its last-activity timestamp, and its "overage" over the +/// tenant's min resident size limit. In other words, 'tenant_resident_size_overage' +/// means: If we evicted this layer, and all the layers of this tenant in the result list +/// before this one, how much would the total size of all the tenant's remaining layers +/// exceed the the tenant's min resident size? Layers that belong to the "reservation", +/// 'tenant_resident_size_overage' is negative. +/// +/// For example, imagine that there are two tenants, A and B, with five layers each, a-e. +/// Each layer has size 100, and both tenant's min_resident_size is 150. +/// `collect_eviction_candidates` would return them in this order: +/// +/// last_activity_ts tenant/layer overage +/// 18:30 A/c 250 +/// 19:00 A/b 150 +/// 18:29 B/c 250 +/// 19:05 B/b 150 +/// 20:00 B/a 50 +/// 20:03 A/a 50 +/// --- min resident size respecting cutoff point --- +/// 20:30 A/d -50 +/// 20:40 B/d -50 +/// 20:45 B/e -150 +/// 20:58 A/e -150 +/// +/// If the task is cancelled by the `cancel` token, returns an empty Vec. The caller +/// should check for `cancel.is_cancelled`. +/// +async fn collect_eviction_candidates( + cancel: &CancellationToken +) -> anyhow::Result> { + // get a snapshot of the list of tenants + let tenants = tenant::mgr::list_tenants() + .await + .context("get list of tenants")?; + let mut candidates: Vec = Vec::new(); + for (tenant_id, _state) in &tenants { if cancel.is_cancelled() { - return ControlFlow::Break(()); + return Ok(candidates); } - } - - let min_resident_size = match mode { - Mode::GlobalLru => { - lru_candidates.append(scratch); - return ControlFlow::Continue(()); - } - Mode::RespectTenantMinResidentSize => match tenant.get_min_resident_size_override() { - Some(size) => size, - None => { - match max_layer_size.accurate() { - Some(size) => size, - None => { - let prod_max_layer_file_size = 332_880_000; - // rate-limit warning in case above comment is wrong and we're missing `LayerMetadata` for many layers - static LAST_WARNED: Mutex> = Mutex::new(None); - let mut last_warned = LAST_WARNED.lock().unwrap(); - if last_warned - .map(|v| v.elapsed() > Duration::from_secs(60)) - .unwrap_or(true) - { - warn!(value=prod_max_layer_file_size, "some layers don't have LayerMetadata to calculate max_layer_file_size, using default value"); - *last_warned = Some(Instant::now()); - } - prod_max_layer_file_size - } - } + let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await { + Ok(tenant) => tenant, + Err(e) => { + // this can happen if tenant has lifecycle transition after we fetched it + debug!("failed to get tenant: {e:#}"); + continue; } - }, - }; + }; - scratch.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts); + // collect layers from all timelines in this tenant + let mut tenant_candidates = Vec::new(); + for tl in tenant.list_timelines() { + if !tl.is_active() { + continue; + } + let info = tl.get_local_layers_for_disk_usage_eviction(); + debug!(timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + tenant_candidates.extend( + info.resident_layers + .into_iter() + .map(|layer_infos| (tl.clone(), layer_infos)), + ); - let mut current: u64 = scratch.iter().map(|(_, layer)| layer.file_size()).sum(); - for (tl, layer) in scratch.drain(..) { - if cancel.is_cancelled() { - return ControlFlow::Break(()); + if cancel.is_cancelled() { + return Ok(candidates); + } } - if current <= min_resident_size { - break; + + // sort this tenant's layers by last_activity_ts, calculate the "overage" for each + // layer, and add them to the result. + tenant_candidates.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts); + + let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { + info!("using overridden min resident size {} for tenant {}", s, tenant.tenant_id()); + s + } else { + // By default, use the size of the largest resident layer + let s = tenant_candidates.iter().map(|(_, layer_info)| layer_info.file_size()).max().unwrap_or(0); + info!("using max layer size {} for tenant {}", s, tenant.tenant_id()); + s + }; + + let mut cumulative_resident_size_overage: i128 = -i128::from(min_resident_size); + for (timeline, layer_info) in tenant_candidates.into_iter() { + let file_size = layer_info.file_size(); + candidates.push(EvictionCandidate { + timeline, + last_activity_ts: layer_info.last_activity_ts, + layer: layer_info.layer, + tenant_resident_size_overage: cumulative_resident_size_overage + .clamp(i64::MIN as i128, i64::MAX as i128) as i64, + }); + cumulative_resident_size_overage += i128::from(file_size); } - current -= layer.file_size(); - debug!(?layer, "adding layer to lru_candidates"); - lru_candidates.push((tl, layer)); } - ControlFlow::Continue(()) + // Final sort. Layers above their tenant's min-resident size threshold first, in + // LRU order, and then all the rest also in LRU order + candidates.sort_unstable_by_key(|candidate| { + (candidate.tenant_resident_size_overage < 0, candidate.last_activity_ts) + }); + + Ok(candidates) } struct TimelineKey(Arc); diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 2433567316..535b51483a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -56,7 +56,6 @@ use pageserver_api::reltag::RelTag; use postgres_connection::PgConnectionConfig; use postgres_ffi::to_pg_timestamp; use utils::{ - approx_accurate::ApproxAccurate, id::{TenantId, TimelineId}, lsn::{AtomicLsn, Lsn, RecordLsn}, seqwait::SeqWait, @@ -4039,8 +4038,6 @@ impl Timeline { } pub struct DiskUsageEvictionInfo { - /// Timeline's largest layer (remote or resident) - pub max_layer_size: ApproxAccurate, /// Timeline's resident layers pub resident_layers: Vec, } @@ -4073,12 +4070,9 @@ impl Timeline { pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let layers = self.layers.read().unwrap(); - let mut max_layer_size = ApproxAccurate::default(); let mut resident_layers = Vec::new(); for l in layers.iter_historic_layers() { - max_layer_size = max_layer_size.max(Some(l.file_size())); - if l.is_remote_layer() { continue; } @@ -4092,7 +4086,6 @@ impl Timeline { } DiskUsageEvictionInfo { - max_layer_size, resident_layers, } } From 11b16614a37c1d05136a154cc99da3912d495161 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 28 Mar 2023 01:13:33 +0300 Subject: [PATCH 02/22] Fix test for change in behavior close to the min_resident_size boundary This PR changed the behavior to match my expectation per my comment: https://github.com/neondatabase/neon/pull/3809/files#r1149837135 --- test_runner/regress/test_disk_usage_eviction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index be6f6ff048..45de862388 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -204,9 +204,9 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) du_by_timeline[large_tenant] - du_by_timeline[small_tenant] > 5 * env.layer_size ), "ensure this test will do more than 1 eviction" - # give the larger tenant a haircut while prevening the smaller tenant from getting one + # give the larger tenant a haircut while preventing the smaller tenant from getting one min_resident_size = du_by_timeline[small_tenant] - target = du_by_timeline[large_tenant] - du_by_timeline[small_tenant] + target = int((du_by_timeline[large_tenant] - du_by_timeline[small_tenant]) * 0.75) assert any( [du > min_resident_size for du in du_by_timeline.values()] ), "ensure the larger tenant will get a haircut" From 041b708dc6f1b5b15a984c6102a54f1ae01b41a7 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Tue, 28 Mar 2023 11:08:35 +0300 Subject: [PATCH 03/22] rustfmt --- pageserver/src/disk_usage_eviction_task.rs | 50 +++++++++++++++------- pageserver/src/tenant/timeline.rs | 4 +- 2 files changed, 36 insertions(+), 18 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 1ceb897e7c..97f248845d 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -58,7 +58,7 @@ use utils::serde_percent::Percent; use crate::{ config::PageServerConf, task_mgr::{self, TaskKind, BACKGROUND_RUNTIME}, - tenant::{self, Timeline, storage_layer::PersistentLayer}, + tenant::{self, storage_layer::PersistentLayer, Timeline}, }; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] @@ -304,13 +304,16 @@ pub async fn disk_usage_eviction_task_iteration_impl( // XXX: Print the whole list, for debbugging let now: SystemTime = SystemTime::now(); for (i, candidate) in all_candidates.iter().enumerate() { - debug!("cand {}/{}: {}, size {}, at {}, overage {}", - i, - all_candidates.len(), - candidate.layer.local_path().unwrap().display(), - candidate.layer.file_size(), - now.duration_since(candidate.last_activity_ts).unwrap().as_micros(), - candidate.tenant_resident_size_overage + debug!( + "cand {}/{}: {}, size {}, at {}, overage {}", + i, + all_candidates.len(), + candidate.layer.local_path().unwrap().display(), + candidate.layer.file_size(), + now.duration_since(candidate.last_activity_ts) + .unwrap() + .as_micros(), + candidate.tenant_resident_size_overage ); } @@ -357,7 +360,8 @@ pub async fn disk_usage_eviction_task_iteration_impl( let usage_planned = if min_resident_size_violated { PlannedUsage { - respecting_tenant_min_resident_size: usage_planned_min_resident_size_respecting.unwrap(), + respecting_tenant_min_resident_size: usage_planned_min_resident_size_respecting + .unwrap(), fallback_to_global_lru: Some(usage_planned), } } else { @@ -480,7 +484,7 @@ struct EvictionCandidate { /// should check for `cancel.is_cancelled`. /// async fn collect_eviction_candidates( - cancel: &CancellationToken + cancel: &CancellationToken, ) -> anyhow::Result> { // get a snapshot of the list of tenants let tenants = tenant::mgr::list_tenants() @@ -525,12 +529,24 @@ async fn collect_eviction_candidates( tenant_candidates.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts); let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { - info!("using overridden min resident size {} for tenant {}", s, tenant.tenant_id()); + info!( + "using overridden min resident size {} for tenant {}", + s, + tenant.tenant_id() + ); s } else { // By default, use the size of the largest resident layer - let s = tenant_candidates.iter().map(|(_, layer_info)| layer_info.file_size()).max().unwrap_or(0); - info!("using max layer size {} for tenant {}", s, tenant.tenant_id()); + let s = tenant_candidates + .iter() + .map(|(_, layer_info)| layer_info.file_size()) + .max() + .unwrap_or(0); + info!( + "using max layer size {} for tenant {}", + s, + tenant.tenant_id() + ); s }; @@ -542,7 +558,8 @@ async fn collect_eviction_candidates( last_activity_ts: layer_info.last_activity_ts, layer: layer_info.layer, tenant_resident_size_overage: cumulative_resident_size_overage - .clamp(i64::MIN as i128, i64::MAX as i128) as i64, + .clamp(i64::MIN as i128, i64::MAX as i128) + as i64, }); cumulative_resident_size_overage += i128::from(file_size); } @@ -551,7 +568,10 @@ async fn collect_eviction_candidates( // Final sort. Layers above their tenant's min-resident size threshold first, in // LRU order, and then all the rest also in LRU order candidates.sort_unstable_by_key(|candidate| { - (candidate.tenant_resident_size_overage < 0, candidate.last_activity_ts) + ( + candidate.tenant_resident_size_overage < 0, + candidate.last_activity_ts, + ) }); Ok(candidates) diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index 535b51483a..b03d64712a 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4085,9 +4085,7 @@ impl Timeline { }); } - DiskUsageEvictionInfo { - resident_layers, - } + DiskUsageEvictionInfo { resident_layers } } } From ea3c76a9d60746da9c08bff101eacd6bf74e686d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 17:02:56 +0200 Subject: [PATCH 04/22] refactor: instead of 'overage', have two separate lists --- Cargo.lock | 7 + Cargo.toml | 1 + pageserver/Cargo.toml | 1 + pageserver/src/disk_usage_eviction_task.rs | 173 +++++++++++------- .../regress/test_disk_usage_eviction.py | 12 +- 5 files changed, 124 insertions(+), 70 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ccf5fcef00..e2eb361c9a 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1916,6 +1916,12 @@ dependencies = [ "windows-sys 0.42.0", ] +[[package]] +name = "is_sorted" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "357376465c37db3372ef6a00585d336ed3d0f11d4345eef77ebcb05865392b21" + [[package]] name = "itertools" version = "0.10.5" @@ -2446,6 +2452,7 @@ dependencies = [ "humantime", "humantime-serde", "hyper", + "is_sorted", "itertools", "metrics", "nix", diff --git a/Cargo.toml b/Cargo.toml index e27a50a1cb..8233cca7a5 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,6 +57,7 @@ humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" hyper-tungstenite = "0.9" +is_sorted = "0.1.1" itertools = "0.10" jsonwebtoken = "8" libc = "0.2" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index 0bc7eba95e..b46020a945 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -31,6 +31,7 @@ hex.workspace = true humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true +is_sorted.workspace = true itertools.workspace = true nix.workspace = true num-traits.workspace = true diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index fbad2e2426..6611338491 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -25,13 +25,10 @@ //! //! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant. //! The reservation is to keep the most recently accessed X bytes per tenant resident. -//! All layers that don't make the cut are put on a list and become eviction candidates. -//! We evict until we're below the two thresholds. +//! If we cannot relieve pressure by evicting layers outside of the reservation, we +//! start evicting layers that are part of the reservation, LRU first. //! -//! If the above strategy wouldn't free enough space, we fall back to global LRU right away, -//! not respecting any per-tenant reservations. -//! -//! This value for the per-tenant reservation is referred to as `tenant_min_resident_size` +//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size` //! throughout the code, but, no actual variable carries that name. //! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`. //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress @@ -308,26 +305,33 @@ pub async fn disk_usage_eviction_task_iteration_impl( "running disk usage based eviction due to pressure" ); - // Collect list of all layers in the system, sorted in the order that they should - // be evicted. - let all_candidates = collect_eviction_candidates(cancel).await?; - if cancel.is_cancelled() { - return Ok(IterationOutcome::Cancelled); - } + let candidates = match collect_eviction_candidates(cancel).await? { + EvictionCandidates::Cancelled => { + return Ok(IterationOutcome::Cancelled); + } + EvictionCandidates::Finished(partitioned) => partitioned, + }; - // XXX: Print the whole list, for debbugging - let now: SystemTime = SystemTime::now(); - for (i, candidate) in all_candidates.iter().enumerate() { + // Debug-log the list of candidates + let now = SystemTime::now(); + for (i, (partition, candidate)) in candidates + // XXX this clone is costly + .clone() + .into_iter_in_eviction_order() + .enumerate() + { debug!( - "cand {}/{}: {}, size {}, at {}, overage {}", - i, - all_candidates.len(), - candidate.layer.local_path().unwrap().display(), + "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}", + i + 1, + candidates.num_candidates(), candidate.layer.file_size(), now.duration_since(candidate.last_activity_ts) .unwrap() .as_micros(), - candidate.tenant_resident_size_overage + partition, + candidate.layer.get_tenant_id(), + candidate.layer.get_timeline_id(), + candidate.layer.filename().file_name(), ); } @@ -342,10 +346,9 @@ pub async fn disk_usage_eviction_task_iteration_impl( // the tenant's min-resident-size threshold, print a warning, and memorize the disk // usage at that point, in 'usage_planned_min_resident_size_respecting'. let mut batched: HashMap<_, Vec>> = HashMap::new(); - let mut min_resident_size_violated = false; + let mut warned = None; let mut usage_planned = usage_pre; - let mut usage_planned_min_resident_size_respecting = None; - for (i, candidate) in all_candidates.into_iter().enumerate() { + for (i, (partition, candidate)) in candidates.into_iter_in_eviction_order().enumerate() { if !usage_planned.has_pressure() { debug!( no_candidates_evicted = i, @@ -354,16 +357,11 @@ pub async fn disk_usage_eviction_task_iteration_impl( break; } - if !min_resident_size_violated { - if candidate.tenant_resident_size_overage < 0 { - warn!(?usage_pre, ?usage_planned, "tenant_min_resident_size-respecting LRU would not relieve pressure, falling back to global LRU at {}", i); - min_resident_size_violated = true; - usage_planned_min_resident_size_respecting = Some(usage_planned.clone()); - } - } else { - // all layers with overage >= 0 should come first - assert!(candidate.tenant_resident_size_overage < 0); + if partition == MinResidentSizePartition::Below && warned.is_none() { + warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"); + warned = Some(usage_planned); } + usage_planned.add_available_bytes(candidate.layer.file_size()); batched @@ -372,17 +370,15 @@ pub async fn disk_usage_eviction_task_iteration_impl( .push(candidate.layer); } - let usage_planned = if min_resident_size_violated { - PlannedUsage { - respecting_tenant_min_resident_size: usage_planned_min_resident_size_respecting - .unwrap(), + let usage_planned = match warned { + Some(respecting_tenant_min_resident_size) => PlannedUsage { + respecting_tenant_min_resident_size, fallback_to_global_lru: Some(usage_planned), - } - } else { - PlannedUsage { + }, + None => PlannedUsage { respecting_tenant_min_resident_size: usage_planned, fallback_to_global_lru: None, - } + }, }; debug!(?usage_planned, "usage planned"); @@ -455,11 +451,54 @@ pub async fn disk_usage_eviction_task_iteration_impl( // // `collect_eviction_candidates' returns a vector of these, in the preference order // that they should be evicted. +#[derive(Clone)] struct EvictionCandidate { timeline: Arc, layer: Arc, last_activity_ts: SystemTime, - tenant_resident_size_overage: i64, +} + +#[derive(Clone)] +struct MinResidentSizePartitionedCandidates { + above: Vec, + below: Vec, +} + +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +enum MinResidentSizePartition { + Above, + Below, +} + +impl MinResidentSizePartitionedCandidates { + pub fn num_candidates(&self) -> usize { + self.above.len() + self.below.len() + } + pub fn into_iter_in_eviction_order( + self, + ) -> impl Iterator { + debug_assert!(is_sorted::IsSorted::is_sorted_by_key( + &mut self.above.iter(), + |c| c.last_activity_ts + )); + debug_assert!(is_sorted::IsSorted::is_sorted_by_key( + &mut self.below.iter(), + |c| c.last_activity_ts + )); + self.above + .into_iter() + .map(|c| (MinResidentSizePartition::Above, c)) + .chain( + self.below + .into_iter() + .map(|c| (MinResidentSizePartition::Below, c)), + ) + } +} + +enum EvictionCandidates { + Cancelled, + Finished(MinResidentSizePartitionedCandidates), } /// Collect a list of all non-remote layers in the system, from all timelines in all tenants. @@ -499,16 +538,18 @@ struct EvictionCandidate { /// async fn collect_eviction_candidates( cancel: &CancellationToken, -) -> anyhow::Result> { +) -> anyhow::Result { // get a snapshot of the list of tenants let tenants = tenant::mgr::list_tenants() .await .context("get list of tenants")?; - let mut candidates: Vec = Vec::new(); + let mut above_min_resident_size: Vec = Vec::new(); + let mut below_min_resident_size: Vec = Vec::new(); + for (tenant_id, _state) in &tenants { if cancel.is_cancelled() { - return Ok(candidates); + return Ok(EvictionCandidates::Cancelled); } let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await { Ok(tenant) => tenant, @@ -534,14 +575,10 @@ async fn collect_eviction_candidates( ); if cancel.is_cancelled() { - return Ok(candidates); + return Ok(EvictionCandidates::Cancelled); } } - // sort this tenant's layers by last_activity_ts, calculate the "overage" for each - // layer, and add them to the result. - tenant_candidates.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts); - let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { info!( "using overridden min resident size {} for tenant {}", @@ -564,31 +601,37 @@ async fn collect_eviction_candidates( s }; - let mut cumulative_resident_size_overage: i128 = -i128::from(min_resident_size); + // Sort layers most-recently-used first + tenant_candidates + .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); + + let mut cumsum: i128 = 0; for (timeline, layer_info) in tenant_candidates.into_iter() { let file_size = layer_info.file_size(); - candidates.push(EvictionCandidate { + let candidate = EvictionCandidate { timeline, last_activity_ts: layer_info.last_activity_ts, layer: layer_info.layer, - tenant_resident_size_overage: cumulative_resident_size_overage - .clamp(i64::MIN as i128, i64::MAX as i128) - as i64, - }); - cumulative_resident_size_overage += i128::from(file_size); + }; + if cumsum > min_resident_size as i128 { + above_min_resident_size.push(candidate); + } else { + below_min_resident_size.push(candidate); + } + cumsum += i128::from(file_size); } } - // Final sort. Layers above their tenant's min-resident size threshold first, in - // LRU order, and then all the rest also in LRU order - candidates.sort_unstable_by_key(|candidate| { - ( - candidate.tenant_resident_size_overage < 0, - candidate.last_activity_ts, - ) - }); + // The MinResidentSizePartitionedCandidates struct expects these to be sorted this way + above_min_resident_size.sort_unstable_by_key(|c| c.last_activity_ts); + below_min_resident_size.sort_unstable_by_key(|c| c.last_activity_ts); - Ok(candidates) + Ok(EvictionCandidates::Finished( + MinResidentSizePartitionedCandidates { + above: above_min_resident_size, + below: below_min_resident_size, + }, + )) } struct TimelineKey(Arc); diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 45de862388..e39556d801 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -16,6 +16,8 @@ from fixtures.neon_fixtures import ( ) from fixtures.types import TenantId, TimelineId +GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" + @pytest.mark.parametrize("config_level_override", [None, 400]) def test_min_resident_size_override_handling( @@ -220,7 +222,7 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) time.sleep(1) # give log time to flush assert not env.neon_env.pageserver.log_contains( - "falling back to global LRU" + GLOBAL_LRU_LOG_LINE, ), "this test is pointless if it fell back to global LRU" (later_total_on_disk, _, _) = env.timelines_du() @@ -246,8 +248,8 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): """ - The pageserver should fall back to global LRU if the tenant_min_resident_size-respecting eviction - wouldn't evict enough. + If we can't relieve pressure using tenant_min_resident_size-respecting eviction, + we should continue to evict layers following global LRU. """ env = eviction_env ps_http = env.pageserver_http @@ -264,8 +266,8 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): assert actual_change >= target, "eviction must always evict more than target" time.sleep(1) # give log time to flush - assert env.neon_env.pageserver.log_contains("falling back to global LRU") - env.neon_env.pageserver.allowed_errors.append(".*falling back to global LRU") + assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE) + env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) def test_partial_evict_tenant(eviction_env: EvictionEnv): From 85becb148fc803a225fcb3e696b0f5a897bf9570 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 17:20:06 +0200 Subject: [PATCH 05/22] feat: bring back min_resident=max(all layers) behavior --- pageserver/src/disk_usage_eviction_task.rs | 29 ++++++++++++++++------ pageserver/src/tenant/timeline.rs | 11 +++++++- 2 files changed, 31 insertions(+), 9 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 6611338491..932413ed8e 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -289,6 +289,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( usage_pre: U, cancel: &CancellationToken, ) -> anyhow::Result> { + // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex) let _g = state .mutex .try_lock() @@ -561,7 +562,13 @@ async fn collect_eviction_candidates( }; // collect layers from all timelines in this tenant + // + // If one of the timelines becomes `!is_active()` during the iteration, + // for example because we're shutting down, then `max_layer_size` can be too small. + // That's OK. This code only runs under a disk pressure situation, and being + // a little unfair to tenants during shutdown in such a situation is tolerable. let mut tenant_candidates = Vec::new(); + let mut max_layer_size = 0; for tl in tenant.list_timelines() { if !tl.is_active() { continue; @@ -573,12 +580,24 @@ async fn collect_eviction_candidates( .into_iter() .map(|layer_infos| (tl.clone(), layer_infos)), ); + max_layer_size = max_layer_size.max(info.max_layer_size.unwrap_or(0)); if cancel.is_cancelled() { return Ok(EvictionCandidates::Cancelled); } } + // `min_resident_size` defaults to maximum layer file size of the tenant. + // This ensures that each tenant can have at least one layer resident at a given time, + // ensuring forward progress for a single Timeline::get in that tenant. + // It's a questionable heuristic since there are many Timeline::get + // requests going on and multiple layers are needed, and, at least in Neon prod, + // the median layer file size is much smaller than the compaction target size. + // We could be better here, e.g., sum of all L0 layers + most recent L1 layer. + // That's what's typically used by the various background loops. + // + // The default can be overriden with a fixed value in the tenant conf. + // A default override can be put in the default tenant conf in the pageserver.toml. let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { info!( "using overridden min resident size {} for tenant {}", @@ -587,18 +606,12 @@ async fn collect_eviction_candidates( ); s } else { - // By default, use the size of the largest resident layer - let s = tenant_candidates - .iter() - .map(|(_, layer_info)| layer_info.file_size()) - .max() - .unwrap_or(0); info!( "using max layer size {} for tenant {}", - s, + max_layer_size, tenant.tenant_id() ); - s + max_layer_size }; // Sort layers most-recently-used first diff --git a/pageserver/src/tenant/timeline.rs b/pageserver/src/tenant/timeline.rs index b03d64712a..91efde8f22 100644 --- a/pageserver/src/tenant/timeline.rs +++ b/pageserver/src/tenant/timeline.rs @@ -4038,6 +4038,8 @@ impl Timeline { } pub struct DiskUsageEvictionInfo { + /// Timeline's largest layer (remote or resident) + pub max_layer_size: Option, /// Timeline's resident layers pub resident_layers: Vec, } @@ -4070,9 +4072,13 @@ impl Timeline { pub(crate) fn get_local_layers_for_disk_usage_eviction(&self) -> DiskUsageEvictionInfo { let layers = self.layers.read().unwrap(); + let mut max_layer_size: Option = None; let mut resident_layers = Vec::new(); for l in layers.iter_historic_layers() { + let file_size = l.file_size(); + max_layer_size = max_layer_size.map_or(Some(file_size), |m| Some(m.max(file_size))); + if l.is_remote_layer() { continue; } @@ -4085,7 +4091,10 @@ impl Timeline { }); } - DiskUsageEvictionInfo { resident_layers } + DiskUsageEvictionInfo { + max_layer_size, + resident_layers, + } } } From 0c10e6d3e7a1232587979eeed11c79099f50cc82 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 17:20:44 +0200 Subject: [PATCH 06/22] feat: demote info logs to debug These would be per tenant, we don't want to emit thousands of log lines when this code runs. --- pageserver/src/disk_usage_eviction_task.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 932413ed8e..b994a2fe97 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -599,14 +599,14 @@ async fn collect_eviction_candidates( // The default can be overriden with a fixed value in the tenant conf. // A default override can be put in the default tenant conf in the pageserver.toml. let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { - info!( + debug!( "using overridden min resident size {} for tenant {}", s, tenant.tenant_id() ); s } else { - info!( + debug!( "using max layer size {} for tenant {}", max_layer_size, tenant.tenant_id() From 07c44f915104871fe4f348cd4a1717a2d5901f14 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 17:47:11 +0200 Subject: [PATCH 07/22] doc: hint that usage_assumed is modified in the loop --- pageserver/src/disk_usage_eviction_task.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index b994a2fe97..b9dd472529 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -385,9 +385,9 @@ pub async fn disk_usage_eviction_task_iteration_impl( // phase2: evict victims batched by timeline - // achieved post-eviction usage according to internal accounting + // After the loop, `usage_assumed` is the post-eviction usage, + // according to internal accounting. let mut usage_assumed = usage_pre; - let mut evictions_failed = LayerCount::default(); for (timeline, batch) in batched { let tenant_id = timeline.tenant_id; From dc72a9534e5b5ee90ce98ba4ff566bcb7ddce6e1 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 18:03:56 +0200 Subject: [PATCH 08/22] doc: update doc comment for `collect_eviction_candidates` And move the impl of MinResidentSizePartitionedCandidates below it because it makes sense when reading the code top-down. --- pageserver/src/disk_usage_eviction_task.rs | 113 ++++++++++----------- 1 file changed, 55 insertions(+), 58 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index b9dd472529..5079e11f77 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -448,10 +448,6 @@ pub async fn disk_usage_eviction_task_iteration_impl( })) } -// Result type of `collect_eviction_candidates` -// -// `collect_eviction_candidates' returns a vector of these, in the preference order -// that they should be evicted. #[derive(Clone)] struct EvictionCandidate { timeline: Arc, @@ -471,72 +467,45 @@ enum MinResidentSizePartition { Below, } -impl MinResidentSizePartitionedCandidates { - pub fn num_candidates(&self) -> usize { - self.above.len() + self.below.len() - } - pub fn into_iter_in_eviction_order( - self, - ) -> impl Iterator { - debug_assert!(is_sorted::IsSorted::is_sorted_by_key( - &mut self.above.iter(), - |c| c.last_activity_ts - )); - debug_assert!(is_sorted::IsSorted::is_sorted_by_key( - &mut self.below.iter(), - |c| c.last_activity_ts - )); - self.above - .into_iter() - .map(|c| (MinResidentSizePartition::Above, c)) - .chain( - self.below - .into_iter() - .map(|c| (MinResidentSizePartition::Below, c)), - ) - } -} - enum EvictionCandidates { Cancelled, Finished(MinResidentSizePartitionedCandidates), } -/// Collect a list of all non-remote layers in the system, from all timelines in all tenants. +/// Gather the eviction candidates. /// -/// Returns all layers in the order that they should be evicted. The current policy is to -/// first evict layers in global LRU order, but retain at least min_resident_size bytes of -/// data for each tenant. After that, if necessary, we evict the remaining layers, also in -/// global LRU order. A different policy could be implemented by changing the returned order -/// here. +/// The returned `Ok(EvictionCandidates::Finished(candidates))` allows iteration over +/// the candidates in eviction order, using `candidates.into_iter_in_eviction_order()`. +/// A caller that evicts in that order implements the eviction policy outlined in the +/// module comment. /// -/// For each layer, we return its last-activity timestamp, and its "overage" over the -/// tenant's min resident size limit. In other words, 'tenant_resident_size_overage' -/// means: If we evicted this layer, and all the layers of this tenant in the result list -/// before this one, how much would the total size of all the tenant's remaining layers -/// exceed the the tenant's min resident size? Layers that belong to the "reservation", -/// 'tenant_resident_size_overage' is negative. +/// # Example /// -/// For example, imagine that there are two tenants, A and B, with five layers each, a-e. +/// Imagine that there are two tenants, A and B, with five layers each, a-e. /// Each layer has size 100, and both tenant's min_resident_size is 150. -/// `collect_eviction_candidates` would return them in this order: +/// The eviction order would be /// -/// last_activity_ts tenant/layer overage -/// 18:30 A/c 250 -/// 19:00 A/b 150 -/// 18:29 B/c 250 -/// 19:05 B/b 150 -/// 20:00 B/a 50 -/// 20:03 A/a 50 -/// --- min resident size respecting cutoff point --- -/// 20:30 A/d -50 -/// 20:40 B/d -50 -/// 20:45 B/e -150 -/// 20:58 A/e -150 +/// ```text +/// partition last_activity_ts tenant/layer +/// Above 18:30 A/c +/// Above 19:00 A/b +/// Above 18:29 B/c +/// Above 19:05 B/b +/// Above 20:00 B/a +/// Above 20:03 A/a +/// Below 20:30 A/d +/// Below 20:40 B/d +/// Below 20:45 B/e +/// Below 20:58 A/e +/// ``` /// -/// If the task is cancelled by the `cancel` token, returns an empty Vec. The caller -/// should check for `cancel.is_cancelled`. +/// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`. +/// They are all in the `Above` partition, so, we respected each tenant's min_resident_size. /// +/// But, if we need to evict 900 bytes to relieve pressure, we'd evict +/// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition +/// after exhauting the `Above` partition. +/// So, we did not respect each tenant's min_resident_size. async fn collect_eviction_candidates( cancel: &CancellationToken, ) -> anyhow::Result { @@ -647,6 +616,34 @@ async fn collect_eviction_candidates( )) } +impl MinResidentSizePartitionedCandidates { + pub fn num_candidates(&self) -> usize { + self.above.len() + self.below.len() + } + + /// See comment on [`collect_eviction_candidates`]. + pub fn into_iter_in_eviction_order( + self, + ) -> impl Iterator { + debug_assert!(is_sorted::IsSorted::is_sorted_by_key( + &mut self.above.iter(), + |c| c.last_activity_ts + )); + debug_assert!(is_sorted::IsSorted::is_sorted_by_key( + &mut self.below.iter(), + |c| c.last_activity_ts + )); + self.above + .into_iter() + .map(|c| (MinResidentSizePartition::Above, c)) + .chain( + self.below + .into_iter() + .map(|c| (MinResidentSizePartition::Below, c)), + ) + } +} + struct TimelineKey(Arc); impl PartialEq for TimelineKey { From 704d4f4640b77c29e337a65295ca50f7871b7444 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 18:06:44 +0200 Subject: [PATCH 09/22] doc: improve comment on min_resident_size --- pageserver/src/disk_usage_eviction_task.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 5079e11f77..145dff1880 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -559,9 +559,9 @@ async fn collect_eviction_candidates( // `min_resident_size` defaults to maximum layer file size of the tenant. // This ensures that each tenant can have at least one layer resident at a given time, // ensuring forward progress for a single Timeline::get in that tenant. - // It's a questionable heuristic since there are many Timeline::get - // requests going on and multiple layers are needed, and, at least in Neon prod, - // the median layer file size is much smaller than the compaction target size. + // It's a questionable heuristic since, usually, there are many Timeline::get + // requests going on for a tenant, and, at least in Neon prod, the median + // layer file size is much smaller than the compaction target size. // We could be better here, e.g., sum of all L0 layers + most recent L1 layer. // That's what's typically used by the various background loops. // From 386c2d0112ddecff45f4c49920e4a4a51e9a60c7 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 18:25:44 +0200 Subject: [PATCH 10/22] refactor: go back to a single list The MinResidentSizePartition is effectively what `overage` was earlier, but more expressive and outside of EvictionCandidates. So switch the code back to a single list, but use (MinResidentSizePartition, EvictionCandidates) tuples. That eliminates the need for iter_in_eviction_order() alltogether. It consumes 8 bytes more memory per candidate, but, that doesn't matter for now. --- pageserver/src/disk_usage_eviction_task.rs | 84 +++++----------------- 1 file changed, 19 insertions(+), 65 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 145dff1880..df563b5082 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -315,16 +315,11 @@ pub async fn disk_usage_eviction_task_iteration_impl( // Debug-log the list of candidates let now = SystemTime::now(); - for (i, (partition, candidate)) in candidates - // XXX this clone is costly - .clone() - .into_iter_in_eviction_order() - .enumerate() - { + for (i, (partition, candidate)) in candidates.iter().enumerate() { debug!( "cand {}/{}: size={}, no_access_for={}us, parition={:?}, tenant={} timeline={} layer={}", i + 1, - candidates.num_candidates(), + candidates.len(), candidate.layer.file_size(), now.duration_since(candidate.last_activity_ts) .unwrap() @@ -349,7 +344,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( let mut batched: HashMap<_, Vec>> = HashMap::new(); let mut warned = None; let mut usage_planned = usage_pre; - for (i, (partition, candidate)) in candidates.into_iter_in_eviction_order().enumerate() { + for (i, (partition, candidate)) in candidates.into_iter().enumerate() { if !usage_planned.has_pressure() { debug!( no_candidates_evicted = i, @@ -455,13 +450,7 @@ struct EvictionCandidate { last_activity_ts: SystemTime, } -#[derive(Clone)] -struct MinResidentSizePartitionedCandidates { - above: Vec, - below: Vec, -} - -#[derive(Debug, Clone, Copy, PartialEq, Eq)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)] enum MinResidentSizePartition { Above, Below, @@ -469,15 +458,14 @@ enum MinResidentSizePartition { enum EvictionCandidates { Cancelled, - Finished(MinResidentSizePartitionedCandidates), + Finished(Vec<(MinResidentSizePartition, EvictionCandidate)>), } /// Gather the eviction candidates. /// -/// The returned `Ok(EvictionCandidates::Finished(candidates))` allows iteration over -/// the candidates in eviction order, using `candidates.into_iter_in_eviction_order()`. -/// A caller that evicts in that order implements the eviction policy outlined in the -/// module comment. +/// The returned `Ok(EvictionCandidates::Finished(candidates))` is sorted in eviction +/// order. A caller that evicts in that order, until pressure is relieved, implements +/// the eviction policy outlined in the module comment. /// /// # Example /// @@ -514,8 +502,7 @@ async fn collect_eviction_candidates( .await .context("get list of tenants")?; - let mut above_min_resident_size: Vec = Vec::new(); - let mut below_min_resident_size: Vec = Vec::new(); + let mut candidates = Vec::new(); for (tenant_id, _state) in &tenants { if cancel.is_cancelled() { @@ -583,10 +570,10 @@ async fn collect_eviction_candidates( max_layer_size }; - // Sort layers most-recently-used first + // Sort layers most-recently-used first, then partition by + // cumsum above/below min_resident_size. tenant_candidates .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts)); - let mut cumsum: i128 = 0; for (timeline, layer_info) in tenant_candidates.into_iter() { let file_size = layer_info.file_size(); @@ -595,53 +582,20 @@ async fn collect_eviction_candidates( last_activity_ts: layer_info.last_activity_ts, layer: layer_info.layer, }; - if cumsum > min_resident_size as i128 { - above_min_resident_size.push(candidate); + let partition = if cumsum > min_resident_size as i128 { + MinResidentSizePartition::Above } else { - below_min_resident_size.push(candidate); - } + MinResidentSizePartition::Below + }; + candidates.push((partition, candidate)); cumsum += i128::from(file_size); } } - // The MinResidentSizePartitionedCandidates struct expects these to be sorted this way - above_min_resident_size.sort_unstable_by_key(|c| c.last_activity_ts); - below_min_resident_size.sort_unstable_by_key(|c| c.last_activity_ts); + candidates + .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); - Ok(EvictionCandidates::Finished( - MinResidentSizePartitionedCandidates { - above: above_min_resident_size, - below: below_min_resident_size, - }, - )) -} - -impl MinResidentSizePartitionedCandidates { - pub fn num_candidates(&self) -> usize { - self.above.len() + self.below.len() - } - - /// See comment on [`collect_eviction_candidates`]. - pub fn into_iter_in_eviction_order( - self, - ) -> impl Iterator { - debug_assert!(is_sorted::IsSorted::is_sorted_by_key( - &mut self.above.iter(), - |c| c.last_activity_ts - )); - debug_assert!(is_sorted::IsSorted::is_sorted_by_key( - &mut self.below.iter(), - |c| c.last_activity_ts - )); - self.above - .into_iter() - .map(|c| (MinResidentSizePartition::Above, c)) - .chain( - self.below - .into_iter() - .map(|c| (MinResidentSizePartition::Below, c)), - ) - } + Ok(EvictionCandidates::Finished(candidates)) } struct TimelineKey(Arc); From d6c2867b46b0e289d163d49b293c21f5bcf962d8 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 19:16:26 +0200 Subject: [PATCH 11/22] doc: add debug_assert for self-documenting candidates.sort_unstable_by_kye() --- pageserver/src/disk_usage_eviction_task.rs | 1 + 1 file changed, 1 insertion(+) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index df563b5082..8a0e03094c 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -592,6 +592,7 @@ async fn collect_eviction_candidates( } } + debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below); candidates .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); From bb5947afde4c2d48519e92649b4918c7507e6570 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Tue, 28 Mar 2023 19:16:59 +0200 Subject: [PATCH 12/22] test: test_pageserver_respects_overridden_resident_size: use absolute wiggle room instead of percentage Heikki added the `*0.75` in commit 11b16614a37c1d05136a154cc99da3912d495161 Author: Heikki Linnakangas Date: Tue Mar 28 01:13:33 2023 +0300 Fix test for change in behavior close to the min_resident_size boundary This PR changed the behavior to match my expectation per my comment: https://github.com/neondatabase/neon/pull/3809/files#r1149837135 Without it, the test fails because we fall back to global LRU, and we have an assert on that. The reason why it falls back to global LRU is that `target = delta_between_small_and_big_tenant` doesn't leave any wiggle-room to go over min_resident_size boundary. But, we redefined min_resident_size to include up to 1 layer above it in this branch. Multiply that by two because we're dealing with 2 tenants here. --- test_runner/regress/test_disk_usage_eviction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index e39556d801..3897a80780 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -208,7 +208,7 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) # give the larger tenant a haircut while preventing the smaller tenant from getting one min_resident_size = du_by_timeline[small_tenant] - target = int((du_by_timeline[large_tenant] - du_by_timeline[small_tenant]) * 0.75) + target = (du_by_timeline[large_tenant] - du_by_timeline[small_tenant]) - 2*env.layer_size assert any( [du > min_resident_size for du in du_by_timeline.values()] ), "ensure the larger tenant will get a haircut" From 370b3637dbaee7b7cbcca7917736f54bc4365fb9 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 10:47:15 +0200 Subject: [PATCH 13/22] doc: add explainer to debug_assert --- pageserver/src/disk_usage_eviction_task.rs | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 8a0e03094c..d14cc051d1 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -592,7 +592,8 @@ async fn collect_eviction_candidates( } } - debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below); + debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below, + "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first"); candidates .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts)); From a6f9ebf1789d853f540c9644f2d6854a34f77dc3 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 10:49:06 +0200 Subject: [PATCH 14/22] fix: repeat tenant_id in debug message --- pageserver/src/disk_usage_eviction_task.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index d14cc051d1..14a30a9631 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -530,7 +530,7 @@ async fn collect_eviction_candidates( continue; } let info = tl.get_local_layers_for_disk_usage_eviction(); - debug!(timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); + debug!(tenant_id=%tl.tenant_id, timeline_id=%tl.timeline_id, "timeline resident layers count: {}", info.resident_layers.len()); tenant_candidates.extend( info.resident_layers .into_iter() From 0b9a44a8794ad8d7a4dc94ef56c75749dabb353d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 10:50:36 +0200 Subject: [PATCH 15/22] fix: structured logging of tenant_id Co-authored-by: Joonas Koivunen --- pageserver/src/disk_usage_eviction_task.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 14a30a9631..f890111047 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -563,9 +563,9 @@ async fn collect_eviction_candidates( s } else { debug!( - "using max layer size {} for tenant {}", - max_layer_size, - tenant.tenant_id() + tenant_id=%tenant.tenant_id(), + max_layer_size, + "using max layer size as min_resident_size for tenant", ); max_layer_size }; From 9a55e4f90941bd057ff659304339a76b78d29045 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 10:51:14 +0200 Subject: [PATCH 16/22] fix: structured logging of tenant_id Co-authored-by: Joonas Koivunen --- pageserver/src/disk_usage_eviction_task.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index f890111047..39f3faa05e 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -556,9 +556,9 @@ async fn collect_eviction_candidates( // A default override can be put in the default tenant conf in the pageserver.toml. let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { debug!( - "using overridden min resident size {} for tenant {}", - s, - tenant.tenant_id() + tenant_id=%tenant.tenant_id(), + override=s, + "using overridden min resident size for tenant" ); s } else { From bdc7f8d1929295c255f718c4504396d8507e429b Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 10:52:39 +0200 Subject: [PATCH 17/22] fix: remove now-unused is_sorted --- Cargo.lock | 7 ------- Cargo.toml | 1 - pageserver/Cargo.toml | 1 - 3 files changed, 9 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e2eb361c9a..ccf5fcef00 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1916,12 +1916,6 @@ dependencies = [ "windows-sys 0.42.0", ] -[[package]] -name = "is_sorted" -version = "0.1.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "357376465c37db3372ef6a00585d336ed3d0f11d4345eef77ebcb05865392b21" - [[package]] name = "itertools" version = "0.10.5" @@ -2452,7 +2446,6 @@ dependencies = [ "humantime", "humantime-serde", "hyper", - "is_sorted", "itertools", "metrics", "nix", diff --git a/Cargo.toml b/Cargo.toml index 8233cca7a5..e27a50a1cb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -57,7 +57,6 @@ humantime = "2.1" humantime-serde = "1.1.1" hyper = "0.14" hyper-tungstenite = "0.9" -is_sorted = "0.1.1" itertools = "0.10" jsonwebtoken = "8" libc = "0.2" diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml index b46020a945..0bc7eba95e 100644 --- a/pageserver/Cargo.toml +++ b/pageserver/Cargo.toml @@ -31,7 +31,6 @@ hex.workspace = true humantime.workspace = true humantime-serde.workspace = true hyper.workspace = true -is_sorted.workspace = true itertools.workspace = true nix.workspace = true num-traits.workspace = true From 83813f2cb1a8888ce0125560ecb4516f803899b2 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 10:55:45 +0200 Subject: [PATCH 18/22] fix: remove unneeded clippy allow --- pageserver/src/disk_usage_eviction_task.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 39f3faa05e..11bdfb305f 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -282,7 +282,6 @@ struct LayerCount { count: usize, } -#[allow(clippy::needless_late_init)] pub async fn disk_usage_eviction_task_iteration_impl( state: &State, storage: &GenericRemoteStorage, From 57d215e6bb46bd93f912f03eadb0784e3d3ce187 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 10:55:57 +0200 Subject: [PATCH 19/22] fix: suggestions commited from GitHub web didn't compile --- pageserver/src/disk_usage_eviction_task.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 11bdfb305f..c61e27374a 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -556,14 +556,14 @@ async fn collect_eviction_candidates( let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() { debug!( tenant_id=%tenant.tenant_id(), - override=s, + overriden_size=s, "using overridden min resident size for tenant" ); s } else { debug!( tenant_id=%tenant.tenant_id(), - max_layer_size, + max_layer_size, "using max layer size as min_resident_size for tenant", ); max_layer_size From a698ddb8a411ab2ddae5d0fce32e47209910fd6d Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 10:58:59 +0200 Subject: [PATCH 20/22] fix: avoid needless timeline.clone() --- pageserver/src/disk_usage_eviction_task.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index c61e27374a..b903a22463 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -360,7 +360,7 @@ pub async fn disk_usage_eviction_task_iteration_impl( usage_planned.add_available_bytes(candidate.layer.file_size()); batched - .entry(TimelineKey(candidate.timeline.clone())) + .entry(TimelineKey(candidate.timeline)) .or_default() .push(candidate.layer); } From 699ca672a48c307355960924f8b8e767e68ea508 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 11:22:53 +0200 Subject: [PATCH 21/22] test: refine test_pageserver_respects_overridden_resident_size --- test_runner/regress/test_disk_usage_eviction.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 3897a80780..fd3b85f345 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -206,9 +206,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) du_by_timeline[large_tenant] - du_by_timeline[small_tenant] > 5 * env.layer_size ), "ensure this test will do more than 1 eviction" - # give the larger tenant a haircut while preventing the smaller tenant from getting one + # Give the larger tenant a haircut while preventing the smaller tenant from getting one. + # To prevent the smaller from getting a haircut, we set min_resident_size to its current size. + # To ensure the larger tenant is getting a haircut, any non-zero `target` will do. min_resident_size = du_by_timeline[small_tenant] - target = (du_by_timeline[large_tenant] - du_by_timeline[small_tenant]) - 2*env.layer_size + target = 1 assert any( [du > min_resident_size for du in du_by_timeline.values()] ), "ensure the larger tenant will get a haircut" @@ -216,6 +218,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) ps_http.set_tenant_config(small_tenant[0], {"min_resident_size_override": min_resident_size}) ps_http.set_tenant_config(large_tenant[0], {"min_resident_size_override": min_resident_size}) + # Make the large tenant more-recently used. An incorrect implemention would try to evict + # from the smaller tenant first, since its layers would be the least-recently-used + with env.neon_env.postgres.create_start("main", tenant_id=large_tenant[0]) as pg: + env.pg_bin.run(["pgbench", "-S", pg.connstr()]) + # do one run response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) log.info(f"{response}") From b47a02569f04c36c938d17cb4fd9de4267c10254 Mon Sep 17 00:00:00 2001 From: Christian Schwarz Date: Wed, 29 Mar 2023 11:55:02 +0200 Subject: [PATCH 22/22] tests: fully read-only warmup + wait for remote storage upload --- .../regress/test_disk_usage_eviction.py | 30 +++++++++++++++---- 1 file changed, 24 insertions(+), 6 deletions(-) diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index fd3b85f345..2c836f04c7 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -13,8 +13,9 @@ from fixtures.neon_fixtures import ( PgBin, RemoteStorageKind, wait_for_last_flush_lsn, + wait_for_upload_queue_empty, ) -from fixtures.types import TenantId, TimelineId +from fixtures.types import Lsn, TenantId, TimelineId GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy" @@ -70,6 +71,7 @@ class EvictionEnv: pg_bin: PgBin pageserver_http: PageserverHttpClient layer_size: int + pgbench_init_lsns: Dict[TenantId, Lsn] def timelines_du(self) -> Tuple[int, int, int]: return poor_mans_du(self.neon_env, [(tid, tlid) for tid, tlid, _ in self.timelines]) @@ -80,6 +82,15 @@ class EvictionEnv: for tid, tlid, _ in self.timelines } + def warm_up_tenant(self, tenant_id: TenantId): + """ + Start a read-only compute at the LSN after pgbench -i, and run pgbench -S against it. + This assumes that the tenant is still at the state after pbench -i. + """ + lsn = self.pgbench_init_lsns[tenant_id] + with self.neon_env.postgres.create_start("main", tenant_id=tenant_id, lsn=lsn) as pg: + self.pg_bin.run(["pgbench", "-S", pg.connstr()]) + @pytest.fixture def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Iterator[EvictionEnv]: @@ -120,6 +131,8 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> It pgbench_scales = [4, 6] layer_size = 5 * 1024**2 + pgbench_init_lsns = {} + for scale in pgbench_scales: tenant_id, timeline_id = env.neon_cli.create_tenant( conf={ @@ -136,6 +149,12 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> It wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id) pageserver_http.timeline_checkpoint(tenant_id, timeline_id) + wait_for_upload_queue_empty(env.pageserver, tenant_id, timeline_id) + tl_info = pageserver_http.timeline_detail(tenant_id, timeline_id) + assert tl_info["last_record_lsn"] == tl_info["disk_consistent_lsn"] + assert tl_info["disk_consistent_lsn"] == tl_info["remote_consistent_lsn"] + pgbench_init_lsns[tenant_id] = Lsn(tl_info["last_record_lsn"]) + layers = pageserver_http.layer_map_info(tenant_id, timeline_id) log.info(f"{layers}") assert len(layers.historic_layers) >= 4 @@ -148,6 +167,7 @@ def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> It pageserver_http=pageserver_http, layer_size=layer_size, pg_bin=pg_bin, + pgbench_init_lsns=pgbench_init_lsns, ) yield eviction_env @@ -219,9 +239,8 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) ps_http.set_tenant_config(large_tenant[0], {"min_resident_size_override": min_resident_size}) # Make the large tenant more-recently used. An incorrect implemention would try to evict - # from the smaller tenant first, since its layers would be the least-recently-used - with env.neon_env.postgres.create_start("main", tenant_id=large_tenant[0]) as pg: - env.pg_bin.run(["pgbench", "-S", pg.connstr()]) + # from the smaller tenant first, since its layers would be the least-recently-used. + env.warm_up_tenant(large_tenant[0]) # do one run response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) @@ -290,8 +309,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): tenant_usage = du_by_timeline[our_tenant] # make our tenant more recently used than the other one - with env.neon_env.postgres.create_start("main", tenant_id=tenant_id) as pg: - env.pg_bin.run(["pgbench", "-S", pg.connstr()]) + env.warm_up_tenant(tenant_id) target = total_on_disk - (tenant_usage // 2) response = ps_http.disk_usage_eviction_run({"evict_bytes": target})