diff --git a/Cargo.lock b/Cargo.lock
index ccf5fcef00..e2eb361c9a 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -1916,6 +1916,12 @@ dependencies = [
  "windows-sys 0.42.0",
 ]

+[[package]]
+name = "is_sorted"
+version = "0.1.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "357376465c37db3372ef6a00585d336ed3d0f11d4345eef77ebcb05865392b21"
+
 [[package]]
 name = "itertools"
 version = "0.10.5"
@@ -2446,6 +2452,7 @@ dependencies = [
  "humantime",
  "humantime-serde",
  "hyper",
+ "is_sorted",
  "itertools",
  "metrics",
  "nix",
diff --git a/Cargo.toml b/Cargo.toml
index e27a50a1cb..8233cca7a5 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -57,6 +57,7 @@ humantime = "2.1"
 humantime-serde = "1.1.1"
 hyper = "0.14"
 hyper-tungstenite = "0.9"
+is_sorted = "0.1.1"
 itertools = "0.10"
 jsonwebtoken = "8"
 libc = "0.2"
diff --git a/pageserver/Cargo.toml b/pageserver/Cargo.toml
index 0bc7eba95e..b46020a945 100644
--- a/pageserver/Cargo.toml
+++ b/pageserver/Cargo.toml
@@ -31,6 +31,7 @@ hex.workspace = true
 humantime.workspace = true
 humantime-serde.workspace = true
 hyper.workspace = true
+is_sorted.workspace = true
 itertools.workspace = true
 nix.workspace = true
 num-traits.workspace = true
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index fbad2e2426..6611338491 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -25,13 +25,10 @@
 //!
 //! The iteration evicts layers in LRU fashion, but, with a weak reservation per tenant.
 //! The reservation is to keep the most recently accessed X bytes per tenant resident.
-//! All layers that don't make the cut are put on a list and become eviction candidates.
-//! We evict until we're below the two thresholds.
+//! If we cannot relieve pressure by evicting layers outside of the reservation, we
+//! start evicting layers that are part of the reservation, LRU first.
 //!
-//! If the above strategy wouldn't free enough space, we fall back to global LRU right away,
-//! not respecting any per-tenant reservations.
-//!
-//! This value for the per-tenant reservation is referred to as `tenant_min_resident_size`
+//! The value for the per-tenant reservation is referred to as `tenant_min_resident_size`
 //! throughout the code, but, no actual variable carries that name.
 //! The per-tenant default value is the `max(tenant's layer file sizes, regardless of local or remote)`.
 //! The idea is to allow at least one layer to be resident per tenant, to ensure it can make forward progress
@@ -308,26 +305,33 @@ pub async fn disk_usage_eviction_task_iteration_impl(
         "running disk usage based eviction due to pressure"
     );

-    // Collect list of all layers in the system, sorted in the order that they should
-    // be evicted.
-    let all_candidates = collect_eviction_candidates(cancel).await?;
-    if cancel.is_cancelled() {
-        return Ok(IterationOutcome::Cancelled);
-    }
+    let candidates = match collect_eviction_candidates(cancel).await? {
+        EvictionCandidates::Cancelled => {
+            return Ok(IterationOutcome::Cancelled);
+        }
+        EvictionCandidates::Finished(partitioned) => partitioned,
+    };

-    // XXX: Print the whole list, for debbugging
-    let now: SystemTime = SystemTime::now();
-    for (i, candidate) in all_candidates.iter().enumerate() {
+    // Debug-log the list of candidates
+    let now = SystemTime::now();
+    for (i, (partition, candidate)) in candidates
+        // XXX this clone is costly
+        .clone()
+        .into_iter_in_eviction_order()
+        .enumerate()
+    {
         debug!(
-            "cand {}/{}: {}, size {}, at {}, overage {}",
-            i,
-            all_candidates.len(),
-            candidate.layer.local_path().unwrap().display(),
+            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, tenant={} timeline={} layer={}",
+            i + 1,
+            candidates.num_candidates(),
             candidate.layer.file_size(),
             now.duration_since(candidate.last_activity_ts)
                 .unwrap()
                 .as_micros(),
-            candidate.tenant_resident_size_overage
+            partition,
+            candidate.layer.get_tenant_id(),
+            candidate.layer.get_timeline_id(),
+            candidate.layer.filename().file_name(),
         );
     }

@@ -342,10 +346,9 @@ pub async fn disk_usage_eviction_task_iteration_impl(
     // the tenant's min-resident-size threshold, print a warning, and memorize the disk
     // usage at that point, in 'usage_planned_min_resident_size_respecting'.
     let mut batched: HashMap<_, Vec<Arc<dyn PersistentLayer>>> = HashMap::new();
-    let mut min_resident_size_violated = false;
+    let mut warned = None;
     let mut usage_planned = usage_pre;
-    let mut usage_planned_min_resident_size_respecting = None;
-    for (i, candidate) in all_candidates.into_iter().enumerate() {
+    for (i, (partition, candidate)) in candidates.into_iter_in_eviction_order().enumerate() {
         if !usage_planned.has_pressure() {
             debug!(
                 no_candidates_evicted = i,
@@ -354,16 +357,11 @@ pub async fn disk_usage_eviction_task_iteration_impl(
             break;
         }

-        if !min_resident_size_violated {
-            if candidate.tenant_resident_size_overage < 0 {
-                warn!(?usage_pre, ?usage_planned, "tenant_min_resident_size-respecting LRU would not relieve pressure, falling back to global LRU at {}", i);
-                min_resident_size_violated = true;
-                usage_planned_min_resident_size_respecting = Some(usage_planned.clone());
-            }
-        } else {
-            // all layers with overage >= 0 should come first
-            assert!(candidate.tenant_resident_size_overage < 0);
+        if partition == MinResidentSizePartition::Below && warned.is_none() {
+            warn!(?usage_pre, ?usage_planned, candidate_no=i, "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy");
+            warned = Some(usage_planned);
         }
+

         usage_planned.add_available_bytes(candidate.layer.file_size());
         batched
@@ -372,17 +370,15 @@ pub async fn disk_usage_eviction_task_iteration_impl(
             .push(candidate.layer);
     }

-    let usage_planned = if min_resident_size_violated {
-        PlannedUsage {
-            respecting_tenant_min_resident_size: usage_planned_min_resident_size_respecting
-                .unwrap(),
+    let usage_planned = match warned {
+        Some(respecting_tenant_min_resident_size) => PlannedUsage {
+            respecting_tenant_min_resident_size,
             fallback_to_global_lru: Some(usage_planned),
-        }
-    } else {
-        PlannedUsage {
+        },
+        None => PlannedUsage {
             respecting_tenant_min_resident_size: usage_planned,
             fallback_to_global_lru: None,
-        }
+        },
     };
     debug!(?usage_planned, "usage planned");

@@ -455,11 +451,54 @@ pub async fn disk_usage_eviction_task_iteration_impl(
 //
 // `collect_eviction_candidates' returns a vector of these, in the preference order
 // that they should be evicted.
+#[derive(Clone)]
 struct EvictionCandidate {
     timeline: Arc<Timeline>,
     layer: Arc<dyn PersistentLayer>,
     last_activity_ts: SystemTime,
-    tenant_resident_size_overage: i64,
+}
+
+#[derive(Clone)]
+struct MinResidentSizePartitionedCandidates {
+    above: Vec<EvictionCandidate>,
+    below: Vec<EvictionCandidate>,
+}
+
+#[derive(Debug, Clone, Copy, PartialEq, Eq)]
+enum MinResidentSizePartition {
+    Above,
+    Below,
+}
+
+impl MinResidentSizePartitionedCandidates {
+    pub fn num_candidates(&self) -> usize {
+        self.above.len() + self.below.len()
+    }
+    pub fn into_iter_in_eviction_order(
+        self,
+    ) -> impl Iterator<Item = (MinResidentSizePartition, EvictionCandidate)> {
+        debug_assert!(is_sorted::IsSorted::is_sorted_by_key(
+            &mut self.above.iter(),
+            |c| c.last_activity_ts
+        ));
+        debug_assert!(is_sorted::IsSorted::is_sorted_by_key(
+            &mut self.below.iter(),
+            |c| c.last_activity_ts
+        ));
+        self.above
+            .into_iter()
+            .map(|c| (MinResidentSizePartition::Above, c))
+            .chain(
+                self.below
+                    .into_iter()
+                    .map(|c| (MinResidentSizePartition::Below, c)),
+            )
+    }
+}
+
+enum EvictionCandidates {
+    Cancelled,
+    Finished(MinResidentSizePartitionedCandidates),
 }

 /// Collect a list of all non-remote layers in the system, from all timelines in all tenants.
@@ -499,16 +538,18 @@ struct EvictionCandidate {
 ///
 async fn collect_eviction_candidates(
     cancel: &CancellationToken,
-) -> anyhow::Result<Vec<EvictionCandidate>> {
+) -> anyhow::Result<EvictionCandidates> {
     // get a snapshot of the list of tenants
     let tenants = tenant::mgr::list_tenants()
         .await
         .context("get list of tenants")?;

-    let mut candidates: Vec<EvictionCandidate> = Vec::new();
+    let mut above_min_resident_size: Vec<EvictionCandidate> = Vec::new();
+    let mut below_min_resident_size: Vec<EvictionCandidate> = Vec::new();
+
     for (tenant_id, _state) in &tenants {
         if cancel.is_cancelled() {
-            return Ok(candidates);
+            return Ok(EvictionCandidates::Cancelled);
         }
         let tenant = match tenant::mgr::get_tenant(*tenant_id, true).await {
             Ok(tenant) => tenant,
@@ -534,14 +575,10 @@ async fn collect_eviction_candidates(
             );

             if cancel.is_cancelled() {
-                return Ok(candidates);
+                return Ok(EvictionCandidates::Cancelled);
             }
         }

-        // sort this tenant's layers by last_activity_ts, calculate the "overage" for each
-        // layer, and add them to the result.
-        tenant_candidates.sort_unstable_by_key(|(_, layer_info)| layer_info.last_activity_ts);
-
         let min_resident_size = if let Some(s) = tenant.get_min_resident_size_override() {
             info!(
                 "using overridden min resident size {} for tenant {}",
@@ -564,31 +601,37 @@ async fn collect_eviction_candidates(
             s
         };

-        let mut cumulative_resident_size_overage: i128 = -i128::from(min_resident_size);
+        // Sort layers most-recently-used first
+        tenant_candidates
+            .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
+
+        let mut cumsum: i128 = 0;
         for (timeline, layer_info) in tenant_candidates.into_iter() {
             let file_size = layer_info.file_size();
-            candidates.push(EvictionCandidate {
+            let candidate = EvictionCandidate {
                 timeline,
                 last_activity_ts: layer_info.last_activity_ts,
                 layer: layer_info.layer,
-                tenant_resident_size_overage: cumulative_resident_size_overage
-                    .clamp(i64::MIN as i128, i64::MAX as i128)
-                    as i64,
-            });
-            cumulative_resident_size_overage += i128::from(file_size);
+            };
+            if cumsum > min_resident_size as i128 {
+                above_min_resident_size.push(candidate);
+            } else {
+                below_min_resident_size.push(candidate);
+            }
+            cumsum += i128::from(file_size);
         }
     }

-    // Final sort. Layers above their tenant's min-resident size threshold first, in
-    // LRU order, and then all the rest also in LRU order
-    candidates.sort_unstable_by_key(|candidate| {
-        (
-            candidate.tenant_resident_size_overage < 0,
-            candidate.last_activity_ts,
-        )
-    });
+    // The MinResidentSizePartitionedCandidates struct expects these to be sorted this way
+    above_min_resident_size.sort_unstable_by_key(|c| c.last_activity_ts);
+    below_min_resident_size.sort_unstable_by_key(|c| c.last_activity_ts);

-    Ok(candidates)
+    Ok(EvictionCandidates::Finished(
+        MinResidentSizePartitionedCandidates {
+            above: above_min_resident_size,
+            below: below_min_resident_size,
+        },
+    ))
 }

 struct TimelineKey(Arc<Timeline>);
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index 45de862388..e39556d801 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -16,6 +16,8 @@ from fixtures.neon_fixtures import (
 )
 from fixtures.types import TenantId, TimelineId

+GLOBAL_LRU_LOG_LINE = "tenant_min_resident_size-respecting LRU would not relieve pressure, evicting more following global LRU policy"
+

 @pytest.mark.parametrize("config_level_override", [None, 400])
 def test_min_resident_size_override_handling(
@@ -220,7 +222,7 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)

     time.sleep(1)  # give log time to flush
     assert not env.neon_env.pageserver.log_contains(
-        "falling back to global LRU"
+        GLOBAL_LRU_LOG_LINE,
     ), "this test is pointless if it fell back to global LRU"

     (later_total_on_disk, _, _) = env.timelines_du()
@@ -246,8 +248,8 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv)

 def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
     """
-    The pageserver should fall back to global LRU if the tenant_min_resident_size-respecting eviction
-    wouldn't evict enough.
+    If we can't relieve pressure using tenant_min_resident_size-respecting eviction,
+    we should continue to evict layers following global LRU.
     """
     env = eviction_env
     ps_http = env.pageserver_http
@@ -264,8 +266,8 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
     assert actual_change >= target, "eviction must always evict more than target"

     time.sleep(1)  # give log time to flush
-    assert env.neon_env.pageserver.log_contains("falling back to global LRU")
-    env.neon_env.pageserver.allowed_errors.append(".*falling back to global LRU")
+    assert env.neon_env.pageserver.log_contains(GLOBAL_LRU_LOG_LINE)
+    env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)


 def test_partial_evict_tenant(eviction_env: EvictionEnv):
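Reviewer note, not part of the patch: the following is a minimal standalone sketch of the partitioning scheme that the new collect_eviction_candidates / MinResidentSizePartitionedCandidates code implements. Candidate, Partition, partition_tenant, and eviction_order are simplified stand-ins invented for this sketch only; the real code works on per-timeline layers (Arc<dyn PersistentLayer>) and last_activity timestamps.

// Standalone sketch; all names here are illustrative, not pageserver types.
use std::time::{Duration, SystemTime};

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Partition {
    Above, // outside the tenant's min-resident-size reservation, evicted first
    Below, // inside the reservation, only evicted if pressure remains
}

#[derive(Debug, Clone)]
struct Candidate {
    file_size: u64,
    last_activity_ts: SystemTime,
}

/// Split one tenant's candidates at the min-resident-size boundary,
/// mirroring the cumsum logic in the patched collect_eviction_candidates.
fn partition_tenant(
    mut candidates: Vec<Candidate>,
    min_resident_size: u64,
) -> (Vec<Candidate>, Vec<Candidate>) {
    // Most-recently-used first, so the reservation covers the hottest layers.
    candidates.sort_unstable_by_key(|c| std::cmp::Reverse(c.last_activity_ts));
    let (mut above, mut below) = (Vec::new(), Vec::new());
    let mut cumsum: u128 = 0;
    for c in candidates {
        let size = u128::from(c.file_size);
        // cumsum is bumped *after* the comparison, so the most recently used
        // layer always lands in "below", keeping at least one layer resident.
        if cumsum > u128::from(min_resident_size) {
            above.push(c);
        } else {
            below.push(c);
        }
        cumsum += size;
    }
    // Each partition is consumed least-recently-used first at eviction time.
    above.sort_unstable_by_key(|c| c.last_activity_ts);
    below.sort_unstable_by_key(|c| c.last_activity_ts);
    (above, below)
}

/// Global eviction order: all "above" candidates (LRU first), then,
/// only if pressure remains, the "below" candidates (LRU first).
fn eviction_order(
    above: Vec<Candidate>,
    below: Vec<Candidate>,
) -> impl Iterator<Item = (Partition, Candidate)> {
    above
        .into_iter()
        .map(|c| (Partition::Above, c))
        .chain(below.into_iter().map(|c| (Partition::Below, c)))
}

fn main() {
    let now = SystemTime::now();
    let at = |secs_ago: u64| now - Duration::from_secs(secs_ago);
    let candidates = vec![
        Candidate { file_size: 10, last_activity_ts: at(30) },
        Candidate { file_size: 10, last_activity_ts: at(20) },
        Candidate { file_size: 10, last_activity_ts: at(10) },
    ];
    // A 15-byte reservation keeps the two most recently used layers "below";
    // only the coldest layer becomes an unconditional eviction candidate.
    let (above, below) = partition_tenant(candidates, 15);
    for (partition, candidate) in eviction_order(above, below) {
        println!("{:?}: last used {:?}", partition, candidate.last_activity_ts);
    }
}

Because cumsum is only increased after the comparison, every tenant keeps its most recently used layer in the "below" partition, which is how the "at least one resident layer per tenant" property from the module doc comment is preserved.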
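A second hedged sketch, this time of the planning loop in disk_usage_eviction_task_iteration_impl: walk the combined iterator until pressure is relieved, and record the first time a candidate from the Below partition has to be planned (the real code emits the "tenant_min_resident_size-respecting LRU would not relieve pressure ..." warning at that point). DiskUsage, plan, and the byte figures below are illustrative stand-ins, not the actual Usage / PlannedUsage machinery.

// Standalone sketch; DiskUsage stands in for the real usage bookkeeping.
#[derive(Debug, Clone, Copy)]
struct DiskUsage {
    used_bytes: u64,
    target_bytes: u64, // evict until usage drops to (at most) this value
}

impl DiskUsage {
    fn has_pressure(&self) -> bool {
        self.used_bytes > self.target_bytes
    }
    fn add_available_bytes(&mut self, bytes: u64) {
        self.used_bytes = self.used_bytes.saturating_sub(bytes);
    }
}

#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Partition {
    Above,
    Below,
}

/// Walk candidates in eviction order (all Above first, then Below) and stop
/// as soon as pressure is relieved. Returns the planned bytes and whether any
/// tenant's min-resident-size reservation had to be broken into.
fn plan(
    mut usage: DiskUsage,
    candidates: impl IntoIterator<Item = (Partition, u64)>, // (partition, layer file size)
) -> (u64, bool) {
    let mut planned_bytes: u64 = 0;
    let mut dipped_into_reservation = false;
    for (partition, file_size) in candidates {
        if !usage.has_pressure() {
            break;
        }
        if partition == Partition::Below {
            // The patch logs its warn!() the first time this branch is hit.
            dipped_into_reservation = true;
        }
        usage.add_available_bytes(file_size);
        planned_bytes += file_size;
    }
    (planned_bytes, dipped_into_reservation)
}

fn main() {
    // 100 MiB used, target 60 MiB, three 20 MiB candidate layers; the last
    // one sits inside a reservation but is never needed.
    let usage = DiskUsage { used_bytes: 100 << 20, target_bytes: 60 << 20 };
    let candidates = vec![
        (Partition::Above, 20 << 20),
        (Partition::Above, 20 << 20),
        (Partition::Below, 20 << 20),
    ];
    let (planned, dipped) = plan(usage, candidates);
    assert_eq!(planned, 40 << 20);
    assert!(!dipped);
}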