diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs
index bd63c4d860..8516f397ca 100644
--- a/pageserver/src/config.rs
+++ b/pageserver/src/config.rs
@@ -1468,6 +1468,7 @@ threshold = "20m"
                 period: Duration::from_secs(10),
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
+                eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
             })
         );
         match &conf.default_tenant_conf.eviction_policy {
diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs
index 76906cfaf7..23b9b573b6 100644
--- a/pageserver/src/disk_usage_eviction_task.rs
+++ b/pageserver/src/disk_usage_eviction_task.rs
@@ -74,6 +74,45 @@ pub struct DiskUsageEvictionTaskConfig {
     pub period: Duration,
     #[cfg(feature = "testing")]
     pub mock_statvfs: Option<crate::statvfs::mock::Behavior>,
+    /// Selects the sort order for layers to evict.
+    #[serde(default)]
+    pub eviction_order: EvictionOrder,
+}
+
+/// Selects the sort order for eviction candidates *after* per-tenant `min_resident_size`
+/// partitioning.
+#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[serde(tag = "type", content = "args")]
+pub enum EvictionOrder {
+    /// Order the layers to be evicted by how recently they have been accessed in absolute
+    /// time.
+    ///
+    /// This strategy is unfair towards the slower-growing tenants whenever some tenants grow
+    /// much faster than others.
+    #[default]
+    AbsoluteAccessed,
+
+    /// Order the layers to be evicted by how recently they have been accessed relatively within
+    /// the set of resident layers of a tenant.
+    ///
+    /// This strategy will evict layers more fairly but is untested.
+    RelativeAccessed {
+        #[serde(default)]
+        highest_layer_count_loses_first: bool,
+    },
+}
+
+impl EvictionOrder {
+    /// Returns true if, with the [`Self::RelativeAccessed`] order, the tenants with the highest
+    /// layer counts should be the first ones to have their layers evicted.
+    fn highest_layer_count_loses_first(&self) -> bool {
+        match self {
+            EvictionOrder::AbsoluteAccessed => false,
+            EvictionOrder::RelativeAccessed {
+                highest_layer_count_loses_first,
+            } => *highest_layer_count_loses_first,
+        }
+    }
 }
 
 #[derive(Default)]
@@ -192,7 +231,14 @@ async fn disk_usage_eviction_task_iteration(
 ) -> anyhow::Result<()> {
     let usage_pre = filesystem_level_usage::get(tenants_dir, task_config)
         .context("get filesystem-level disk usage before evictions")?;
-    let res = disk_usage_eviction_task_iteration_impl(state, storage, usage_pre, cancel).await;
+    let res = disk_usage_eviction_task_iteration_impl(
+        state,
+        storage,
+        usage_pre,
+        task_config.eviction_order,
+        cancel,
+    )
+    .await;
     match res {
         Ok(outcome) => {
             debug!(?outcome, "disk_usage_eviction_iteration finished");
@@ -278,6 +324,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     state: &State,
     _storage: &GenericRemoteStorage,
     usage_pre: U,
+    eviction_order: EvictionOrder,
     cancel: &CancellationToken,
 ) -> anyhow::Result<IterationOutcome<U>> {
     // use tokio's mutex to get a Sync guard (instead of std::sync::Mutex)
@@ -297,7 +344,7 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
         "running disk usage based eviction due to pressure"
     );
 
-    let candidates = match collect_eviction_candidates(cancel).await? {
+    let candidates = match collect_eviction_candidates(eviction_order, cancel).await? {
         EvictionCandidates::Cancelled => {
             return Ok(IterationOutcome::Cancelled);
         }
@@ -307,16 +354,16 @@ pub(crate) async fn disk_usage_eviction_task_iteration_impl<U: Usage>(
     // Debug-log the list of candidates
     let now = SystemTime::now();
     for (i, (partition, candidate)) in candidates.iter().enumerate() {
+        let nth = i + 1;
         let desc = candidate.layer.layer_desc();
+        let total_candidates = candidates.len();
+        let size = desc.file_size;
+        let rel = candidate.relative_last_activity;
         debug!(
-            "cand {}/{}: size={}, no_access_for={}us, partition={:?}, {}/{}/{}",
-            i + 1,
-            candidates.len(),
-            desc.file_size,
+            "cand {nth}/{total_candidates}: size={size}, rel_last_activity={rel}, no_access_for={}us, partition={partition:?}, {}/{}/{}",
             now.duration_since(candidate.last_activity_ts)
                 .unwrap()
                 .as_micros(),
-            partition,
             desc.tenant_shard_id,
             desc.timeline_id,
             candidate.layer,
@@ -459,6 +506,7 @@ struct EvictionCandidate {
     timeline: Arc<Timeline>,
     layer: Layer,
     last_activity_ts: SystemTime,
+    relative_last_activity: finite_f32::FiniteF32,
 }
 
 #[derive(Debug, Clone, Copy, PartialEq, Eq, PartialOrd, Ord)]
@@ -478,24 +526,24 @@ enum EvictionCandidates {
 /// order. A caller that evicts in that order, until pressure is relieved, implements
 /// the eviction policy outlined in the module comment.
 ///
-/// # Example
+/// # Example with EvictionOrder::AbsoluteAccessed
 ///
 /// Imagine that there are two tenants, A and B, with five layers each, a-e.
 /// Each layer has size 100, and both tenant's min_resident_size is 150.
 /// The eviction order would be
 ///
 /// ```text
-/// partition last_activity_ts tenant/layer
-/// Above 18:30 A/c
-/// Above 19:00 A/b
-/// Above 18:29 B/c
-/// Above 19:05 B/b
-/// Above 20:00 B/a
-/// Above 20:03 A/a
-/// Below 20:30 A/d
-/// Below 20:40 B/d
-/// Below 20:45 B/e
-/// Below 20:58 A/e
+/// partition  last_activity_ts  tenant/layer
+/// Above      18:30             A/c
+/// Above      19:00             A/b
+/// Above      18:29             B/c
+/// Above      19:05             B/b
+/// Above      20:00             B/a
+/// Above      20:03             A/a
+/// Below      20:30             A/d
+/// Below      20:40             B/d
+/// Below      20:45             B/e
+/// Below      20:58             A/e
 /// ```
 ///
 /// Now, if we need to evict 300 bytes to relieve pressure, we'd evict `A/c, A/b, B/c`.
@@ -505,7 +553,77 @@ enum EvictionCandidates {
 /// `A/c, A/b, B/c, B/b, B/a, A/a, A/d, B/d, B/e`, reaching into the `Below` partition
 /// after exhauting the `Above` partition.
 /// So, we did not respect each tenant's min_resident_size.
+///
+/// # Example with EvictionOrder::RelativeAccessed
+///
+/// ```text
+/// partition  relative_age  last_activity_ts  tenant/layer
+/// Above      0/4           18:30             A/c
+/// Above      0/4           18:29             B/c
+/// Above      1/4           19:00             A/b
+/// Above      1/4           19:05             B/b
+/// Above      2/4           20:00             B/a
+/// Above      2/4           20:03             A/a
+/// Below      3/4           20:30             A/d
+/// Below      3/4           20:40             B/d
+/// Below      4/4           20:45             B/e
+/// Below      4/4           20:58             A/e
+/// ```
+///
+/// With tenants having the same number of layers, the picture does not change much. Here is the
+/// same example with A having many more layers **resident** (not all of them listed):
+///
+/// ```text
+/// Above      0/100         18:30             A/c
+/// Above      0/4           18:29             B/c
+/// Above      1/100         19:00             A/b
+/// Above      2/100         20:03             A/a
+/// Above      3/100         20:03             A/nth_3
+/// Above      4/100         20:03             A/nth_4
+/// ...
+/// Above      1/4           19:05             B/b
+/// Above      25/100        20:04             A/nth_25
+/// ...
+/// Above      2/4           20:00             B/a
+/// Above      50/100        20:10             A/nth_50
+/// ...
+/// Below      3/4           20:40             B/d
+/// Below      99/100        20:30             A/nth_99
+/// Below      4/4           20:45             B/e
+/// Below      100/100       20:58             A/nth_100
+/// ```
+///
+/// Now it is easier to see that, because A has grown fast, it has more layers to be evicted. What
+/// is more difficult to see is what happens on the next round, assuming that evicting the first 23
+/// layers from the above list relieves the pressure (22 A layers gone, 1 B layer gone) but a new
+/// fast-growing tenant C has appeared:
+///
+/// ```text
+/// Above      0/87          20:04             A/nth_23
+/// Above      0/3           19:05             B/b
+/// Above      0/50          20:59             C/nth_0
+/// Above      1/87          20:04             A/nth_24
+/// Above      1/50          21:00             C/nth_1
+/// Above      2/87          20:04             A/nth_25
+/// ...
+/// Above      16/50         21:02             C/nth_16
+/// Above      1/3           20:00             B/a
+/// Above      27/87         20:10             A/nth_50
+/// ...
+/// Below      2/3           20:40             B/d
+/// Below      49/50         21:05             C/nth_49
+/// Below      86/87         20:30             A/nth_99
+/// Below      3/3           20:45             B/e
+/// Below      50/50         21:05             C/nth_50
+/// Below      87/87         20:58             A/nth_100
+/// ```
+///
+/// Now relieving pressure with 23 layers would cost:
+/// - tenant A 14 layers
+/// - tenant B 1 layer
+/// - tenant C 8 layers
 async fn collect_eviction_candidates(
+    eviction_order: EvictionOrder,
     cancel: &CancellationToken,
 ) -> anyhow::Result<EvictionCandidates> {
     // get a snapshot of the list of tenants
@@ -591,12 +709,63 @@ async fn collect_eviction_candidates(
         tenant_candidates
             .sort_unstable_by_key(|(_, layer_info)| std::cmp::Reverse(layer_info.last_activity_ts));
         let mut cumsum: i128 = 0;
-        for (timeline, layer_info) in tenant_candidates.into_iter() {
+
+        // keeping the -1 or not decides whether every tenant should lose its least recently
+        // accessed layer, OR whether this should happen in order of highest layer count:
+        let fudge = if eviction_order.highest_layer_count_loses_first() {
+            // relative_age vs. tenant layer count:
+            // - 0.1..=1.0 (10 layers)
+            // - 0.01..=1.0 (100 layers)
+            // - 0.001..=1.0 (1000 layers)
+            //
+            // leading to evicting less of the smallest tenants.
+            0
+        } else {
+            // use the full 0.0..=1.0 range, which means even the smallest tenants could always lose
+            // a layer. the actual ordering is unspecified: for 10k tenants on a pageserver it could
+            // be that less than 10k layer evictions is enough, so we would not need to evict from
+            // all tenants.
+            //
+            // as the tenant ordering is now deterministic this could hit the same tenants
+            // disproportionately on multiple invocations. an alternative could be to remember how
+            // many layers we evicted from this tenant last time, and inject that as an additional
+            // fudge here.
+            1
+        };
+
+        let total = tenant_candidates
+            .len()
+            .checked_sub(fudge)
+            .filter(|&x| x > 0)
+            // support 0 or 1 resident layer tenants as well
+            .unwrap_or(1);
+        let divider = total as f32;
+
+        for (i, (timeline, layer_info)) in tenant_candidates.into_iter().enumerate() {
             let file_size = layer_info.file_size();
+
+            // as we iterate this reverse-sorted list, the most recently accessed layer will always
+            // be 1.0; this is so we evict it last.
+            let relative_last_activity = if matches!(
+                eviction_order,
+                EvictionOrder::RelativeAccessed { .. }
+            ) {
+                // another possibility: use buckets, like (256.0 * relative_last_activity) as u8 or
+                // similarly for u16. unsure how it would help.
+                finite_f32::FiniteF32::try_from_normalized((total - i) as f32 / divider)
+                    .unwrap_or_else(|val| {
+                        tracing::warn!(%fudge, "calculated invalid relative_last_activity for i={i}, total={total}: {val}");
+                        finite_f32::FiniteF32::ZERO
+                    })
+            } else {
+                finite_f32::FiniteF32::ZERO
+            };
+
             let candidate = EvictionCandidate {
                 timeline,
                 last_activity_ts: layer_info.last_activity_ts,
                 layer: layer_info.layer,
+                relative_last_activity,
             };
             let partition = if cumsum > min_resident_size as i128 {
                 MinResidentSizePartition::Above
@@ -610,8 +779,19 @@ async fn collect_eviction_candidates(
     debug_assert!(MinResidentSizePartition::Above < MinResidentSizePartition::Below,
         "as explained in the function's doc comment, layers that aren't in the tenant's min_resident_size are evicted first");
-    candidates
-        .sort_unstable_by_key(|(partition, candidate)| (*partition, candidate.last_activity_ts));
+
+    match eviction_order {
+        EvictionOrder::AbsoluteAccessed => {
+            candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.last_activity_ts)
+            });
+        }
+        EvictionOrder::RelativeAccessed { .. } => {
+            candidates.sort_unstable_by_key(|(partition, candidate)| {
+                (*partition, candidate.relative_last_activity)
+            });
+        }
+    }
 
     Ok(EvictionCandidates::Finished(candidates))
 }
@@ -640,6 +820,66 @@ impl std::ops::Deref for TimelineKey {
     }
 }
 
+/// A totally ordered f32 subset we can use with sorting functions.
+mod finite_f32 {
+
+    /// A totally ordered f32 subset we can use with sorting functions.
+    #[derive(Clone, Copy, PartialEq)]
+    pub struct FiniteF32(f32);
+
+    impl std::fmt::Debug for FiniteF32 {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            std::fmt::Debug::fmt(&self.0, f)
+        }
+    }
+
+    impl std::fmt::Display for FiniteF32 {
+        fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+            std::fmt::Display::fmt(&self.0, f)
+        }
+    }
+
+    impl std::cmp::Eq for FiniteF32 {}
+
+    impl std::cmp::PartialOrd for FiniteF32 {
+        fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+            Some(self.cmp(other))
+        }
+    }
+
+    impl std::cmp::Ord for FiniteF32 {
+        fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+            self.0.total_cmp(&other.0)
+        }
+    }
+
+    impl TryFrom<f32> for FiniteF32 {
+        type Error = f32;
+
+        fn try_from(value: f32) -> Result<Self, Self::Error> {
+            if value.is_finite() {
+                Ok(FiniteF32(value))
+            } else {
+                Err(value)
+            }
+        }
+    }
+
+    impl FiniteF32 {
+        pub const ZERO: FiniteF32 = FiniteF32(0.0);
+
+        pub fn try_from_normalized(value: f32) -> Result<Self, f32> {
+            if (0.0..=1.0).contains(&value) {
+                // -0.0 is within the range; make sure it is treated as 0.0
+                let value = value.abs();
+                Ok(FiniteF32(value))
+            } else {
+                Err(value)
+            }
+        }
+    }
+}
+
 mod filesystem_level_usage {
     use anyhow::Context;
     use camino::Utf8Path;
@@ -721,6 +961,7 @@ mod filesystem_level_usage {
 
     #[test]
     fn max_usage_pct_pressure() {
+        use super::EvictionOrder;
         use super::Usage as _;
         use std::time::Duration;
         use utils::serde_percent::Percent;
@@ -732,6 +973,7 @@ mod filesystem_level_usage {
                 period: Duration::MAX,
                 #[cfg(feature = "testing")]
                 mock_statvfs: None,
+                eviction_order: EvictionOrder::default(),
             },
             total_bytes: 100_000,
             avail_bytes: 0,
diff --git a/pageserver/src/http/routes.rs b/pageserver/src/http/routes.rs
index e641e44b08..3ea79ea4f2 100644
--- a/pageserver/src/http/routes.rs
+++ b/pageserver/src/http/routes.rs
@@ -1566,19 +1566,22 @@ async fn disk_usage_eviction_run(
     struct Config {
         /// How many bytes to evict before reporting that pressure is relieved.
         evict_bytes: u64,
+
+        #[serde(default)]
+        eviction_order: crate::disk_usage_eviction_task::EvictionOrder,
     }
 
     #[derive(Debug, Clone, Copy, serde::Serialize)]
     struct Usage {
         // remains unchanged after instantiation of the struct
-        config: Config,
+        evict_bytes: u64,
         // updated by `add_available_bytes`
         freed_bytes: u64,
     }
 
     impl crate::disk_usage_eviction_task::Usage for Usage {
         fn has_pressure(&self) -> bool {
-            self.config.evict_bytes > self.freed_bytes
+            self.evict_bytes > self.freed_bytes
         }
 
         fn add_available_bytes(&mut self, bytes: u64) {
@@ -1589,7 +1592,7 @@ async fn disk_usage_eviction_run(
     let config = json_request::<Config>(&mut r).await?;
 
     let usage = Usage {
-        config,
+        evict_bytes: config.evict_bytes,
         freed_bytes: 0,
     };
 
@@ -1604,7 +1607,11 @@ async fn disk_usage_eviction_run(
     let state = state.disk_usage_eviction_state.clone();
 
     let res = crate::disk_usage_eviction_task::disk_usage_eviction_task_iteration_impl(
-        &state, storage, usage, &cancel,
+        &state,
+        storage,
+        usage,
+        config.eviction_order,
+        &cancel,
     )
     .await;
 
diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py
index f3f3a1ddf3..9fdc4d59f5 100644
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -1,6 +1,7 @@
+import enum
 import time
 from dataclasses import dataclass
-from typing import Dict, Tuple
+from typing import Any, Dict, Tuple
 
 import pytest
 import toml
@@ -64,6 +65,23 @@ def test_min_resident_size_override_handling(
     assert_config(tenant_id, None, config_level_override)
 
 
+@enum.unique
+class EvictionOrder(str, enum.Enum):
+    ABSOLUTE_ORDER = "absolute"
+    RELATIVE_ORDER_EQUAL = "relative_equal"
+    RELATIVE_ORDER_SPARE = "relative_spare"
+
+    def config(self) -> Dict[str, Any]:
+        if self == EvictionOrder.ABSOLUTE_ORDER:
+            return {"type": "AbsoluteAccessed"}
+        elif self == EvictionOrder.RELATIVE_ORDER_EQUAL:
+            return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}}
+        elif self == EvictionOrder.RELATIVE_ORDER_SPARE:
+            return {"type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": True}}
+        else:
+            raise RuntimeError(f"not implemented: {self}")
+
+
 @dataclass
 class EvictionEnv:
     timelines: list[Tuple[TenantId, TimelineId]]
@@ -108,13 +126,14 @@ class EvictionEnv:
             _avg = cur.fetchone()
 
     def pageserver_start_with_disk_usage_eviction(
-        self, period, max_usage_pct, min_avail_bytes, mock_behavior
+        self, period, max_usage_pct, min_avail_bytes, mock_behavior, eviction_order: EvictionOrder
     ):
         disk_usage_config = {
            "period": period,
            "max_usage_pct": max_usage_pct,
            "min_avail_bytes": min_avail_bytes,
            "mock_statvfs": mock_behavior,
+           "eviction_order": eviction_order.config(),
         }
 
         enc = toml.TomlEncoder()
@@ -270,7 +289,13 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
     env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE)
 
 
-def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv):
+@pytest.mark.parametrize(
+    "order",
+    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
+)
+def test_pageserver_evicts_until_pressure_is_relieved(
+    eviction_env: EvictionEnv, order: EvictionOrder
+):
     """
     Basic test to ensure that we evict enough to relieve pressure.
""" @@ -281,7 +306,9 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) target = total_on_disk // 2 - response = pageserver_http.disk_usage_eviction_run({"evict_bytes": target}) + response = pageserver_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -296,7 +323,13 @@ def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv) assert response["Finished"]["assumed"]["failed"]["count"] == 0, "zero failures expected" -def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_respects_overridden_resident_size( + eviction_env: EvictionEnv, order: EvictionOrder +): """ Override tenant min resident and ensure that it will be respected by eviction. """ @@ -336,7 +369,9 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) env.warm_up_tenant(large_tenant[0]) # do one run - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") time.sleep(1) # give log time to flush @@ -365,7 +400,11 @@ def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv) assert du_by_timeline[large_tenant] - later_du_by_timeline[large_tenant] >= target -def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], +) +def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ If we can't relieve pressure using tenant_min_resident_size-respecting eviction, we should continue to evict layers following global LRU. @@ -376,7 +415,9 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() target = total_on_disk - response = ps_http.disk_usage_eviction_run({"evict_bytes": target}) + response = ps_http.disk_usage_eviction_run( + {"evict_bytes": target, "eviction_order": order.config()} + ) log.info(f"{response}") (later_total_on_disk, _, _) = env.timelines_du() @@ -389,7 +430,15 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv): env.neon_env.pageserver.allowed_errors.append(".*" + GLOBAL_LRU_LOG_LINE) -def test_partial_evict_tenant(eviction_env: EvictionEnv): +@pytest.mark.parametrize( + "order", + [ + EvictionOrder.ABSOLUTE_ORDER, + EvictionOrder.RELATIVE_ORDER_EQUAL, + EvictionOrder.RELATIVE_ORDER_SPARE, + ], +) +def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): """ Warm up a tenant, then build up pressure to cause in evictions in both. We expect @@ -402,7 +451,7 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): (total_on_disk, _, _) = env.timelines_du() du_by_timeline = env.du_by_timeline() - # pick any tenant + # pick smaller or greater (iteration order is insertion order of scale=4 and scale=6) [warm, cold] = list(du_by_timeline.keys()) (tenant_id, timeline_id) = warm @@ -413,7 +462,9 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv): # but not enough to fall into global LRU. 
     # So, set target to all occupied space, except 2*env.layer_size per tenant
     target = du_by_timeline[cold] + (du_by_timeline[warm] // 2) - 2 * 2 * env.layer_size
-    response = ps_http.disk_usage_eviction_run({"evict_bytes": target})
+    response = ps_http.disk_usage_eviction_run(
+        {"evict_bytes": target, "eviction_order": order.config()}
+    )
     log.info(f"{response}")
 
     (later_total_on_disk, _, _) = env.timelines_du()
@@ -428,28 +479,32 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv):
     ), "all tenants should have lost some layers"
 
     warm_size = later_du_by_timeline[warm]
-
-    # bounds for warmed_size
-    warm_lower = 0.5 * du_by_timeline[warm]
-
-    # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
-    # So, check for up to 3 here.
-    warm_upper = warm_lower + 3 * env.layer_size
-
     cold_size = later_du_by_timeline[cold]
-    cold_upper = 2 * env.layer_size
-
-    log.info(
-        f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
-    )
-    log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
+    if order == EvictionOrder.ABSOLUTE_ORDER:
+        # bounds for warmed_size
+        warm_lower = 0.5 * du_by_timeline[warm]
 
-    assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
-    assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
+        # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
+        # So, check for up to 3 here.
+        warm_upper = warm_lower + 3 * env.layer_size
 
-    assert (
-        cold_size < cold_upper
-    ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
+        cold_upper = 2 * env.layer_size
+        log.info(f"tenants: warm={warm[0]}, cold={cold[0]}")
+        log.info(
+            f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
+        )
+        log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
+
+        assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
+        assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
+
+        assert (
+            cold_size < cold_upper
+        ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
+    else:
+        # just rely on the space-was-freed assertions above; find proper limits later
+        pass
 
 
 def poor_mans_du(
@@ -501,6 +556,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
            "type": "Failure",
            "mocked_error": "EIO",
         },
+        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
     assert env.neon_env.pageserver.log_contains(".*statvfs failed.*EIO")
@@ -533,6 +589,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
            # This avoids accounting for metadata files & tenant conf in the tests.
            "name_filter": ".*__.*",
         },
+        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
     )
 
     def relieved_log_message():
@@ -573,6 +630,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
            # This avoids accounting for metadata files & tenant conf in the tests.
            "name_filter": ".*__.*",
         },
+        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
    )
 
     def relieved_log_message():
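Editor's note, not part of the patch: the sketch below reproduces the `RelativeAccessed` normalization from `collect_eviction_candidates` in isolation, to make the `fudge` behaviour easier to see. The helper name `relative_last_activities`, the `main` driver, and the 4-layer/100-layer tenant sizes are illustrative assumptions only; the `(total - i) / divider` math and the `fudge` values mirror the diff above.

```rust
// Standalone illustration of the RelativeAccessed normalization.
// `relative_last_activities` is a hypothetical helper, not part of the patch.
fn relative_last_activities(layer_count: usize, highest_layer_count_loses_first: bool) -> Vec<f32> {
    // fudge = 0 shrinks small tenants' ranges (e.g. 0.25..=1.0 for 4 layers),
    // fudge = 1 (the default) lets every tenant reach relative age 0.0.
    let fudge = if highest_layer_count_loses_first { 0 } else { 1 };
    // Mirror the patch: avoid a zero divider for tenants with 0 or 1 resident layers.
    let total = layer_count
        .checked_sub(fudge)
        .filter(|&x| x > 0)
        .unwrap_or(1);
    let divider = total as f32;
    // Index 0 is the most recently accessed layer (value 1.0, evicted last);
    // the last index is the least recently accessed (evicted first).
    (0..layer_count)
        .map(|i| total.saturating_sub(i) as f32 / divider)
        .collect()
}

fn main() {
    for &flag in &[false, true] {
        let small = relative_last_activities(4, flag);
        let large = relative_last_activities(100, flag);
        // With flag=false both tenants bottom out at 0.00; with flag=true the
        // small tenant bottoms out at 0.25 vs 0.01 for the large one, so the
        // larger (faster-growing) tenant loses its layers first.
        println!(
            "highest_layer_count_loses_first={flag}: small tenant min={:.2}, large tenant min={:.2}",
            small.last().unwrap(),
            large.last().unwrap(),
        );
    }
}
```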