From 98af1e365bbbf71f21a3317313dc2407d7f8937a Mon Sep 17 00:00:00 2001 From: John Spray Date: Mon, 22 Jul 2024 13:15:55 +0100 Subject: [PATCH] pageserver: remove absolute-order disk usage eviction (#8454) ## Problem Deployed pageserver configurations are all like this: ``` disk_usage_based_eviction: max_usage_pct: 85 min_avail_bytes: 0 period: "10s" eviction_order: type: "RelativeAccessed" args: highest_layer_count_loses_first: true ``` But we're maintaining this optional absolute order eviction, with test cases etc. ## Summary of changes - Remove absolute order eviction. Make the default eviction policy the same as how we really deploy pageservers. --- pageserver/src/config.rs | 2 +- pageserver/src/disk_usage_eviction_task.rs | 24 ++--- .../regress/test_disk_usage_eviction.py | 96 ++++++------------- 3 files changed, 40 insertions(+), 82 deletions(-) diff --git a/pageserver/src/config.rs b/pageserver/src/config.rs index 35b4e79365..6a78d126cf 100644 --- a/pageserver/src/config.rs +++ b/pageserver/src/config.rs @@ -1601,7 +1601,7 @@ threshold = "20m" period: Duration::from_secs(10), #[cfg(feature = "testing")] mock_statvfs: None, - eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed, + eviction_order: Default::default(), }) ); diff --git a/pageserver/src/disk_usage_eviction_task.rs b/pageserver/src/disk_usage_eviction_task.rs index 90bd4294bb..103e549d22 100644 --- a/pageserver/src/disk_usage_eviction_task.rs +++ b/pageserver/src/disk_usage_eviction_task.rs @@ -83,17 +83,9 @@ pub struct DiskUsageEvictionTaskConfig { /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size` /// partitioning. -#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] +#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)] #[serde(tag = "type", content = "args")] pub enum EvictionOrder { - /// Order the layers to be evicted by how recently they have been accessed in absolute - /// time. - /// - /// This strategy is unfair when some tenants grow faster than others towards the slower - /// growing. - #[default] - AbsoluteAccessed, - /// Order the layers to be evicted by how recently they have been accessed relatively within /// the set of resident layers of a tenant. RelativeAccessed { @@ -108,6 +100,14 @@ pub enum EvictionOrder { }, } +impl Default for EvictionOrder { + fn default() -> Self { + Self::RelativeAccessed { + highest_layer_count_loses_first: true, + } + } +} + fn default_highest_layer_count_loses_first() -> bool { true } @@ -117,11 +117,6 @@ impl EvictionOrder { use EvictionOrder::*; match self { - AbsoluteAccessed => { - candidates.sort_unstable_by_key(|(partition, candidate)| { - (*partition, candidate.last_activity_ts) - }); - } RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| { (*partition, candidate.relative_last_activity) }), @@ -134,7 +129,6 @@ impl EvictionOrder { use EvictionOrder::*; match self { - AbsoluteAccessed => finite_f32::FiniteF32::ZERO, RelativeAccessed { highest_layer_count_loses_first, } => { diff --git a/test_runner/regress/test_disk_usage_eviction.py b/test_runner/regress/test_disk_usage_eviction.py index 3c834f430b..930fb14947 100644 --- a/test_runner/regress/test_disk_usage_eviction.py +++ b/test_runner/regress/test_disk_usage_eviction.py @@ -67,14 +67,11 @@ def test_min_resident_size_override_handling( @enum.unique class EvictionOrder(str, enum.Enum): - ABSOLUTE_ORDER = "absolute" RELATIVE_ORDER_EQUAL = "relative_equal" RELATIVE_ORDER_SPARE = "relative_spare" def config(self) -> Dict[str, Any]: - if self == EvictionOrder.ABSOLUTE_ORDER: - return {"type": "AbsoluteAccessed"} - elif self == EvictionOrder.RELATIVE_ORDER_EQUAL: + if self == EvictionOrder.RELATIVE_ORDER_EQUAL: return { "type": "RelativeAccessed", "args": {"highest_layer_count_loses_first": False}, @@ -384,7 +381,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv): @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_evicts_until_pressure_is_relieved( eviction_env: EvictionEnv, order: EvictionOrder @@ -418,7 +415,7 @@ def test_pageserver_evicts_until_pressure_is_relieved( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_respects_overridden_resident_size( eviction_env: EvictionEnv, order: EvictionOrder @@ -495,7 +492,7 @@ def test_pageserver_respects_overridden_resident_size( @pytest.mark.parametrize( "order", - [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL], + [EvictionOrder.RELATIVE_ORDER_EQUAL], ) def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder): """ @@ -526,7 +523,6 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -572,63 +568,38 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder): later_tenant_usage < du_by_timeline[tenant] ), "all tenants should have lost some layers" - warm_size = later_du_by_timeline[warm] - cold_size = later_du_by_timeline[cold] + # with relative order what matters is the amount of layers, with a + # fudge factor of whether the eviction bothers tenants with highest + # layer count the most. last accessed times between tenants does not + # matter. + assert order in [EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE] + layers_now = env.count_layers_per_tenant(env.pageserver) - if order == EvictionOrder.ABSOLUTE_ORDER: - # bounds for warmed_size - warm_lower = 0.5 * du_by_timeline[warm] + expected_ratio = later_total_on_disk / total_on_disk + log.info( + f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" + ) - # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room. - # So, check for up to 3 here. - warm_upper = warm_lower + 3 * env.layer_size + for tenant_id, original_count in tenant_layers.items(): + count_now = layers_now[tenant_id] + ratio = count_now / original_count + abs_diff = abs(ratio - expected_ratio) + assert original_count > count_now - cold_upper = 2 * env.layer_size - log.info(f"tenants: warm={warm[0]}, cold={cold[0]}") + expectation = 0.06 log.info( - f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}" + f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" ) - log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}") - - assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)" - assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)" - - assert ( - cold_size < cold_upper - ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size" - else: - # with relative order what matters is the amount of layers, with a - # fudge factor of whether the eviction bothers tenants with highest - # layer count the most. last accessed times between tenants does not - # matter. - layers_now = env.count_layers_per_tenant(env.pageserver) - - expected_ratio = later_total_on_disk / total_on_disk - log.info( - f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio" - ) - - for tenant_id, original_count in tenant_layers.items(): - count_now = layers_now[tenant_id] - ratio = count_now / original_count - abs_diff = abs(ratio - expected_ratio) - assert original_count > count_now - - expectation = 0.06 - log.info( - f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}" - ) - # in this test case both relative_spare and relative_equal produce - # the same outcomes; this must be a quantization effect of similar - # sizes (-s4 and -s6) and small (5MB) layer size. - # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 - assert abs_diff < expectation + # in this test case both relative_spare and relative_equal produce + # the same outcomes; this must be a quantization effect of similar + # sizes (-s4 and -s6) and small (5MB) layer size. + # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02 + assert abs_diff < expectation @pytest.mark.parametrize( "order", [ - EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE, ], @@ -680,14 +651,7 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order" log.info(f"{ratios}") - if order == EvictionOrder.ABSOLUTE_ORDER: - # first tenant loses most - assert ratios[0] <= ratios[1], "first should lose the most" - assert ratios[1] < ratios[2], "second should lose some" - assert ratios[1] < 1.0 - assert ratios[2] <= ratios[3], "third might not lose" - assert ratios[3] == 1.0, "tenant created last does not lose" - elif order == EvictionOrder.RELATIVE_ORDER_EQUAL: + if order == EvictionOrder.RELATIVE_ORDER_EQUAL: assert all([x for x in ratios if x < 1.0]), "all tenants lose layers" elif order == EvictionOrder.RELATIVE_ORDER_SPARE: # with different layer sizes and pg versions, there are different combinations @@ -750,7 +714,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv): "type": "Failure", "mocked_error": "EIO", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO") @@ -784,7 +748,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until( @@ -837,7 +801,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv): # This avoids accounting for metadata files & tenant conf in the tests. "name_filter": ".*__.*", }, - eviction_order=EvictionOrder.ABSOLUTE_ORDER, + eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE, ) wait_until(