# pageserver: remove absolute-order disk usage eviction (#8454)

## Problem

All deployed pageserver configurations look like this:
```yaml
      disk_usage_based_eviction:
        max_usage_pct: 85
        min_avail_bytes: 0
        period: "10s"
        eviction_order:
          type: "RelativeAccessed"
          args:
            highest_layer_count_loses_first: true
```

Yet we are still maintaining the optional absolute-order eviction as an
alternative, along with its test cases.
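
As a sanity check of what that deployed snippet means in code, here is a minimal,
hypothetical sketch of how the `eviction_order` subtree maps onto the
`#[serde(tag = "type", content = "args")]` enum shown in the diff below. It uses
`serde_yaml` for illustration, which may not be what the pageserver's config
parser actually uses:

```rust
use serde::Deserialize;

// Stripped-down mirror of the pageserver's `EvictionOrder` enum for this sketch;
// the real one lives in disk_usage_eviction_task.rs with more derives and docs.
#[derive(Debug, PartialEq, Deserialize)]
#[serde(tag = "type", content = "args")]
enum EvictionOrder {
    RelativeAccessed { highest_layer_count_loses_first: bool },
}

fn main() {
    // Just the `eviction_order` subtree of the deployed YAML above.
    let yaml = r#"
type: "RelativeAccessed"
args:
  highest_layer_count_loses_first: true
"#;
    let order: EvictionOrder = serde_yaml::from_str(yaml).unwrap();
    assert_eq!(
        order,
        EvictionOrder::RelativeAccessed { highest_layer_count_loses_first: true }
    );
}
```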

## Summary of changes

- Remove absolute-order eviction, and make the default eviction policy match
how we actually deploy pageservers.
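
To make the new behavior concrete, here is a small self-contained sketch
(mirroring the `impl Default` added in the diff below, with everything else
stripped away) of what an unspecified `eviction_order` now resolves to:

```rust
#[derive(Debug, PartialEq)]
enum EvictionOrder {
    RelativeAccessed { highest_layer_count_loses_first: bool },
}

impl Default for EvictionOrder {
    fn default() -> Self {
        // Matches the deployed configuration quoted in the Problem section.
        Self::RelativeAccessed { highest_layer_count_loses_first: true }
    }
}

fn main() {
    assert_eq!(
        EvictionOrder::default(),
        EvictionOrder::RelativeAccessed { highest_layer_count_loses_first: true }
    );
}
```
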
Commit 98af1e365b (parent ebda667ef8), authored by John Spray on
2024-07-22 13:15:55 +01:00, committed by GitHub.
3 changed files with 40 additions and 82 deletions.

```diff
@@ -1601,7 +1601,7 @@ threshold = "20m"
             period: Duration::from_secs(10),
             #[cfg(feature = "testing")]
             mock_statvfs: None,
-            eviction_order: crate::disk_usage_eviction_task::EvictionOrder::AbsoluteAccessed,
+            eviction_order: Default::default(),
         })
     );
```

```diff
@@ -83,17 +83,9 @@ pub struct DiskUsageEvictionTaskConfig {
 /// Selects the sort order for eviction candidates *after* per tenant `min_resident_size`
 /// partitioning.
-#[derive(Default, Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
 #[serde(tag = "type", content = "args")]
 pub enum EvictionOrder {
-    /// Order the layers to be evicted by how recently they have been accessed in absolute
-    /// time.
-    ///
-    /// This strategy is unfair when some tenants grow faster than others towards the slower
-    /// growing.
-    #[default]
-    AbsoluteAccessed,
     /// Order the layers to be evicted by how recently they have been accessed relatively within
     /// the set of resident layers of a tenant.
     RelativeAccessed {
@@ -108,6 +100,14 @@ pub enum EvictionOrder {
     },
 }
 
+impl Default for EvictionOrder {
+    fn default() -> Self {
+        Self::RelativeAccessed {
+            highest_layer_count_loses_first: true,
+        }
+    }
+}
+
 fn default_highest_layer_count_loses_first() -> bool {
     true
 }
@@ -117,11 +117,6 @@ impl EvictionOrder {
         use EvictionOrder::*;
         match self {
-            AbsoluteAccessed => {
-                candidates.sort_unstable_by_key(|(partition, candidate)| {
-                    (*partition, candidate.last_activity_ts)
-                });
-            }
             RelativeAccessed { .. } => candidates.sort_unstable_by_key(|(partition, candidate)| {
                 (*partition, candidate.relative_last_activity)
             }),
@@ -134,7 +129,6 @@ impl EvictionOrder {
         use EvictionOrder::*;
         match self {
-            AbsoluteAccessed => finite_f32::FiniteF32::ZERO,
             RelativeAccessed {
                 highest_layer_count_loses_first,
             } => {
```
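
The surviving `RelativeAccessed` arm sorts by `(partition, relative_last_activity)`
rather than by absolute `last_activity_ts`. Below is a runnable sketch of that
sort with a stripped-down, hypothetical `Candidate` (the real one stores a
`finite_f32::FiniteF32`, which is `Ord`; plain `f32` is not, hence `total_cmp`
here), assuming the convention that a lower value means "colder within its own
tenant":

```rust
#[derive(Debug)]
struct Candidate {
    // Access recency normalized within one tenant's resident layers, in [0.0, 1.0].
    relative_last_activity: f32,
}

fn main() {
    // (partition, candidate) pairs, mirroring the shape of the real sort key.
    let mut candidates = vec![
        (1u8, Candidate { relative_last_activity: 0.9 }),
        (0u8, Candidate { relative_last_activity: 0.5 }),
        (0u8, Candidate { relative_last_activity: 0.1 }),
    ];
    candidates.sort_unstable_by(|(pa, a), (pb, b)| {
        pa.cmp(pb)
            .then(a.relative_last_activity.total_cmp(&b.relative_last_activity))
    });
    // Eviction walks this list front to back: partition 0 before partition 1,
    // and within a partition the relatively coldest layers first.
    println!("{candidates:?}");
}
```

Because each tenant's layers are ranked within that tenant, a tenant that writes
constantly can no longer push other tenants' layers to the front of the eviction
queue purely on absolute timestamps, which is the unfairness the removed
`AbsoluteAccessed` doc comment warned about.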

```diff
@@ -67,14 +67,11 @@ def test_min_resident_size_override_handling(
 @enum.unique
 class EvictionOrder(str, enum.Enum):
-    ABSOLUTE_ORDER = "absolute"
     RELATIVE_ORDER_EQUAL = "relative_equal"
     RELATIVE_ORDER_SPARE = "relative_spare"
 
     def config(self) -> Dict[str, Any]:
-        if self == EvictionOrder.ABSOLUTE_ORDER:
-            return {"type": "AbsoluteAccessed"}
-        elif self == EvictionOrder.RELATIVE_ORDER_EQUAL:
+        if self == EvictionOrder.RELATIVE_ORDER_EQUAL:
             return {
                 "type": "RelativeAccessed",
                 "args": {"highest_layer_count_loses_first": False},
@@ -384,7 +381,7 @@ def test_broken_tenants_are_skipped(eviction_env: EvictionEnv):
 @pytest.mark.parametrize(
     "order",
-    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
+    [EvictionOrder.RELATIVE_ORDER_EQUAL],
 )
 def test_pageserver_evicts_until_pressure_is_relieved(
     eviction_env: EvictionEnv, order: EvictionOrder
@@ -418,7 +415,7 @@ def test_pageserver_evicts_until_pressure_is_relieved(
 @pytest.mark.parametrize(
     "order",
-    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
+    [EvictionOrder.RELATIVE_ORDER_EQUAL],
 )
 def test_pageserver_respects_overridden_resident_size(
     eviction_env: EvictionEnv, order: EvictionOrder
@@ -495,7 +492,7 @@ def test_pageserver_respects_overridden_resident_size(
 @pytest.mark.parametrize(
     "order",
-    [EvictionOrder.ABSOLUTE_ORDER, EvictionOrder.RELATIVE_ORDER_EQUAL],
+    [EvictionOrder.RELATIVE_ORDER_EQUAL],
 )
 def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: EvictionOrder):
     """
@@ -526,7 +523,6 @@ def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv, order: E
 @pytest.mark.parametrize(
     "order",
     [
-        EvictionOrder.ABSOLUTE_ORDER,
         EvictionOrder.RELATIVE_ORDER_EQUAL,
         EvictionOrder.RELATIVE_ORDER_SPARE,
     ],
@@ -572,63 +568,38 @@ def test_partial_evict_tenant(eviction_env: EvictionEnv, order: EvictionOrder):
         later_tenant_usage < du_by_timeline[tenant]
     ), "all tenants should have lost some layers"
 
-    if order == EvictionOrder.ABSOLUTE_ORDER:
-        warm_size = later_du_by_timeline[warm]
-        cold_size = later_du_by_timeline[cold]
-
-        # bounds for warmed_size
-        warm_lower = 0.5 * du_by_timeline[warm]
-
-        # We don't know exactly whether the cold tenant needs 2 or just 1 env.layer_size wiggle room.
-        # So, check for up to 3 here.
-        warm_upper = warm_lower + 3 * env.layer_size
-
-        cold_upper = 2 * env.layer_size
-        log.info(f"tenants: warm={warm[0]}, cold={cold[0]}")
-        log.info(
-            f"expecting for warm tenant: {human_bytes(warm_lower)} < {human_bytes(warm_size)} < {human_bytes(warm_upper)}"
-        )
-        log.info(f"expecting for cold tenant: {human_bytes(cold_size)} < {human_bytes(cold_upper)}")
-
-        assert warm_size > warm_lower, "warmed up tenant should be at about half size (lower)"
-        assert warm_size < warm_upper, "warmed up tenant should be at about half size (upper)"
-        assert (
-            cold_size < cold_upper
-        ), "the cold tenant should be evicted to its min_resident_size, i.e., max layer file size"
-    else:
-        # with relative order what matters is the amount of layers, with a
-        # fudge factor of whether the eviction bothers tenants with highest
-        # layer count the most. last accessed times between tenants does not
-        # matter.
-        layers_now = env.count_layers_per_tenant(env.pageserver)
-
-        expected_ratio = later_total_on_disk / total_on_disk
-        log.info(
-            f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio"
-        )
-
-        for tenant_id, original_count in tenant_layers.items():
-            count_now = layers_now[tenant_id]
-            ratio = count_now / original_count
-            abs_diff = abs(ratio - expected_ratio)
-            assert original_count > count_now
-            expectation = 0.06
-            log.info(
-                f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}"
-            )
-            # in this test case both relative_spare and relative_equal produce
-            # the same outcomes; this must be a quantization effect of similar
-            # sizes (-s4 and -s6) and small (5MB) layer size.
-            # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
-            assert abs_diff < expectation
+    # with relative order what matters is the amount of layers, with a
+    # fudge factor of whether the eviction bothers tenants with highest
+    # layer count the most. last accessed times between tenants does not
+    # matter.
+    assert order in [EvictionOrder.RELATIVE_ORDER_EQUAL, EvictionOrder.RELATIVE_ORDER_SPARE]
+    layers_now = env.count_layers_per_tenant(env.pageserver)
+
+    expected_ratio = later_total_on_disk / total_on_disk
+    log.info(
+        f"freed up {100 * expected_ratio}%, expecting the layer counts to decrease in similar ratio"
+    )
+
+    for tenant_id, original_count in tenant_layers.items():
+        count_now = layers_now[tenant_id]
+        ratio = count_now / original_count
+        abs_diff = abs(ratio - expected_ratio)
+        assert original_count > count_now
+        expectation = 0.06
+        log.info(
+            f"tenant {tenant_id} layer count {original_count} -> {count_now}, ratio: {ratio}, expecting {abs_diff} < {expectation}"
+        )
+        # in this test case both relative_spare and relative_equal produce
+        # the same outcomes; this must be a quantization effect of similar
+        # sizes (-s4 and -s6) and small (5MB) layer size.
+        # for pg15 and pg16 the absdiff is < 0.01, for pg14 it is closer to 0.02
+        assert abs_diff < expectation
 
 
 @pytest.mark.parametrize(
     "order",
     [
-        EvictionOrder.ABSOLUTE_ORDER,
         EvictionOrder.RELATIVE_ORDER_EQUAL,
         EvictionOrder.RELATIVE_ORDER_SPARE,
     ],
@@ -680,14 +651,7 @@ def test_fast_growing_tenant(neon_env_builder: NeonEnvBuilder, pg_bin: PgBin, or
     ), "rest of the assertions expect 3 + 1 timelines, ratios, scales, all in order"
     log.info(f"{ratios}")
 
-    if order == EvictionOrder.ABSOLUTE_ORDER:
-        # first tenant loses most
-        assert ratios[0] <= ratios[1], "first should lose the most"
-        assert ratios[1] < ratios[2], "second should lose some"
-        assert ratios[1] < 1.0
-        assert ratios[2] <= ratios[3], "third might not lose"
-        assert ratios[3] == 1.0, "tenant created last does not lose"
-    elif order == EvictionOrder.RELATIVE_ORDER_EQUAL:
+    if order == EvictionOrder.RELATIVE_ORDER_EQUAL:
         assert all([x for x in ratios if x < 1.0]), "all tenants lose layers"
     elif order == EvictionOrder.RELATIVE_ORDER_SPARE:
         # with different layer sizes and pg versions, there are different combinations
@@ -750,7 +714,7 @@ def test_statvfs_error_handling(eviction_env: EvictionEnv):
             "type": "Failure",
             "mocked_error": "EIO",
         },
-        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
+        eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
     )
 
     env.neon_env.pageserver.assert_log_contains(".*statvfs failed.*EIO")
@@ -784,7 +748,7 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
             # This avoids accounting for metadata files & tenant conf in the tests.
             "name_filter": ".*__.*",
         },
-        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
+        eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
     )
 
     wait_until(
@@ -837,7 +801,7 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
             # This avoids accounting for metadata files & tenant conf in the tests.
             "name_filter": ".*__.*",
         },
-        eviction_order=EvictionOrder.ABSOLUTE_ORDER,
+        eviction_order=EvictionOrder.RELATIVE_ORDER_SPARE,
     )
 
     wait_until(
```