disk-usage-based layer eviction

This patch adds a pageserver-global background loop that evicts layers
in response to a shortage of available bytes in the $repo/tenants
directory's filesystem.

The loop runs periodically at a configurable `period`.

Each loop iteration uses `statvfs` to determine filesystem-level space
usage.  It compares the returned usage data against two different types
of thresholds. The iteration tries to evict layers until app-internal
accounting says we should be below the thresholds.  We cross-check this
internal accounting with the real world by making another `statvfs` at
the end of the iteration.  We're good if that second statvfs shows that
we're _actually_ below the configured thresholds.  If we're still above
one or more thresholds, we emit a warning log message, leaving it to the
operator to investigate further.

There are two thresholds: `max_usage_pct` is the maximum relative used
space, expressed in percent of the total filesystem space. If the actual
usage is higher, the threshold is exceeded.  `min_avail_bytes` is the
minimum absolute available space in bytes. If the actual available space
is lower, the threshold is exceeded.

The iteration evicts layers in LRU fashion with a reservation of up to
`min_resident_size` bytes of the most recent layers per tenant.
The layers not part of the per-tenant reservation are evicted
least-recently-used first until we're below all thresholds.
If the above doesn't relieve enough pressure, we fall back to Global LRU.

In addition to the loop, there is also an HTTP endpoint to perform
one loop iteration synchronous to the request.
The endpoint takes an absolute number of bytes that the iteration
needs to evict before pressure is relieved.
The tests use this endpoint, which is a great simplification over
setting up loopback-mounts in the tests, which would be required to
test the statvfs part of the implementation.
We will rely on manual testing in staging to test the statvfs parts.

The HTTP endpoint is also handy in emergencies where an operator wants
the pageserver to evict a given amount of space _now_.
Hence, its arguments are documented in openapi_spec.yml.
The response type isn't documented though because we don't consider
it stable. The endpoint should _not_ be used by Console.

Co-authored-by: Joonas Koivunen <joonas@neon.tech>

fixes https://github.com/neondatabase/neon/issues/3728
This commit is contained in:
Christian Schwarz
2023-03-20 16:46:23 +01:00
parent 881356c417
commit 5aef192bf2
20 changed files with 1373 additions and 14 deletions

View File

@@ -1214,6 +1214,14 @@ class PageserverHttpClient(requests.Session):
self.verbose_error(res)
return TenantConfig.from_json(res.json())
def set_tenant_config(self, tenant_id: TenantId, config: dict[str, Any]):
    """
    PUT `config` for `tenant_id` to the /v1/tenant/config endpoint.

    `config` must not contain a "tenant_id" key; that key is added to the
    JSON payload here from the `tenant_id` argument. The response is handed
    to `verbose_error`.
    """
    assert "tenant_id" not in config
    # Copy so the caller's dict is left untouched when the id is added.
    payload = dict(config)
    payload["tenant_id"] = str(tenant_id)
    url = f"http://localhost:{self.port}/v1/tenant/config"
    response = self.put(url, json=payload)
    self.verbose_error(response)
def tenant_size(self, tenant_id: TenantId) -> int:
    """
    Return the first element of `tenant_size_and_modelinputs(tenant_id)`.
    """
    size_and_inputs = self.tenant_size_and_modelinputs(tenant_id)
    return size_and_inputs[0]
@@ -1530,6 +1538,14 @@ class PageserverHttpClient(requests.Session):
for layer in info.historic_layers:
self.evict_layer(tenant_id, timeline_id, layer.layer_file_name)
def disk_usage_eviction_run(self, request: dict[str, Any]):
    """
    PUT `request` as JSON to /v1/disk_usage_eviction/run.

    The response is handed to `verbose_error`; its decoded JSON body is
    returned to the caller.
    """
    url = f"http://localhost:{self.port}/v1/disk_usage_eviction/run"
    response = self.put(url, json=request)
    self.verbose_error(response)
    return response.json()
@dataclass
class TenantConfig:

View File

@@ -0,0 +1,330 @@
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Iterator, Tuple
import pytest
from fixtures.log_helper import log
from fixtures.neon_fixtures import (
LayerMapInfo,
NeonEnv,
NeonEnvBuilder,
PageserverHttpClient,
PgBin,
RemoteStorageKind,
wait_for_last_flush_lsn,
)
from fixtures.types import TenantId, TimelineId
@pytest.mark.parametrize("config_level_override", [None, 400])
def test_min_resident_size_override_handling(
    neon_env_builder: NeonEnvBuilder, config_level_override: int
):
    """
    Check how `min_resident_size_override` is reported in the tenant config.

    The test runs once without and once with a pageserver-level default
    (`config_level_override`), and checks both the tenant-specific override
    and the effective value after each reconfiguration.
    """
    env = neon_env_builder.init_start()
    ps_http = env.pageserver.http_client()
    setting = "min_resident_size_override"

    def assert_config(tenant_id, expect_override, expect_effective):
        # Compare the tenant-specific override and the effective value.
        reported = ps_http.tenant_config(tenant_id)
        assert reported.tenant_specific_overrides.get(setting) == expect_override
        assert reported.effective_config.get(setting) == expect_effective

    def assert_overrides(tenant_id, default_tenant_conf_value):
        # An explicit override (including 0) must be reported verbatim ...
        for value in (200, 0):
            ps_http.set_tenant_config(tenant_id, {setting: value})
            assert_config(tenant_id, value, value)
        # ... and an empty config must clear the tenant-specific override.
        ps_http.set_tenant_config(tenant_id, {})
        assert_config(tenant_id, None, default_tenant_conf_value)

    # Restart the pageserver, optionally with a config-level default.
    env.pageserver.stop()
    if config_level_override is None:
        env.pageserver.start()
    else:
        override_arg = (
            "--pageserver-config-override=tenant_config="
            f"{{ min_resident_size_override = {config_level_override} }}"
        )
        env.pageserver.start(overrides=(override_arg,))

    tenant_id, _ = env.neon_cli.create_tenant()
    assert_overrides(tenant_id, config_level_override)

    # Also ensure that specifying the parameter to create_tenant works,
    # in addition to http-level reconfig.
    tenant_id, _ = env.neon_cli.create_tenant(conf={setting: "100"})
    assert_config(tenant_id, 100, 100)
    ps_http.set_tenant_config(tenant_id, {})
    assert_config(tenant_id, None, config_level_override)
@dataclass
class EvictionEnv:
    """
    Bundle of handles shared by the disk-usage-eviction tests.
    """

    # One (tenant, timeline, layer map info) entry per tracked timeline.
    timelines: list[Tuple[TenantId, TimelineId, LayerMapInfo]]
    neon_env: NeonEnv
    pg_bin: PgBin
    pageserver_http: PageserverHttpClient
    # Layer size in bytes that the tracked tenants were configured with.
    layer_size: int

    def timelines_du(self) -> Tuple[int, int, int]:
        """
        Return `poor_mans_du` computed over all tracked timelines at once.
        """
        pairs = [(tenant, timeline) for tenant, timeline, _layers in self.timelines]
        return poor_mans_du(self.neon_env, pairs)

    def du_by_timeline(self) -> dict[Tuple[TenantId, TimelineId], int]:
        """
        Return the total on-disk layer bytes keyed by (tenant, timeline).
        """
        usage = {}
        for tenant, timeline, _layers in self.timelines:
            total, _largest, _smallest = poor_mans_du(self.neon_env, [(tenant, timeline)])
            usage[(tenant, timeline)] = total
        return usage
@pytest.fixture
def eviction_env(request, neon_env_builder: NeonEnvBuilder, pg_bin: PgBin) -> Iterator[EvictionEnv]:
    """
    Yield an EvictionEnv with two pgbench-initialized tenants of different size.

    The initial default tenant is put into the broken state before the test;
    after the test the fixture asserts that its on-disk layer usage is unchanged.
    """
    test_name = request.node.name
    log.info(f"setting up eviction_env for test {test_name}")

    neon_env_builder.enable_remote_storage(RemoteStorageKind.LOCAL_FS, f"{test_name}")

    env = neon_env_builder.init_start()
    pageserver_http = env.pageserver.http_client()

    expected_log_lines = (
        # allow because we are invoking this manually; we always warn on executing disk based eviction
        r".* running disk usage based eviction due to pressure.*",
        r".* Changing Active tenant to Broken state, reason: broken from test",
    )
    for pattern in expected_log_lines:
        env.pageserver.allowed_errors.append(pattern)

    # Break the hard-to-use initial default tenant; the teardown part below
    # asserts that eviction never touched it.
    broken_tenant_id = env.initial_tenant
    broken_timeline_id = env.initial_timeline
    assert broken_timeline_id is not None
    break_url = f"http://localhost:{pageserver_http.port}/v1/tenant/{broken_tenant_id}/break"
    res = pageserver_http.put(break_url)
    pageserver_http.verbose_error(res)

    broken_on_disk_before = poor_mans_du(env, timelines=[(broken_tenant_id, broken_timeline_id)])[0]

    # A small layer_size lets low pgbench scales still produce many layers.
    # Many small layers make evictions predictable: each eviction makes only
    # little progress towards the target, so there is little overshoot and
    # the asserts in the tests stay stable.
    layer_size = 5 * 1024**2
    timelines = []
    for scale in (4, 6):
        tenant_conf = {
            "gc_period": "0s",
            "compaction_period": "0s",
            "checkpoint_distance": str(layer_size),
            "image_creation_threshold": "100",
            "compaction_target_size": str(layer_size),
        }
        tenant_id, timeline_id = env.neon_cli.create_tenant(conf=tenant_conf)

        with env.postgres.create_start("main", tenant_id=tenant_id) as pg:
            pg_bin.run(["pgbench", "-i", f"-s{scale}", pg.connstr()])
            wait_for_last_flush_lsn(env, pg, tenant_id, timeline_id)

        pageserver_http.timeline_checkpoint(tenant_id, timeline_id)
        layers = pageserver_http.layer_map_info(tenant_id, timeline_id)
        log.info(f"{layers}")
        assert len(layers.historic_layers) >= 4
        timelines.append((tenant_id, timeline_id, layers))

    eviction_env = EvictionEnv(
        timelines=timelines,
        neon_env=env,
        pg_bin=pg_bin,
        pageserver_http=pageserver_http,
        layer_size=layer_size,
    )

    yield eviction_env

    # Teardown: the broken tenant's layers must be exactly as large as before.
    broken_on_disk_after = poor_mans_du(
        eviction_env.neon_env, [(broken_tenant_id, broken_timeline_id)]
    )[0]
    assert (
        broken_on_disk_before == broken_on_disk_after
    ), "only touch active tenants with disk_usage_eviction"
def test_pageserver_evicts_until_pressure_is_relieved(eviction_env: EvictionEnv):
    """
    Ask for half of the on-disk bytes to be evicted and check that at least
    that much is gone afterwards and that the response reports it.
    """
    ps_http = eviction_env.pageserver_http

    du_before = eviction_env.timelines_du()[0]
    wanted = du_before // 2

    outcome = ps_http.disk_usage_eviction_run({"wanted_trimmed_bytes": wanted})
    log.info(f"{outcome}")

    du_after = eviction_env.timelines_du()[0]
    freed = du_before - du_after

    assert freed >= 0, "nothing can load layers during this test"
    assert freed >= wanted, "must evict more than half"

    assumed = outcome["Finished"]["assumed"]
    assert assumed["projected_after"]["freed_bytes"] >= freed, "report accurately evicted bytes"
    assert assumed["failed"]["count"] == 0, "zero failures expected"
def test_pageserver_respects_overridden_resident_size(eviction_env: EvictionEnv):
    """
    Set `min_resident_size_override` on both tenants to the smaller tenant's
    size and check that only the larger tenant loses layers.
    """
    ps_http = eviction_env.pageserver_http

    du_before = eviction_env.timelines_du()[0]
    usage_before = eviction_env.du_by_timeline()
    log.info("du_by_timeline: %s", usage_before)

    assert len(usage_before) == 2, "this test assumes two tenants"
    large_tenant = max(usage_before, key=usage_before.get)
    small_tenant = min(usage_before, key=usage_before.get)
    large_size = usage_before[large_tenant]
    small_size = usage_before[small_tenant]
    assert large_size > small_size
    assert (
        large_size - small_size > 5 * eviction_env.layer_size
    ), "ensure this test will do more than 1 eviction"

    # Give the larger tenant a haircut while preventing the smaller tenant
    # from getting one: reserve exactly the smaller tenant's size per tenant.
    min_resident_size = small_size
    wanted = large_size - small_size
    assert any(
        size > min_resident_size for size in usage_before.values()
    ), "ensure the larger tenant will get a haircut"

    for tenant_key in (small_tenant, large_tenant):
        # tenant_key is (tenant_id, timeline_id); the config is per tenant.
        ps_http.set_tenant_config(tenant_key[0], {"min_resident_size_override": min_resident_size})

    # do one run
    outcome = ps_http.disk_usage_eviction_run({"wanted_trimmed_bytes": wanted})
    log.info(f"{outcome}")

    time.sleep(1)  # give log time to flush
    fell_back = eviction_env.neon_env.pageserver.log_contains("falling back to global LRU")
    assert not fell_back, "this test is pointless if it fell back to global LRU"

    du_after = eviction_env.timelines_du()[0]
    usage_after = eviction_env.du_by_timeline()
    log.info("later_du_by_timeline: %s", usage_after)

    freed = du_before - du_after
    assert freed >= 0, "nothing can load layers during this test"
    assert freed >= wanted, "eviction must always evict more than target"

    assumed = outcome["Finished"]["assumed"]
    assert assumed["projected_after"]["freed_bytes"] >= freed, "report accurately evicted bytes"
    assert assumed["failed"]["count"] == 0, "zero failures expected"

    assert usage_after[small_tenant] == small_size, "small tenant sees no haircut"
    assert usage_after[large_tenant] < large_size, "large tenant gets a haircut"
    assert large_size - usage_after[large_tenant] >= wanted
def test_pageserver_falls_back_to_global_lru(eviction_env: EvictionEnv):
    """
    Request eviction of everything on disk and check that the pageserver
    logs the fallback to global LRU, which it should use when eviction that
    respects the per-tenant min resident size would not evict enough.
    """
    ps_http = eviction_env.pageserver_http
    pageserver = eviction_env.neon_env.pageserver

    du_before = eviction_env.timelines_du()[0]
    wanted = du_before

    outcome = ps_http.disk_usage_eviction_run({"wanted_trimmed_bytes": wanted})
    log.info(f"{outcome}")

    du_after = eviction_env.timelines_du()[0]
    freed = du_before - du_after
    assert freed >= 0, "nothing can load layers during this test"
    assert freed >= wanted, "eviction must always evict more than target"

    time.sleep(1)  # give log time to flush
    assert pageserver.log_contains("falling back to global LRU")
    # The fallback message is expected in this test, so allow-list it.
    pageserver.allowed_errors.append(".*falling back to global LRU")
def test_partial_evict_tenant(eviction_env: EvictionEnv):
    """
    Warm up one tenant, then request eviction of everything except half of
    that tenant's usage; the warmed-up tenant should keep about half of its
    layers while the other tenant loses nearly all of them.
    """
    ps_http = eviction_env.pageserver_http

    du_before = eviction_env.timelines_du()[0]
    usage_before = eviction_env.du_by_timeline()

    # pick any tenant; unpacking also enforces that there are exactly two
    our_tenant, other_tenant = usage_before
    tenant_id, timeline_id = our_tenant
    our_usage = usage_before[our_tenant]

    # make our tenant more recently used than the other one
    with eviction_env.neon_env.postgres.create_start("main", tenant_id=tenant_id) as pg:
        eviction_env.pg_bin.run(["pgbench", "-S", pg.connstr()])

    wanted = du_before - (our_usage // 2)
    outcome = ps_http.disk_usage_eviction_run({"wanted_trimmed_bytes": wanted})
    log.info(f"{outcome}")

    du_after = eviction_env.timelines_du()[0]
    freed = du_before - du_after
    assert freed >= 0, "nothing can load layers during this test"
    assert freed >= wanted, "eviction must always evict more than target"

    usage_after = eviction_env.du_by_timeline()
    for key, remaining in usage_after.items():
        assert remaining < usage_before[key], "all tenants should have lost some layers"

    assert (
        usage_after[our_tenant] > 0.4 * our_usage
    ), "our warmed up tenant should be at about half capacity"
    assert (
        usage_after[other_tenant] < 2 * eviction_env.layer_size
    ), "the other tenant should be completely evicted"
def poor_mans_du(
    env: NeonEnv, timelines: list[Tuple[TenantId, TimelineId]]
) -> Tuple[int, int, int]:
    """
    Disk usage, largest, smallest layer for layer files over the given (tenant, timeline) tuples;
    this could be done over layers endpoint just as well.

    A file in the timeline directory counts as a layer file when its name
    contains "__". Returns `(total_bytes, largest_layer_bytes, smallest_layer_bytes)`;
    all three are 0 when no layer file was found. Fails an assertion if a
    timeline directory does not exist.
    """
    total_on_disk = 0
    largest_layer = 0
    smallest_layer = None
    for tenant_id, timeline_id in timelines:
        timeline_dir = (
            Path(env.repo_dir) / "tenants" / str(tenant_id) / "timelines" / str(timeline_id)
        )
        assert timeline_dir.exists(), f"timeline dir does not exist: {timeline_dir}"
        timeline_total = 0
        for file in timeline_dir.iterdir():
            if "__" not in file.name:
                continue
            size = file.stat().st_size
            timeline_total += size
            largest_layer = max(largest_layer, size)
            # Compare against None, not truthiness: a 0-byte layer is a valid
            # minimum and must not be overwritten by the next, larger file.
            smallest_layer = size if smallest_layer is None else min(smallest_layer, size)
            log.info(f"{tenant_id}/{timeline_id} => {file.name} {size}")
        log.info(f"{tenant_id}/{timeline_id}: sum {timeline_total}")
        total_on_disk += timeline_total
    # Without any layer file, both the total and the maximum must still be 0.
    assert smallest_layer is not None or (total_on_disk == 0 and largest_layer == 0)
    return (total_on_disk, largest_layer, smallest_layer or 0)