test: assert we eventually free space (#6536)

In `test_statvfs_pressure_{usage,min_avail_bytes}` the disk usage based
eviction now races against the initial logical size calculation, which
downloads the layers on demand. First wait out the initial logical sizes,
then make the final asserts "eventual" (retried via `wait_until`). This is
not great, but it is faster than failing and retrying.

This issue seems to happen only in debug-mode tests.

Fixes: #6510
This commit is contained in:
Joonas Koivunen
2024-02-02 19:46:47 +02:00
committed by GitHub
parent 7e2436695d
commit caf868e274
3 changed files with 43 additions and 33 deletions

View File

@@ -831,3 +831,16 @@ class PageserverHttpClient(requests.Session):
self.put(
f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
).raise_for_status()
def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int:
    """
    Request the timeline detail with `force_await_initial_logical_size=True`
    and return the reported current logical size.

    The non-incremental logical size is requested as well; this method
    asserts that it equals the current logical size and that the current
    logical size is an `int` before returning it.
    """
    response = self.timeline_detail(
        tenant_id,
        timeline_id,
        include_non_incremental_logical_size=True,
        force_await_initial_logical_size=True,
    )
    # Read both values up front, in the same order as the checks below.
    incremental_size, recomputed_size = (
        response["current_logical_size"],
        response["current_logical_size_non_incremental"],
    )
    assert incremental_size == recomputed_size
    assert isinstance(incremental_size, int)
    return incremental_size

View File

@@ -155,6 +155,15 @@ class EvictionEnv:
mock_behavior,
eviction_order: EvictionOrder,
):
"""
Starts the pageserver with a mocked statvfs setup. The startup is
problematic because the initial logical size calculations, which require
layers, duel with the disk usage based eviction task, which evicts them.
Returns after the initial logical sizes are complete, but the phase of the
disk usage eviction task is unknown; it might need to run one more
iteration before assertions can be made.
"""
disk_usage_config = {
"period": period,
"max_usage_pct": max_usage_pct,
@@ -183,9 +192,15 @@ class EvictionEnv:
),
)
# we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
for tenant_id, timeline_id in self.timelines:
pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)
def statvfs_called():
assert pageserver.log_contains(".*running mocked statvfs.*")
# we most likely have already completed multiple runs
wait_until(10, 1, statvfs_called)
@@ -789,9 +804,11 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):
wait_until(10, 1, relieved_log_message)
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
def less_than_max_usage_pct():
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage"
assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage"
wait_until(2, 2, less_than_max_usage_pct)
def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
@@ -831,11 +848,13 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
wait_until(10, 1, relieved_log_message)
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
def more_than_min_avail_bytes_freed():
post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
assert (
total_size - post_eviction_total_size >= min_avail_bytes
), f"we requested at least {min_avail_bytes} worth of free space"
assert (
total_size - post_eviction_total_size >= min_avail_bytes
), "we requested at least min_avail_bytes worth of free space"
wait_until(2, 2, more_than_min_avail_bytes_freed)
def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):

View File

@@ -20,7 +20,7 @@ from fixtures.neon_fixtures import (
VanillaPostgres,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
from fixtures.pageserver.http import PageserverApiException
from fixtures.pageserver.utils import (
assert_tenant_state,
timeline_delete_wait_completed,
@@ -40,7 +40,7 @@ def test_timeline_size(neon_simple_env: NeonEnv):
new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty")
client = env.pageserver.http_client()
wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
endpoint_main = env.endpoints.create_start("test_timeline_size")
log.info("postgres is running on 'test_timeline_size' branch")
@@ -73,7 +73,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty")
client = env.pageserver.http_client()
wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
timeline_details = client.timeline_detail(
env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True
)
@@ -153,7 +153,7 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
client = env.pageserver.http_client()
new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")
wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
endpoint_main = env.endpoints.create(
"test_timeline_size_quota_on_startup",
@@ -219,7 +219,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
client = env.pageserver.http_client()
new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota")
wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
endpoint_main = env.endpoints.create(
"test_timeline_size_quota",
@@ -715,28 +715,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
# XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
# Timeline logical size initialization is an asynchronous background task that runs once,
# try a few times to ensure it's activated properly
def wait_for_timeline_size_init(
    client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
):
    """
    Poll the timeline detail until `current_logical_size` equals
    `current_logical_size_non_incremental`.

    Makes at most 10 attempts, sleeping one second after every mismatch.
    Returns `None` as soon as the two values agree; raises `Exception` with
    the last fetched details if they never do.
    """
    for i in range(10):
        timeline_details = client.timeline_detail(
            tenant, timeline, include_non_incremental_logical_size=True
        )
        current_logical_size = timeline_details["current_logical_size"]
        non_incremental = timeline_details["current_logical_size_non_incremental"]
        if current_logical_size != non_incremental:
            # Not there yet: report progress and back off before the next poll.
            log.info(
                f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}"
            )
            time.sleep(1)
            continue
        return
    else:
        # The loop never returns early via `break`, so this runs only once
        # all attempts are exhausted.
        raise Exception(
            f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
        )
def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
"""
Tenants warming up opportunistically will wait for one another's logical size calculations to complete