test: assert we eventually free space (#6536)

in `test_statvfs_pressure_{usage,min_avail_bytes}` we now race against initial logical size calculation on-demand downloading the layers. first wait out the initial logical sizes, then change the final asserts to be "eventual", which is not great but it is faster than failing and retrying. this issue seems to happen only in debug mode tests. Fixes: #6510
2026-01-08 14:02:55 +00:00 · 2024-02-02 19:46:47 +02:00
parent 7e2436695d
commit caf868e274
3 changed files with 43 additions and 33 deletions
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -831,3 +831,16 @@ class PageserverHttpClient(requests.Session):
        self.put(
            f"http://localhost:{self.port}/v1/deletion_queue/flush?execute={'true' if execute else 'false'}"
        ).raise_for_status()
+
+    def timeline_wait_logical_size(self, tenant_id: TenantId, timeline_id: TimelineId) -> int:
+        detail = self.timeline_detail(
+            tenant_id,
+            timeline_id,
+            include_non_incremental_logical_size=True,
+            force_await_initial_logical_size=True,
+        )
+        current_logical_size = detail["current_logical_size"]
+        non_incremental = detail["current_logical_size_non_incremental"]
+        assert current_logical_size == non_incremental
+        assert isinstance(current_logical_size, int)
+        return current_logical_size
--- a/test_runner/regress/test_disk_usage_eviction.py
+++ b/test_runner/regress/test_disk_usage_eviction.py
@@ -155,6 +155,15 @@ class EvictionEnv:
        mock_behavior,
        eviction_order: EvictionOrder,
    ):
+        """
+        Starts pageserver up with mocked statvfs setup. The startup is
+        problematic because of dueling initial logical size calculations
+        requiring layers and disk usage based task evicting.
+
+        Returns after initial logical sizes are complete, but the phase of disk
+        usage eviction task is unknown; it might need to run one more iteration
+        before assertions can be made.
+        """
        disk_usage_config = {
            "period": period,
            "max_usage_pct": max_usage_pct,
@@ -183,9 +192,15 @@ class EvictionEnv:
            ),
        )

+        # we now do initial logical size calculation on startup, which on debug builds can fight with disk usage based eviction
+        for tenant_id, timeline_id in self.timelines:
+            pageserver_http = self.neon_env.get_tenant_pageserver(tenant_id).http_client()
+            pageserver_http.timeline_wait_logical_size(tenant_id, timeline_id)
+
        def statvfs_called():
            assert pageserver.log_contains(".*running mocked statvfs.*")

+        # we most likely have already completed multiple runs
        wait_until(10, 1, statvfs_called)


@@ -789,9 +804,11 @@ def test_statvfs_pressure_usage(eviction_env: EvictionEnv):

    wait_until(10, 1, relieved_log_message)

-    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+    def less_than_max_usage_pct():
+        post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+        assert post_eviction_total_size < 0.33 * total_size, "we requested max 33% usage"

-    assert post_eviction_total_size <= 0.33 * total_size, "we requested max 33% usage"
+    wait_until(2, 2, less_than_max_usage_pct)


 def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):
@@ -831,11 +848,13 @@ def test_statvfs_pressure_min_avail_bytes(eviction_env: EvictionEnv):

    wait_until(10, 1, relieved_log_message)

-    post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+    def more_than_min_avail_bytes_freed():
+        post_eviction_total_size, _, _ = env.timelines_du(env.pageserver)
+        assert (
+            total_size - post_eviction_total_size >= min_avail_bytes
+        ), f"we requested at least {min_avail_bytes} worth of free space"

-    assert (
-        total_size - post_eviction_total_size >= min_avail_bytes
-    ), "we requested at least min_avail_bytes worth of free space"
+    wait_until(2, 2, more_than_min_avail_bytes_freed)


 def test_secondary_mode_eviction(eviction_env_ha: EvictionEnv):
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,7 +20,7 @@ from fixtures.neon_fixtures import (
    VanillaPostgres,
    wait_for_last_flush_lsn,
 )
-from fixtures.pageserver.http import PageserverApiException, PageserverHttpClient
+from fixtures.pageserver.http import PageserverApiException
 from fixtures.pageserver.utils import (
    assert_tenant_state,
    timeline_delete_wait_completed,
@@ -40,7 +40,7 @@ def test_timeline_size(neon_simple_env: NeonEnv):
    new_timeline_id = env.neon_cli.create_branch("test_timeline_size", "empty")

    client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)

    endpoint_main = env.endpoints.create_start("test_timeline_size")
    log.info("postgres is running on 'test_timeline_size' branch")
@@ -73,7 +73,7 @@ def test_timeline_size_createdropdb(neon_simple_env: NeonEnv):
    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_createdropdb", "empty")

    client = env.pageserver.http_client()
-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)
    timeline_details = client.timeline_detail(
        env.initial_tenant, new_timeline_id, include_non_incremental_logical_size=True
    )
@@ -153,7 +153,7 @@ def test_timeline_size_quota_on_startup(neon_env_builder: NeonEnvBuilder):
    client = env.pageserver.http_client()
    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota_on_startup")

-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)

    endpoint_main = env.endpoints.create(
        "test_timeline_size_quota_on_startup",
@@ -219,7 +219,7 @@ def test_timeline_size_quota(neon_env_builder: NeonEnvBuilder):
    client = env.pageserver.http_client()
    new_timeline_id = env.neon_cli.create_branch("test_timeline_size_quota")

-    wait_for_timeline_size_init(client, tenant=env.initial_tenant, timeline=new_timeline_id)
+    client.timeline_wait_logical_size(env.initial_tenant, new_timeline_id)

    endpoint_main = env.endpoints.create(
        "test_timeline_size_quota",
@@ -715,28 +715,6 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
    # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS


-# Timeline logical size initialization is an asynchronous background task that runs once,
-# try a few times to ensure it's activated properly
-def wait_for_timeline_size_init(
-    client: PageserverHttpClient, tenant: TenantId, timeline: TimelineId
-):
-    for i in range(10):
-        timeline_details = client.timeline_detail(
-            tenant, timeline, include_non_incremental_logical_size=True
-        )
-        current_logical_size = timeline_details["current_logical_size"]
-        non_incremental = timeline_details["current_logical_size_non_incremental"]
-        if current_logical_size == non_incremental:
-            return
-        log.info(
-            f"waiting for current_logical_size of a timeline to be calculated, iteration {i}: {current_logical_size} vs {non_incremental}"
-        )
-        time.sleep(1)
-    raise Exception(
-        f"timed out while waiting for current_logical_size of a timeline to reach its non-incremental value, details: {timeline_details}"
-    )
-
-
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    """
    Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete