test: fix on-demand activation test flakiness (#7180)

Warm-up (and the "tenant startup complete" metric update) happens in a background tokio task. The tenant map, however, is updated eagerly, which can happen before that task finishes. The test assumed that once the tenant map was updated, the metric would already reflect it. That is not the case, so we tweak the test to wait for the metric instead of asserting on it immediately. Fixes https://github.com/neondatabase/neon/issues/7158
2026-06-04 22:10:39 +00:00 · 2024-03-20 10:24:59 +00:00
parent a5d5c2a6a0
commit 4ba3f3518e
1 changed files with 11 additions and 9 deletions
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -20,6 +20,7 @@ from fixtures.neon_fixtures import (
    VanillaPostgres,
    wait_for_last_flush_lsn,
 )
+from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.utils import (
    assert_tenant_state,
    timeline_delete_wait_completed,
@@ -684,6 +685,13 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
    # XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS


+def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int):
+    def condition():
+        assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
+
+    wait_until(5, 1.0, condition)
+
+
 def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    """
    Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
@@ -767,10 +775,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    # That one that we successfully accessed is now Active
    expect_activated += 1
    assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)

    # The ones we didn't touch are still in Attaching
    assert (
@@ -790,10 +795,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
        == n_tenants - expect_activated
    )

-    assert (
-        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
-        == expect_activated - 1
-    )
+    wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)

    # When we unblock logical size calculation, all tenants should proceed to active state via
    # the warmup route.
@@ -813,7 +815,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    assert (
        pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
    )
-    assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
+    wait_for_tenant_startup_completions(pageserver_http, count=n_tenants)

    # Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
    # body of the test because it will disrupt tenant counts