test: fix on demand activation test flakyness (#7180)

Warm-up (and the "tenant startup complete" metric update) happens in
a background tokio task. The tenant map is eagerly updated (can happen
before the task finishes).

The test assumed that if the tenant map was updated, then the metric
should reflect that. That's not the case, so we tweak the test to wait
for the metric.

Fixes https://github.com/neondatabase/neon/issues/7158
This commit is contained in:
Vlad Lazar
2024-03-20 10:24:59 +00:00
committed by GitHub
parent a5d5c2a6a0
commit 4ba3f3518e

View File

@@ -20,6 +20,7 @@ from fixtures.neon_fixtures import (
VanillaPostgres,
wait_for_last_flush_lsn,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.utils import (
assert_tenant_state,
timeline_delete_wait_completed,
@@ -684,6 +685,13 @@ def assert_physical_size_invariants(sizes: TimelinePhysicalSizeValues):
# XXX would be nice to assert layer file physical storage utilization here as well, but we can only do that for LocalFS
def wait_for_tenant_startup_completions(client: PageserverHttpClient, count: int):
def condition():
assert client.get_metric_value("pageserver_tenant_startup_complete_total") == count
wait_until(5, 1.0, condition)
def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
"""
Tenants warmuping up opportunistically will wait for one another's logical size calculations to complete
@@ -767,10 +775,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
# That one that we successfully accessed is now Active
expect_activated += 1
assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
== expect_activated - 1
)
wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
# The ones we didn't touch are still in Attaching
assert (
@@ -790,10 +795,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
== n_tenants - expect_activated
)
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
== expect_activated - 1
)
wait_for_tenant_startup_completions(pageserver_http, count=expect_activated - 1)
# When we unblock logical size calculation, all tenants should proceed to active state via
# the warmup route.
@@ -813,7 +815,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
assert (
pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
)
assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants
wait_for_tenant_startup_completions(pageserver_http, count=n_tenants)
# Check that tenant deletion/detach proactively wakes tenants: this is done separately to the main
# body of the test because it will disrupt tenant counts