counters metrics for tenants completing warmup

2026-05-20 22:50:38 +00:00 · 2023-12-15 15:10:16 +00:00
parent e9a9e6be30
commit c6867e9f32
4 changed files with 56 additions and 6 deletions
--- a/test_runner/regress/test_timeline_size.py
+++ b/test_runner/regress/test_timeline_size.py
@@ -757,7 +757,7 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):

        # Empty tenants are not subject to waiting for logical size calculations, because
        # those hapen on timeline level
-        branch_name = f"{tenant_id}-main"
+        branch_name = "main"
        timeline_id = TimelineId.generate()
        env.neon_cli.create_timeline(
            new_branch_name=branch_name, tenant_id=tenant_id, timeline_id=timeline_id
@@ -795,15 +795,22 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    assert len([s for s in states.values() if s == "Active"]) == expect_activated
    assert len([s for s in states.values() if s == "Attaching"]) == n_tenants - expect_activated

+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
+    )
+
+    # This is zero, and subsequent checks are expect_activated - 1, because this counter does not
+    # count how may tenants are Active, it counts how many have finished warmup.  The first tenant
+    # that reached Active is still stuck in its local size calculation, and has therefore not finished warmup.
+    assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == 0
+
    # If a client accesses one of the blocked tenants, it should skip waiting for warmup and
    # go active as fast as it can.
    stuck_tenant_id = list(
        [(tid, s) for (tid, s) in get_tenant_states().items() if s == "Attaching"]
    )[0][0]

-    endpoint = env.endpoints.create_start(
-        branch_name=f"{stuck_tenant_id}-main", tenant_id=stuck_tenant_id
-    )
+    endpoint = env.endpoints.create_start(branch_name="main", tenant_id=stuck_tenant_id)
    endpoint.safe_psql_many(
        [
            "CREATE TABLE foo (x INTEGER)",
@@ -815,6 +822,10 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    # That one that we successfully accessed is now Active
    expect_activated += 1
    assert pageserver_http.tenant_status(tenant_id=stuck_tenant_id)["state"]["slug"] == "Active"
+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
+        == expect_activated - 1
+    )

    # The ones we didn't touch are still in Attaching
    assert (
@@ -834,6 +845,11 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
        == n_tenants - expect_activated
    )

+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total")
+        == expect_activated - 1
+    )
+
    # When we unblock logical size calculation, all tenants should proceed to active state via
    # the warmup route.
    pageserver_http.configure_failpoints(("timeline-calculate-logical-size-pause", "off"))
@@ -848,3 +864,8 @@ def test_ondemand_activation(neon_env_builder: NeonEnvBuilder):
    env.pageserver.stop()
    env.pageserver.start()
    wait_until(10, 1, all_active)
+
+    assert (
+        pageserver_http.get_metric_value("pageserver_tenant_startup_scheduled_total") == n_tenants
+    )
+    assert pageserver_http.get_metric_value("pageserver_tenant_startup_complete_total") == n_tenants