chore(compute): Simplify new compute_ctl metrics and fix flaky test (#10560)

## Problem 1. d04d924 added separate metrics for total requests and for failures, but that split doesn't make much sense. We could just have a unified counter with an `http_status` label. 2. `test_compute_migrations_retry` had a race, i.e., it was waiting for the last successful migration, not for the actual failure. This was revealed after adding an assert on the failure metric in d04d924. ## Summary of changes 1. Switch to unified counters for `compute_ctl` requests. 2. Add a waiting loop to `test_compute_migrations_retry` to eliminate the race. Part of neondatabase/cloud#17590
2026-01-09 14:32:57 +00:00 · 2025-01-29 19:09:25 +01:00
parent fdfbc7b358
commit 34322b2424
4 changed files with 61 additions and 74 deletions
--- a/test_runner/regress/test_compute_migrations.py
+++ b/test_runner/regress/test_compute_migrations.py
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, cast
 import pytest
 from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS
 from fixtures.metrics import parse_metrics
+from fixtures.utils import wait_until

 if TYPE_CHECKING:
    from fixtures.neon_fixtures import NeonEnv
@@ -24,7 +25,26 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d
    for i in range(1, NUM_COMPUTE_MIGRATIONS + 1):
        endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"})

-        # Make sure that the migrations ran
+        # Check that migration failure is properly recorded in the metrics
+        #
+        # N.B. wait_for_migrations() only waits till the last successful
+        # migration is applied. It doesn't wait till the migration failure due
+        # to the failpoint. This opens a race for checking the metrics. To avoid
+        # this, we first wait until the migration failure metric is seen.
+        def check_migration_failure_metrics():
+            client = endpoint.http_client()
+            raw_metrics = client.metrics()
+            metrics = parse_metrics(raw_metrics)
+            failed_migration = metrics.query_all(
+                "compute_ctl_db_migration_failed_total",
+            )
+            assert len(failed_migration) == 1
+            for sample in failed_migration:
+                assert sample.value == 1
+
+        wait_until(check_migration_failure_metrics)
+
+        # Make sure that all migrations before the failed one are applied
        endpoint.wait_for_migrations(wait_for=i - 1)

        # Confirm that we correctly recorded that in the
@@ -34,17 +54,6 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d
            migration_id = cast("int", cur.fetchall()[0][0])
            assert migration_id == i - 1

-        # Check that migration failure is properly recorded in the metrics
-        client = endpoint.http_client()
-        raw_metrics = client.metrics()
-        metrics = parse_metrics(raw_metrics)
-        failed_migration = metrics.query_all(
-            "compute_ctl_db_migration_failed_total",
-        )
-        assert len(failed_migration) == 1
-        for sample in failed_migration:
-            assert sample.value == 1
-
        endpoint.stop()

    endpoint.start()