chore(compute): Simplify new compute_ctl metrics and fix flaky test (#10560)

## Problem

1. d04d924 added separate metrics for total requests and for failures,
but that split doesn't make much sense. We could just have a unified
counter with `http_status`.
2. `test_compute_migrations_retry` had a race: it was waiting for
the last successful migration, not for the actual failure. This was
revealed after adding an assert on the failure metric in d04d924.

## Summary of changes

1. Switch to unified counters for `compute_ctl` requests.
2. Add a waiting loop to `test_compute_migrations_retry` to eliminate
the race.

Part of neondatabase/cloud#17590
This commit is contained in:
Alexey Kondratov
2025-01-29 19:09:25 +01:00
committed by GitHub
parent fdfbc7b358
commit 34322b2424
4 changed files with 61 additions and 74 deletions

View File

@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, cast
import pytest
from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS
from fixtures.metrics import parse_metrics
from fixtures.utils import wait_until
if TYPE_CHECKING:
from fixtures.neon_fixtures import NeonEnv
@@ -24,7 +25,26 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d
for i in range(1, NUM_COMPUTE_MIGRATIONS + 1):
endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"})
# Make sure that the migrations ran
# Check that migration failure is properly recorded in the metrics
#
# N.B. wait_for_migrations() only waits till the last successful
# migration is applied. It doesn't wait till the migration failure due
# to the failpoint. This opens a race for checking the metrics. To avoid
# this, we first wait until the migration failure metric is seen.
def check_migration_failure_metrics():
    """Assert that the endpoint reports exactly one failed migration.

    Fetches the metrics exposed by the endpoint's HTTP client, parses
    them, and checks that `compute_ctl_db_migration_failed_total` has a
    single sample whose value is 1. Raises AssertionError otherwise, so
    the caller can retry until the failure has been recorded.
    """
    parsed = parse_metrics(endpoint.http_client().metrics())
    failure_samples = parsed.query_all(
        "compute_ctl_db_migration_failed_total",
    )
    # Exactly one series is expected for the failure counter.
    assert len(failure_samples) == 1
    for failure_sample in failure_samples:
        assert failure_sample.value == 1
wait_until(check_migration_failure_metrics)
# Make sure that all migrations before the failed one are applied
endpoint.wait_for_migrations(wait_for=i - 1)
# Confirm that we correctly recorded that in the
@@ -34,17 +54,6 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d
migration_id = cast("int", cur.fetchall()[0][0])
assert migration_id == i - 1
# Check that migration failure is properly recorded in the metrics
client = endpoint.http_client()
raw_metrics = client.metrics()
metrics = parse_metrics(raw_metrics)
failed_migration = metrics.query_all(
"compute_ctl_db_migration_failed_total",
)
assert len(failed_migration) == 1
for sample in failed_migration:
assert sample.value == 1
endpoint.stop()
endpoint.start()