mirror of
https://github.com/neondatabase/neon.git
synced 2026-01-09 14:32:57 +00:00
chore(compute): Simplify new compute_ctl metrics and fix flaky test (#10560)
## Problem

1. d04d924 added separate metrics for total requests and for failures, but that doesn't make much sense. We could just have a unified counter with an `http_status` label.
2. `test_compute_migrations_retry` had a race, i.e., it was waiting for the last successful migration, not for the actual failure. This was revealed after adding an assert on the failure metric in d04d924.

## Summary of changes

1. Switch to unified counters for `compute_ctl` requests.
2. Add a waiting loop to `test_compute_migrations_retry` to eliminate the race.

Part of neondatabase/cloud#17590
This commit is contained in:
@@ -6,6 +6,7 @@ from typing import TYPE_CHECKING, cast
|
||||
import pytest
|
||||
from fixtures.compute_migrations import COMPUTE_MIGRATIONS, NUM_COMPUTE_MIGRATIONS
|
||||
from fixtures.metrics import parse_metrics
|
||||
from fixtures.utils import wait_until
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from fixtures.neon_fixtures import NeonEnv
|
||||
@@ -24,7 +25,26 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d
|
||||
for i in range(1, NUM_COMPUTE_MIGRATIONS + 1):
|
||||
endpoint.start(env={"FAILPOINTS": f"compute-migration=return({i})"})
|
||||
|
||||
# Make sure that the migrations ran
|
||||
# Check that migration failure is properly recorded in the metrics
|
||||
#
|
||||
# N.B. wait_for_migrations() only waits till the last successful
|
||||
# migration is applied. It doesn't wait till the migration failure due
|
||||
# to the failpoint. This opens a race for checking the metrics. To avoid
|
||||
# this, we first wait until the migration failure metric is seen.
|
||||
def check_migration_failure_metrics():
|
||||
client = endpoint.http_client()
|
||||
raw_metrics = client.metrics()
|
||||
metrics = parse_metrics(raw_metrics)
|
||||
failed_migration = metrics.query_all(
|
||||
"compute_ctl_db_migration_failed_total",
|
||||
)
|
||||
assert len(failed_migration) == 1
|
||||
for sample in failed_migration:
|
||||
assert sample.value == 1
|
||||
|
||||
wait_until(check_migration_failure_metrics)
|
||||
|
||||
# Make sure that all migrations before the failed one are applied
|
||||
endpoint.wait_for_migrations(wait_for=i - 1)
|
||||
|
||||
# Confirm that we correctly recorded that in the
|
||||
@@ -34,17 +54,6 @@ def test_compute_migrations_retry(neon_simple_env: NeonEnv, compute_migrations_d
|
||||
migration_id = cast("int", cur.fetchall()[0][0])
|
||||
assert migration_id == i - 1
|
||||
|
||||
# Check that migration failure is properly recorded in the metrics
|
||||
client = endpoint.http_client()
|
||||
raw_metrics = client.metrics()
|
||||
metrics = parse_metrics(raw_metrics)
|
||||
failed_migration = metrics.query_all(
|
||||
"compute_ctl_db_migration_failed_total",
|
||||
)
|
||||
assert len(failed_migration) == 1
|
||||
for sample in failed_migration:
|
||||
assert sample.value == 1
|
||||
|
||||
endpoint.stop()
|
||||
|
||||
endpoint.start()
|
||||
|
||||
Reference in New Issue
Block a user