storage controller: fix spurious reconciles after pageserver restarts (#6814)

## Problem When investigating test failures (https://github.com/neondatabase/neon/issues/6813) I noticed we were doing a bunch of Reconciler runs right after splitting a tenant. It's because the splitting test does a pageserver restart, and there was a bug in /re-attach handling, where we would update the generation correctly in the database and intent state, but not observed state, thereby triggering a reconciliation on the next call to maybe_reconcile. This didn't break anything profound (underlying rules about generations were respected), but caused the storage controller to do an un-needed extra round of bumping the generation and reconciling. ## Summary of changes - Start adding metrics to the storage controller - Assert on the number of reconciles done in test_sharding_split_smoke - Fix /re-attach to update `observed` such that we don't spuriously re-reconcile tenants.
2026-01-08 05:52:55 +00:00 · 2024-02-19 17:44:20 +00:00
parent e0c12faabd
commit 4f7704af24
12 changed files with 155 additions and 42 deletions
--- a/test_runner/fixtures/metrics.py
+++ b/test_runner/fixtures/metrics.py
@@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Tuple
 from prometheus_client.parser import text_string_to_metric_families
 from prometheus_client.samples import Sample

+from fixtures.log_helper import log
+

 class Metrics:
    metrics: Dict[str, List[Sample]]
@@ -31,6 +33,55 @@ class Metrics:
        return res[0]


+class MetricsGetter:
+    """
+    Mixin for types that implement a `get_metrics` function and would like associated
+    helpers for querying the metrics
+    """
+
+    def get_metrics(self) -> Metrics:
+        raise NotImplementedError()
+
+    def get_metric_value(
+        self, name: str, filter: Optional[Dict[str, str]] = None
+    ) -> Optional[float]:
+        metrics = self.get_metrics()
+        results = metrics.query_all(name, filter=filter)
+        if not results:
+            log.info(f'could not find metric "{name}"')
+            return None
+        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
+        return results[0].value
+
+    def get_metrics_values(
+        self, names: list[str], filter: Optional[Dict[str, str]] = None
+    ) -> Dict[str, float]:
+        """
+        When fetching multiple named metrics, it is more efficient to use this
+        than to call `get_metric_value` repeatedly.
+
+        Throws RuntimeError if no metrics matching `names` are found, or if
+        not all of `names` are found: this method is intended for loading sets
+        of metrics whose existence is coupled.
+        """
+        metrics = self.get_metrics()
+        samples = []
+        for name in names:
+            samples.extend(metrics.query_all(name, filter=filter))
+
+        result = {}
+        for sample in samples:
+            if sample.name in result:
+                raise RuntimeError(f"Multiple values found for {sample.name}")
+            result[sample.name] = sample.value
+
+        if len(result) != len(names):
+            log.info(f"Metrics found: {metrics.metrics}")
+            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
+
+        return result
+
+
 def parse_metrics(text: str, name: str = "") -> Metrics:
    metrics = Metrics(name)
    gen = text_string_to_metric_families(text)
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -46,6 +46,7 @@ from urllib3.util.retry import Retry
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
 from fixtures.log_helper import log
+from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
    DEFAULT_PAGESERVER_ALLOWED_ERRORS,
    scan_pageserver_log_for_errors,
@@ -1913,7 +1914,7 @@ class Pagectl(AbstractNeonCli):
        return IndexPartDump.from_json(parsed)


-class NeonAttachmentService:
+class NeonAttachmentService(MetricsGetter):
    def __init__(self, env: NeonEnv, auth_enabled: bool):
        self.env = env
        self.running = False
@@ -1951,6 +1952,11 @@ class NeonAttachmentService:

        return headers

+    def get_metrics(self) -> Metrics:
+        res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
+        res.raise_for_status()
+        return parse_metrics(res.text)
+
    def ready(self) -> bool:
        resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
        if resp.status_code == 503:
--- a/test_runner/fixtures/pageserver/http.py
+++ b/test_runner/fixtures/pageserver/http.py
@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry

 from fixtures.log_helper import log
-from fixtures.metrics import Metrics, parse_metrics
+from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import Fn
@@ -125,7 +125,7 @@ class TenantConfig:
        )


-class PageserverHttpClient(requests.Session):
+class PageserverHttpClient(requests.Session, MetricsGetter):
    def __init__(
        self,
        port: int,
@@ -721,45 +721,6 @@ class PageserverHttpClient(requests.Session):
            assert len(matches) < 2, "above filter should uniquely identify metric"
        return value

-    def get_metric_value(
-        self, name: str, filter: Optional[Dict[str, str]] = None
-    ) -> Optional[float]:
-        metrics = self.get_metrics()
-        results = metrics.query_all(name, filter=filter)
-        if not results:
-            log.info(f'could not find metric "{name}"')
-            return None
-        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
-        return results[0].value
-
-    def get_metrics_values(
-        self, names: list[str], filter: Optional[Dict[str, str]] = None
-    ) -> Dict[str, float]:
-        """
-        When fetching multiple named metrics, it is more efficient to use this
-        than to call `get_metric_value` repeatedly.
-
-        Throws RuntimeError if no metrics matching `names` are found, or if
-        not all of `names` are found: this method is intended for loading sets
-        of metrics whose existence is coupled.
-        """
-        metrics = self.get_metrics()
-        samples = []
-        for name in names:
-            samples.extend(metrics.query_all(name, filter=filter))
-
-        result = {}
-        for sample in samples:
-            if sample.name in result:
-                raise RuntimeError(f"Multiple values found for {sample.name}")
-            result[sample.name] = sample.value
-
-        if len(result) != len(names):
-            log.info(f"Metrics found: {metrics.metrics}")
-            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
-
-        return result
-
    def layer_map_info(
        self,
        tenant_id: Union[TenantId, TenantShardId],
--- a/test_runner/regress/test_sharding.py
+++ b/test_runner/regress/test_sharding.py
@@ -255,3 +255,26 @@ def test_sharding_split_smoke(
        env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)

    workload.validate()
+
+    # Check that we didn't do any spurious reconciliations.
+    # Total number of reconciles should have been one per original shard, plus
+    # one for each shard that was migrated.
+    reconcile_ok = env.attachment_service.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+    assert reconcile_ok == shard_count + split_shard_count // 2
+
+    # Check that no cancelled or errored reconciliations occurred: this test does no
+    # failure injection and should run clean.
+    assert (
+        env.attachment_service.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "cancel"}
+        )
+        is None
+    )
+    assert (
+        env.attachment_service.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "error"}
+        )
+        is None
+    )