storage controller: fix spurious reconciles after pageserver restarts (#6814)
## Problem

While investigating test failures (https://github.com/neondatabase/neon/issues/6813), I noticed we were doing a bunch of Reconciler runs right after splitting a tenant. The splitting test restarts a pageserver, and there was a bug in `/re-attach` handling: we would update the generation correctly in the database and in intent state, but not in observed state, thereby triggering a reconciliation on the next call to `maybe_reconcile`. This didn't break anything profound (the underlying rules about generations were respected), but it caused the storage controller to do an unneeded extra round of bumping the generation and reconciling.

## Summary of changes

- Start adding metrics to the storage controller.
- Assert on the number of reconciles done in `test_sharding_split_smoke`.
- Fix `/re-attach` to update `observed` such that we don't spuriously re-reconcile tenants.
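For intuition, here is a minimal Python sketch of the failure mode. The storage controller itself is not implemented like this, and every name below is illustrative rather than taken from its code:

```python
# Illustrative sketch only: the real storage controller differs, and these
# names are hypothetical.
class TenantShard:
    def __init__(self, generation: int):
        self.intent = {"generation": generation}    # desired state
        self.observed = {"generation": generation}  # last-known pageserver state

    def maybe_reconcile(self) -> bool:
        # A Reconciler runs whenever intent and observed state diverge.
        return self.intent != self.observed

    def handle_re_attach(self, new_generation: int):
        # The database row and intent were both updated correctly...
        self.intent["generation"] = new_generation
        # ...but before the fix, `observed` was left stale, so the next
        # maybe_reconcile() reported a (spurious) divergence. The fix is to
        # record the new generation in observed state as well:
        self.observed["generation"] = new_generation


shard = TenantShard(generation=1)
shard.handle_re_attach(new_generation=2)
assert not shard.maybe_reconcile()  # no spurious reconcile after the fix
```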
```diff
@@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Tuple
 from prometheus_client.parser import text_string_to_metric_families
 from prometheus_client.samples import Sample
 
+from fixtures.log_helper import log
+
 
 class Metrics:
     metrics: Dict[str, List[Sample]]
@@ -31,6 +33,55 @@ class Metrics:
         return res[0]
 
 
+class MetricsGetter:
+    """
+    Mixin for types that implement a `get_metrics` function and would like associated
+    helpers for querying the metrics
+    """
+
+    def get_metrics(self) -> Metrics:
+        raise NotImplementedError()
+
+    def get_metric_value(
+        self, name: str, filter: Optional[Dict[str, str]] = None
+    ) -> Optional[float]:
+        metrics = self.get_metrics()
+        results = metrics.query_all(name, filter=filter)
+        if not results:
+            log.info(f'could not find metric "{name}"')
+            return None
+        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
+        return results[0].value
+
+    def get_metrics_values(
+        self, names: list[str], filter: Optional[Dict[str, str]] = None
+    ) -> Dict[str, float]:
+        """
+        When fetching multiple named metrics, it is more efficient to use this
+        than to call `get_metric_value` repeatedly.
+
+        Throws RuntimeError if no metrics matching `names` are found, or if
+        not all of `names` are found: this method is intended for loading sets
+        of metrics whose existence is coupled.
+        """
+        metrics = self.get_metrics()
+        samples = []
+        for name in names:
+            samples.extend(metrics.query_all(name, filter=filter))
+
+        result = {}
+        for sample in samples:
+            if sample.name in result:
+                raise RuntimeError(f"Multiple values found for {sample.name}")
+            result[sample.name] = sample.value
+
+        if len(result) != len(names):
+            log.info(f"Metrics found: {metrics.metrics}")
+            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
+
+        return result
+
+
 def parse_metrics(text: str, name: str = "") -> Metrics:
     metrics = Metrics(name)
     gen = text_string_to_metric_families(text)
```
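As a usage sketch: any fixture that can serve a Prometheus text exposition gains the query helpers by implementing only `get_metrics`. The class and URL below are hypothetical and not part of this change:

```python
import requests

from fixtures.metrics import Metrics, MetricsGetter, parse_metrics


class MyService(MetricsGetter):
    """Hypothetical fixture; only get_metrics() must be provided."""

    def __init__(self, metrics_url: str):
        self.metrics_url = metrics_url

    def get_metrics(self) -> Metrics:
        # Fetch the Prometheus text exposition and parse it into Metrics.
        res = requests.get(self.metrics_url)
        res.raise_for_status()
        return parse_metrics(res.text)


svc = MyService("http://localhost:1234/metrics")  # hypothetical endpoint
# Inherited helper: returns the unique matching sample's value, or None.
ok_count = svc.get_metric_value(
    "storage_controller_reconcile_complete_total", filter={"status": "ok"}
)
```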
```diff
@@ -46,6 +46,7 @@ from urllib3.util.retry import Retry
 from fixtures import overlayfs
 from fixtures.broker import NeonBroker
 from fixtures.log_helper import log
+from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
     DEFAULT_PAGESERVER_ALLOWED_ERRORS,
     scan_pageserver_log_for_errors,
@@ -1913,7 +1914,7 @@ class Pagectl(AbstractNeonCli):
         return IndexPartDump.from_json(parsed)
 
 
-class NeonAttachmentService:
+class NeonAttachmentService(MetricsGetter):
     def __init__(self, env: NeonEnv, auth_enabled: bool):
         self.env = env
         self.running = False
@@ -1951,6 +1952,11 @@ class NeonAttachmentService:
 
         return headers
 
+    def get_metrics(self) -> Metrics:
+        res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
+        res.raise_for_status()
+        return parse_metrics(res.text)
+
     def ready(self) -> bool:
         resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
         if resp.status_code == 503:
```
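With `get_metrics` on the attachment service, tests can read the controller's counters directly. A sketch, assuming an `env: NeonEnv` fixture is in scope as in the test further below:

```python
from fixtures.log_helper import log

# Read the reconcile-outcome counter for each status label in turn.
for status in ("ok", "error", "cancel"):
    value = env.attachment_service.get_metric_value(
        "storage_controller_reconcile_complete_total", filter={"status": status}
    )
    # get_metric_value returns None when the labelled series was never
    # incremented, which is what the test relies on for "ran clean".
    log.info(f"reconcile_complete[{status}] = {value}")
```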
```diff
@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
 from urllib3.util.retry import Retry
 
 from fixtures.log_helper import log
-from fixtures.metrics import Metrics, parse_metrics
+from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pg_version import PgVersion
 from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
 from fixtures.utils import Fn
@@ -125,7 +125,7 @@ class TenantConfig:
         )
 
 
-class PageserverHttpClient(requests.Session):
+class PageserverHttpClient(requests.Session, MetricsGetter):
     def __init__(
         self,
         port: int,
@@ -721,45 +721,6 @@ class PageserverHttpClient(requests.Session):
         assert len(matches) < 2, "above filter should uniquely identify metric"
         return value
 
-    def get_metric_value(
-        self, name: str, filter: Optional[Dict[str, str]] = None
-    ) -> Optional[float]:
-        metrics = self.get_metrics()
-        results = metrics.query_all(name, filter=filter)
-        if not results:
-            log.info(f'could not find metric "{name}"')
-            return None
-        assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
-        return results[0].value
-
-    def get_metrics_values(
-        self, names: list[str], filter: Optional[Dict[str, str]] = None
-    ) -> Dict[str, float]:
-        """
-        When fetching multiple named metrics, it is more efficient to use this
-        than to call `get_metric_value` repeatedly.
-
-        Throws RuntimeError if no metrics matching `names` are found, or if
-        not all of `names` are found: this method is intended for loading sets
-        of metrics whose existence is coupled.
-        """
-        metrics = self.get_metrics()
-        samples = []
-        for name in names:
-            samples.extend(metrics.query_all(name, filter=filter))
-
-        result = {}
-        for sample in samples:
-            if sample.name in result:
-                raise RuntimeError(f"Multiple values found for {sample.name}")
-            result[sample.name] = sample.value
-
-        if len(result) != len(names):
-            log.info(f"Metrics found: {metrics.metrics}")
-            raise RuntimeError(f"could not find all metrics {' '.join(names)}")
-
-        return result
-
     def layer_map_info(
         self,
         tenant_id: Union[TenantId, TenantShardId],
```
```diff
@@ -255,3 +255,26 @@ def test_sharding_split_smoke(
     env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)
 
     workload.validate()
+
+    # Check that we didn't do any spurious reconciliations.
+    # Total number of reconciles should have been one per original shard, plus
+    # one for each shard that was migrated.
+    reconcile_ok = env.attachment_service.get_metric_value(
+        "storage_controller_reconcile_complete_total", filter={"status": "ok"}
+    )
+    assert reconcile_ok == shard_count + split_shard_count // 2
+
+    # Check that no cancelled or errored reconciliations occurred: this test does no
+    # failure injection and should run clean.
+    assert (
+        env.attachment_service.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "cancel"}
+        )
+        is None
+    )
+    assert (
+        env.attachment_service.get_metric_value(
+            "storage_controller_reconcile_complete_total", filter={"status": "error"}
+        )
+        is None
+    )
```
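For concreteness, the expected count works out as follows under illustrative parameter values (the test's actual parametrization may differ):

```python
# Illustrative arithmetic only; the real values come from the test's
# parametrization.
shard_count = 2                     # shards at tenant creation
split_shard_count = 4               # shards after the split
migrated = split_shard_count // 2   # half the split shards are migrated
expected_ok = shard_count + migrated
assert expected_ok == 4             # one reconcile per attach + per migration
```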