storage controller: fix spurious reconciles after pageserver restarts (#6814)

## Problem

When investigating test failures
(https://github.com/neondatabase/neon/issues/6813) I noticed we were
doing a bunch of Reconciler runs right after splitting a tenant.

It's because the splitting test does a pageserver restart, and there was
a bug in /re-attach handling, where we would update the generation
correctly in the database and intent state, but not observed state,
thereby triggering a reconciliation on the next call to maybe_reconcile.
This didn't break anything profound (underlying rules about generations
were respected), but caused the storage controller to do an un-needed
extra round of bumping the generation and reconciling.

## Summary of changes

- Start adding metrics to the storage controller
- Assert on the number of reconciles done in test_sharding_split_smoke
- Fix /re-attach to update `observed` such that we don't spuriously
re-reconcile tenants.
This commit is contained in:
John Spray
2024-02-19 17:44:20 +00:00
committed by GitHub
parent e0c12faabd
commit 4f7704af24
12 changed files with 155 additions and 42 deletions

View File

@@ -4,6 +4,8 @@ from typing import Dict, List, Optional, Tuple
from prometheus_client.parser import text_string_to_metric_families
from prometheus_client.samples import Sample
from fixtures.log_helper import log
class Metrics:
metrics: Dict[str, List[Sample]]
@@ -31,6 +33,55 @@ class Metrics:
return res[0]
class MetricsGetter:
"""
Mixin for types that implement a `get_metrics` function and would like associated
helpers for querying the metrics
"""
def get_metrics(self) -> Metrics:
raise NotImplementedError()
def get_metric_value(
self, name: str, filter: Optional[Dict[str, str]] = None
) -> Optional[float]:
metrics = self.get_metrics()
results = metrics.query_all(name, filter=filter)
if not results:
log.info(f'could not find metric "{name}"')
return None
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def get_metrics_values(
self, names: list[str], filter: Optional[Dict[str, str]] = None
) -> Dict[str, float]:
"""
When fetching multiple named metrics, it is more efficient to use this
than to call `get_metric_value` repeatedly.
Throws RuntimeError if no metrics matching `names` are found, or if
not all of `names` are found: this method is intended for loading sets
of metrics whose existence is coupled.
"""
metrics = self.get_metrics()
samples = []
for name in names:
samples.extend(metrics.query_all(name, filter=filter))
result = {}
for sample in samples:
if sample.name in result:
raise RuntimeError(f"Multiple values found for {sample.name}")
result[sample.name] = sample.value
if len(result) != len(names):
log.info(f"Metrics found: {metrics.metrics}")
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
return result
def parse_metrics(text: str, name: str = "") -> Metrics:
metrics = Metrics(name)
gen = text_string_to_metric_families(text)

View File

@@ -46,6 +46,7 @@ from urllib3.util.retry import Retry
from fixtures import overlayfs
from fixtures.broker import NeonBroker
from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pageserver.allowed_errors import (
DEFAULT_PAGESERVER_ALLOWED_ERRORS,
scan_pageserver_log_for_errors,
@@ -1913,7 +1914,7 @@ class Pagectl(AbstractNeonCli):
return IndexPartDump.from_json(parsed)
class NeonAttachmentService:
class NeonAttachmentService(MetricsGetter):
def __init__(self, env: NeonEnv, auth_enabled: bool):
self.env = env
self.running = False
@@ -1951,6 +1952,11 @@ class NeonAttachmentService:
return headers
def get_metrics(self) -> Metrics:
res = self.request("GET", f"{self.env.attachment_service_api}/metrics")
res.raise_for_status()
return parse_metrics(res.text)
def ready(self) -> bool:
resp = self.request("GET", f"{self.env.attachment_service_api}/ready")
if resp.status_code == 503:

View File

@@ -12,7 +12,7 @@ from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry
from fixtures.log_helper import log
from fixtures.metrics import Metrics, parse_metrics
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pg_version import PgVersion
from fixtures.types import Lsn, TenantId, TenantShardId, TimelineId
from fixtures.utils import Fn
@@ -125,7 +125,7 @@ class TenantConfig:
)
class PageserverHttpClient(requests.Session):
class PageserverHttpClient(requests.Session, MetricsGetter):
def __init__(
self,
port: int,
@@ -721,45 +721,6 @@ class PageserverHttpClient(requests.Session):
assert len(matches) < 2, "above filter should uniquely identify metric"
return value
def get_metric_value(
self, name: str, filter: Optional[Dict[str, str]] = None
) -> Optional[float]:
metrics = self.get_metrics()
results = metrics.query_all(name, filter=filter)
if not results:
log.info(f'could not find metric "{name}"')
return None
assert len(results) == 1, f"metric {name} with given filters is not unique, got: {results}"
return results[0].value
def get_metrics_values(
self, names: list[str], filter: Optional[Dict[str, str]] = None
) -> Dict[str, float]:
"""
When fetching multiple named metrics, it is more efficient to use this
than to call `get_metric_value` repeatedly.
Throws RuntimeError if no metrics matching `names` are found, or if
not all of `names` are found: this method is intended for loading sets
of metrics whose existence is coupled.
"""
metrics = self.get_metrics()
samples = []
for name in names:
samples.extend(metrics.query_all(name, filter=filter))
result = {}
for sample in samples:
if sample.name in result:
raise RuntimeError(f"Multiple values found for {sample.name}")
result[sample.name] = sample.value
if len(result) != len(names):
log.info(f"Metrics found: {metrics.metrics}")
raise RuntimeError(f"could not find all metrics {' '.join(names)}")
return result
def layer_map_info(
self,
tenant_id: Union[TenantId, TenantShardId],

View File

@@ -255,3 +255,26 @@ def test_sharding_split_smoke(
env.neon_cli.tenant_migrate(migrate_shard, destination, timeout_secs=10)
workload.validate()
# Check that we didn't do any spurious reconciliations.
# Total number of reconciles should have been one per original shard, plus
# one for each shard that was migrated.
reconcile_ok = env.attachment_service.get_metric_value(
"storage_controller_reconcile_complete_total", filter={"status": "ok"}
)
assert reconcile_ok == shard_count + split_shard_count // 2
# Check that no cancelled or errored reconciliations occurred: this test does no
# failure injection and should run clean.
assert (
env.attachment_service.get_metric_value(
"storage_controller_reconcile_complete_total", filter={"status": "cancel"}
)
is None
)
assert (
env.attachment_service.get_metric_value(
"storage_controller_reconcile_complete_total", filter={"status": "error"}
)
is None
)