tests: log hygiene checks for storage controller (#6710)
## Problem

As with the pageserver, we should fail tests that emit unexpected log errors/warnings.

## Summary of changes

- Refactor existing log checks to be reusable
- Run log checks for attachment_service
- Add allow lists as needed
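The mechanism mirrors the pageserver one: each `NeonStorageController` carries an `allowed_errors` list of regexes, and test teardown scans `storage_controller.log` for `ERROR`/`WARN` lines that match none of them. A minimal sketch of the opt-in pattern a test uses (the test body and regex here are hypothetical; the fixture and field names come from the diff below):

```python
def test_example_expecting_warnings(neon_env_builder: NeonEnvBuilder):
    env = neon_env_builder.init_start()

    # Declare up front which WARN/ERROR lines this test may legitimately emit;
    # any other ERROR/WARN in storage_controller.log fails the test at teardown.
    env.storage_controller.allowed_errors.extend(
        [
            ".*Reconcile error.*",  # hypothetical: this test cancels a reconcile
        ]
    )

    # ... test body that provokes the expected warnings ...
```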
@@ -7,7 +7,9 @@ use std::{
     time::{Duration, Instant},
 };
 
-use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
+use crate::{
+    id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
+};
 use anyhow::Context;
 use control_plane::storage_controller::{
     AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
@@ -733,7 +735,19 @@ impl Service {
                     tenant.waiter.advance(result.sequence);
                 }
                 Err(e) => {
-                    tracing::warn!("Reconcile error: {}", e);
+                    match e {
+                        ReconcileError::Cancel => {
+                            tracing::info!("Reconciler was cancelled");
+                        }
+                        ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
+                            // This might be due to the reconciler getting cancelled, or it might
+                            // be due to the `Node` being marked offline.
+                            tracing::info!("Reconciler cancelled during pageserver API call");
+                        }
+                        _ => {
+                            tracing::warn!("Reconcile error: {}", e);
+                        }
+                    }
 
                     // Ordering: populate last_error before advancing error_seq,
                     // so that waiters will see the correct error after waiting.
@@ -3631,6 +3645,13 @@ impl Service {
                 observed_loc.conf = None;
             }
 
+            if new_nodes.len() == 1 {
+                // Special case for a single-node cluster: there is no point trying to
+                // reschedule any tenant shards; skip it to avoid spewing warnings about
+                // scheduling failures.
+                continue;
+            }
+
             if tenant_state.intent.demote_attached(node_id) {
                 tenant_state.sequence = tenant_state.sequence.next();
                 match tenant_state.schedule(scheduler) {
@@ -51,7 +51,7 @@ from fixtures.log_helper import log
 from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
 from fixtures.pageserver.allowed_errors import (
     DEFAULT_PAGESERVER_ALLOWED_ERRORS,
-    scan_pageserver_log_for_errors,
+    DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
 )
 from fixtures.pageserver.http import PageserverHttpClient
 from fixtures.pageserver.types import IndexPartDump
@@ -77,6 +77,7 @@ from fixtures.utils import (
     ATTACHMENT_NAME_REGEX,
     allure_add_grafana_links,
     allure_attach_from_dir,
+    assert_no_errors,
     get_self_dir,
     subprocess_capture,
     wait_until,
@@ -944,6 +945,8 @@ class NeonEnvBuilder:
         for pageserver in self.env.pageservers:
             pageserver.assert_no_errors()
 
+        self.env.storage_controller.assert_no_errors()
+
         try:
             self.overlay_cleanup_teardown()
         except Exception as e:
@@ -1961,6 +1964,7 @@ class NeonStorageController(MetricsGetter):
         self.env = env
         self.running = False
         self.auth_enabled = auth_enabled
+        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS
 
     def start(self):
         assert not self.running
@@ -1985,6 +1989,11 @@ class NeonStorageController(MetricsGetter):
                 msg = ""
             raise StorageControllerApiException(msg, res.status_code) from e
 
+    def assert_no_errors(self):
+        assert_no_errors(
+            self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
+        )
+
     def pageserver_api(self) -> PageserverHttpClient:
         """
         The storage controller implements a subset of the pageserver REST API, for mapping
@@ -2357,18 +2366,9 @@ class NeonPageserver(PgProtocol):
         return self.env.repo_dir / f"pageserver_{self.id}"
 
     def assert_no_errors(self):
-        logfile = self.workdir / "pageserver.log"
-        if not logfile.exists():
-            log.warning(f"Skipping log check: {logfile} does not exist")
-            return
-
-        with logfile.open("r") as f:
-            errors = scan_pageserver_log_for_errors(f, self.allowed_errors)
-
-        for _lineno, error in errors:
-            log.info(f"not allowed error: {error.strip()}")
-
-        assert not errors
+        assert_no_errors(
+            self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors
+        )
 
     def assert_no_metric_errors(self):
         """
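Both `NeonPageserver.assert_no_errors` and the new `NeonStorageController.assert_no_errors` are now thin wrappers over a shared helper in `fixtures.utils` (added further down). The helper can also be called directly; a sketch, with the path and allow list invented for illustration:

```python
from pathlib import Path

from fixtures.utils import assert_no_errors

# Scan an arbitrary log with an explicit allow list. The three-argument
# signature (log_file, service, allowed_errors) matches the helper below.
assert_no_errors(
    Path("/tmp/test_output/repo_dir/storage_controller.log"),
    "storage_controller",
    [".*Call to node.*management API.*failed.*receive body.*"],
)
```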
@@ -89,6 +89,16 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
 )
 
 
+DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
+    # Many tests take pageservers offline, resulting in log warnings when the
+    # controller fails to connect to them.
+    ".*Call to node.*management API.*failed.*receive body.*",
+    ".*Call to node.*management API.*failed.*ReceiveBody.*",
+    # Many tests will start up with a node offline
+    ".*startup_reconcile: Could not scan node.*",
+]
+
+
 def _check_allowed_errors(input):
     allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)
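Note that `scan_log_for_errors` (added below) tests these patterns with `re.match`, which anchors at the beginning of the line, so each entry needs a leading `.*` to skip past the timestamp and log level. A quick illustration with a made-up log line:

```python
import re

# Invented log line, shaped like the controller's tracing output.
line = "2024-02-12T10:00:00.000000Z  WARN startup_reconcile: Could not scan node 1"

assert re.match(".*startup_reconcile: Could not scan node.*", line)
# Without the leading .*, re.match fails because the timestamp comes first.
assert re.match("startup_reconcile: Could not scan node.*", line) is None
```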
@@ -11,6 +11,7 @@ from typing import (
     Any,
     Callable,
     Dict,
+    Iterable,
     List,
     Optional,
     Tuple,
@@ -447,3 +448,39 @@ def humantime_to_ms(humantime: str) -> float:
     )
 
     return round(total_ms, 3)
+
+
+def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
+    error_or_warn = re.compile(r"\s(ERROR|WARN)")
+    errors = []
+    for lineno, line in enumerate(input, start=1):
+        if len(line) == 0:
+            continue
+
+        if error_or_warn.search(line):
+            # Is this a torn log line? This happens when force-killing a process and restarting
+            # Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
+            if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
+                continue
+
+            # It's an ERROR or WARN. Is it in the allow-list?
+            for a in allowed_errors:
+                if re.match(a, line):
+                    break
+            else:
+                errors.append((lineno, line))
+    return errors
+
+
+def assert_no_errors(log_file, service, allowed_errors):
+    if not log_file.exists():
+        log.warning(f"Skipping {service} log check: {log_file} does not exist")
+        return
+
+    with log_file.open("r") as f:
+        errors = scan_log_for_errors(f, allowed_errors)
+
+    for _lineno, error in errors:
+        log.info(f"not allowed {service} error: {error.strip()}")
+
+    assert not errors, f"Log errors on {service}: {errors[0]}"
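As a sanity check of the scanner's behaviour — allow-listed lines are skipped, torn lines from force-kills are tolerated, and everything else is reported with its line number — here is a sketch with invented log lines (assuming `fixtures.utils` is importable, e.g. from within the test runner):

```python
from fixtures.utils import scan_log_for_errors

lines = [
    "2024-02-12T10:00:00.000000Z  INFO starting up\n",
    "2024-02-12T10:00:01.000000Z  WARN Call to node 1 management API failed: receive body\n",
    "2024-02-12T10:00:02.000000Z ERROR unexpected failure\n",
]
allowed = [".*Call to node.*management API.*failed.*receive body.*"]

errors = scan_log_for_errors(lines, allowed)
# Only the un-allow-listed ERROR line is reported, with its 1-based line number.
assert [lineno for lineno, _ in errors] == [3]
```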
@@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
     env = neon_simple_env
     pageserver_http_client = env.pageserver.http_client()
 
-    env.pageserver.allowed_errors.extend(
-        [
-            ".*invalid branch start lsn: less than latest GC cutoff.*",
-            ".*invalid branch start lsn: less than planned GC cutoff.*",
-        ]
-    )
+    error_regexes = [
+        ".*invalid branch start lsn: less than latest GC cutoff.*",
+        ".*invalid branch start lsn: less than planned GC cutoff.*",
+    ]
+    env.pageserver.allowed_errors.extend(error_regexes)
+    env.storage_controller.allowed_errors.extend(error_regexes)
 
     # Disable background GC but set the `pitr_interval` to be small, so GC can delete something
     tenant, _ = env.neon_cli.create_tenant(
@@ -14,9 +14,12 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
     neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"
     env = neon_env_builder.init_start()
 
-    env.pageserver.allowed_errors.extend(
-        [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"]
-    )
+    error_regexes = [
+        ".*invalid branch start lsn.*",
+        ".*invalid start lsn .* for ancestor timeline.*",
+    ]
+    env.pageserver.allowed_errors.extend(error_regexes)
+    env.storage_controller.allowed_errors.extend(error_regexes)
 
     # Branch at the point where only 100 rows were inserted
     branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")
@@ -238,6 +238,10 @@ def test_forward_compatibility(
         pg_distrib_dir=compatibility_postgres_distrib_dir,
     )
 
+    # TODO: remove this workaround after release-5090 is no longer the most recent release.
+    # That release has a bug that generates a warning in the storage controller log.
+    env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*")
+
     # Use current neon_local even though we're using old binaries for
     # everything else: our test code is written for latest CLI args.
     env.neon_local_binpath = neon_local_binpath
@@ -725,6 +725,20 @@ def test_sharding_split_failures(
     tenant_id = env.initial_tenant
     timeline_id = env.initial_timeline
 
+    env.storage_controller.allowed_errors.extend(
+        [
+            # All split failures log a warning when they enqueue the abort operation
+            ".*Enqueuing background abort.*",
+            # We exercise failure cases where the abort itself will also fail (node offline)
+            ".*abort_tenant_shard_split.*",
+            ".*Failed to abort.*",
+            # Tolerate any error logs that mention a failpoint
+            ".*failpoint.*",
+            # Node offline cases will fail to send requests
+            ".*Reconcile error: receive body: error sending request for url.*",
+        ]
+    )
+
     for ps in env.pageservers:
         # When we do node failures and abandon a shard, it will de facto have an old generation
         # and thereby be unable to publish remote consistent LSN updates
@@ -177,6 +177,7 @@ def test_node_status_after_restart(
     assert len(nodes) == 2
 
     env.pageservers[1].stop()
+    env.storage_controller.allowed_errors.extend([".*Could not scan node"])
 
     env.storage_controller.stop()
     env.storage_controller.start()
@@ -681,6 +682,9 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
     tenant_id = TenantId.generate()
     body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}
 
+    env.storage_controller.allowed_errors.append(".*Unauthorized.*")
+    env.storage_controller.allowed_errors.append(".*Forbidden.*")
+
     # No token
     with pytest.raises(
         StorageControllerApiException,
@@ -843,6 +847,12 @@ def test_sharding_service_heartbeats(
     env = neon_env_builder.init_configs()
     env.start()
 
+    # The default allow list permits connection errors, but this test also exercises error
+    # responses on the utilization endpoint.
+    env.storage_controller.allowed_errors.append(
+        ".*Call to node.*management API.*failed.*failpoint.*"
+    )
+
     # Initially we have two online pageservers
     nodes = env.storage_controller.node_list()
     assert len(nodes) == 2
@@ -36,7 +36,9 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
     )
     [d for d in tenants_dir.iterdir()]
 
-    neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")
+    error_regexes = [".*tenant-config-before-write.*"]
+    neon_simple_env.pageserver.allowed_errors.extend(error_regexes)
+    neon_simple_env.storage_controller.allowed_errors.extend(error_regexes)
 
     pageserver_http = neon_simple_env.pageserver.http_client()
     pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))