tests: log hygiene checks for storage controller (#6710)

## Problem

As with the pageserver, we should fail tests that emit unexpected log
errors/warnings.

## Summary of changes

- Refactor existing log checks to be reusable
- Run log checks for attachment_service
- Add allow lists as needed.
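
The mechanics mirror the existing pageserver checks: each `NeonStorageController` carries an `allowed_errors` list of regexes, and `NeonEnvBuilder` teardown scans `storage_controller.log` for `ERROR`/`WARN` lines not covered by it. A rough sketch of the intended test-side usage, assuming the fixture names used in the diff below (the test itself and its regex are hypothetical):

```python
# Illustrative only, not part of this commit: how a test opts specific storage
# controller warnings into the allow list before the teardown log check runs.
def test_example(neon_env_builder):
    env = neon_env_builder.init_start()

    # WARN/ERROR lines matching these regexes are tolerated; any other WARN/ERROR
    # line in storage_controller.log fails the test at teardown.
    env.storage_controller.allowed_errors.extend(
        [
            ".*Call to node.*management API.*failed.*",
        ]
    )

    # ... test body that takes a pageserver offline ...

    # NeonEnvBuilder teardown then calls env.storage_controller.assert_no_errors().
```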
Author: John Spray
Date: 2024-03-19 10:30:33 +00:00 (committed by GitHub)
Parent: 49be446d95
Commit: b80704cd34

10 changed files with 126 additions and 25 deletions

View File

@@ -7,7 +7,9 @@ use std::{
    time::{Duration, Instant},
};

use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
use crate::{
    id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
};
use anyhow::Context;
use control_plane::storage_controller::{
    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
@@ -733,7 +735,19 @@ impl Service {
                    tenant.waiter.advance(result.sequence);
                }
                Err(e) => {
                    tracing::warn!("Reconcile error: {}", e);
                    match e {
                        ReconcileError::Cancel => {
                            tracing::info!("Reconciler was cancelled");
                        }
                        ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
                            // This might be due to the reconciler getting cancelled, or it might
                            // be due to the `Node` being marked offline.
                            tracing::info!("Reconciler cancelled during pageserver API call");
                        }
                        _ => {
                            tracing::warn!("Reconcile error: {}", e);
                        }
                    }

                    // Ordering: populate last_error before advancing error_seq,
                    // so that waiters will see the correct error after waiting.
@@ -3631,6 +3645,13 @@ impl Service {
                observed_loc.conf = None;
            }

            if new_nodes.len() == 1 {
                // Special case for single-node cluster: there is no point trying to reschedule
                // any tenant shards: avoid doing so, in order to avoid spewing warnings about
                // failures to schedule them.
                continue;
            }

            if tenant_state.intent.demote_attached(node_id) {
                tenant_state.sequence = tenant_state.sequence.next();
                match tenant_state.schedule(scheduler) {

View File

@@ -51,7 +51,7 @@ from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pageserver.allowed_errors import (
    DEFAULT_PAGESERVER_ALLOWED_ERRORS,
    scan_pageserver_log_for_errors,
    DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.types import IndexPartDump
@@ -77,6 +77,7 @@ from fixtures.utils import (
    ATTACHMENT_NAME_REGEX,
    allure_add_grafana_links,
    allure_attach_from_dir,
    assert_no_errors,
    get_self_dir,
    subprocess_capture,
    wait_until,
@@ -944,6 +945,8 @@ class NeonEnvBuilder:
        for pageserver in self.env.pageservers:
            pageserver.assert_no_errors()

        self.env.storage_controller.assert_no_errors()

        try:
            self.overlay_cleanup_teardown()
        except Exception as e:
@@ -1961,6 +1964,7 @@ class NeonStorageController(MetricsGetter):
        self.env = env
        self.running = False
        self.auth_enabled = auth_enabled
        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS

    def start(self):
        assert not self.running
@@ -1985,6 +1989,11 @@ class NeonStorageController(MetricsGetter):
msg = ""
raise StorageControllerApiException(msg, res.status_code) from e
def assert_no_errors(self):
assert_no_errors(
self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
)
def pageserver_api(self) -> PageserverHttpClient:
"""
The storage controller implements a subset of the pageserver REST API, for mapping
@@ -2357,18 +2366,9 @@ class NeonPageserver(PgProtocol):
        return self.env.repo_dir / f"pageserver_{self.id}"

    def assert_no_errors(self):
        logfile = self.workdir / "pageserver.log"
        if not logfile.exists():
            log.warning(f"Skipping log check: {logfile} does not exist")
            return

        with logfile.open("r") as f:
            errors = scan_pageserver_log_for_errors(f, self.allowed_errors)

        for _lineno, error in errors:
            log.info(f"not allowed error: {error.strip()}")
        assert not errors
        assert_no_errors(
            self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors
        )

    def assert_no_metric_errors(self):
        """

View File

@@ -89,6 +89,16 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
)

DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
    # Many tests will take pageservers offline, resulting in log warnings on the controller
    # failing to connect to them.
    ".*Call to node.*management API.*failed.*receive body.*",
    ".*Call to node.*management API.*failed.*ReceiveBody.*",
    # Many tests will start up with a node offline
    ".*startup_reconcile: Could not scan node.*",
]


def _check_allowed_errors(input):
    allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)

View File

@@ -11,6 +11,7 @@ from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
@@ -447,3 +448,39 @@ def humantime_to_ms(humantime: str) -> float:
    )
    return round(total_ms, 3)


def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
    error_or_warn = re.compile(r"\s(ERROR|WARN)")
    errors = []
    for lineno, line in enumerate(input, start=1):
        if len(line) == 0:
            continue

        if error_or_warn.search(line):
            # Is this a torn log line? This happens when force-killing a process and restarting
            # Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
            if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
                continue

            # It's an ERROR or WARN. Is it in the allow-list?
            for a in allowed_errors:
                if re.match(a, line):
                    break
            else:
                errors.append((lineno, line))
    return errors


def assert_no_errors(log_file, service, allowed_errors):
    if not log_file.exists():
        log.warning(f"Skipping {service} log check: {log_file} does not exist")
        return

    with log_file.open("r") as f:
        errors = scan_log_for_errors(f, allowed_errors)

    for _lineno, error in errors:
        log.info(f"not allowed {service} error: {error.strip()}")

    assert not errors, f"Log errors on {service}: {errors[0]}"
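
For illustration (not part of the commit), this is how `scan_log_for_errors` treats an allowed warning versus an unexpected error; the log lines and variable names below are invented:

```python
from fixtures.utils import scan_log_for_errors  # helper added above

# Made-up log lines: an INFO line, a WARN covered by the allow list, and an
# ERROR that is not covered.
lines = [
    "2024-03-19T10:30:33.000000Z  INFO starting storage controller",
    "2024-03-19T10:30:34.000000Z  WARN Call to node 1 management API failed: receive body",
    "2024-03-19T10:30:35.000000Z ERROR tenant shard split failed",
]
allowed = [".*Call to node.*management API.*failed.*receive body.*"]

# Only the un-allowed ERROR line is reported, with its 1-based line number.
assert scan_log_for_errors(lines, allowed) == [(3, lines[2])]
```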

View File

@@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http_client = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
        [
            ".*invalid branch start lsn: less than latest GC cutoff.*",
            ".*invalid branch start lsn: less than planned GC cutoff.*",
        ]
    )
    error_regexes = [
        ".*invalid branch start lsn: less than latest GC cutoff.*",
        ".*invalid branch start lsn: less than planned GC cutoff.*",
    ]
    env.pageserver.allowed_errors.extend(error_regexes)
    env.storage_controller.allowed_errors.extend(error_regexes)

    # Disable background GC but set the `pitr_interval` to be small, so GC can delete something
    tenant, _ = env.neon_cli.create_tenant(

View File

@@ -14,9 +14,12 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"

    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.extend(
        [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"]
    )
    error_regexes = [
        ".*invalid branch start lsn.*",
        ".*invalid start lsn .* for ancestor timeline.*",
    ]
    env.pageserver.allowed_errors.extend(error_regexes)
    env.storage_controller.allowed_errors.extend(error_regexes)

    # Branch at the point where only 100 rows were inserted
    branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")

View File

@@ -238,6 +238,10 @@ def test_forward_compatibility(
        pg_distrib_dir=compatibility_postgres_distrib_dir,
    )

    # TODO: remove this workaround after release-5090 is no longer the most recent release.
    # There was a bug in that code that generates a warning in the storage controller log.
    env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*")

    # Use current neon_local even though we're using old binaries for
    # everything else: our test code is written for latest CLI args.
    env.neon_local_binpath = neon_local_binpath

View File

@@ -725,6 +725,20 @@ def test_sharding_split_failures(
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    env.storage_controller.allowed_errors.extend(
        [
            # All split failures log a warning when they enqueue the abort operation
            ".*Enqueuing background abort.*",
            # We exercise failure cases where abort itself will also fail (node offline)
            ".*abort_tenant_shard_split.*",
            ".*Failed to abort.*",
            # Tolerate any error logs that mention a failpoint
            ".*failpoint.*",
            # Node offline cases will fail to send requests
            ".*Reconcile error: receive body: error sending request for url.*",
        ]
    )

    for ps in env.pageservers:
        # When we do node failures and abandon a shard, it will de-facto have old generation and
        # thereby be unable to publish remote consistent LSN updates

View File

@@ -177,6 +177,7 @@ def test_node_status_after_restart(
    assert len(nodes) == 2

    env.pageservers[1].stop()
    env.storage_controller.allowed_errors.extend([".*Could not scan node"])

    env.storage_controller.stop()
    env.storage_controller.start()
@@ -681,6 +682,9 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
    tenant_id = TenantId.generate()
    body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}

    env.storage_controller.allowed_errors.append(".*Unauthorized.*")
    env.storage_controller.allowed_errors.append(".*Forbidden.*")

    # No token
    with pytest.raises(
        StorageControllerApiException,
@@ -843,6 +847,12 @@ def test_sharding_service_heartbeats(
    env = neon_env_builder.init_configs()
    env.start()

    # Default log allow list permits connection errors, but this test will use error responses on
    # the utilization endpoint.
    env.storage_controller.allowed_errors.append(
        ".*Call to node.*management API.*failed.*failpoint.*"
    )

    # Initially we have two online pageservers
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2

View File

@@ -36,7 +36,9 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
    )
    [d for d in tenants_dir.iterdir()]

    neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")
    error_regexes = [".*tenant-config-before-write.*"]
    neon_simple_env.pageserver.allowed_errors.extend(error_regexes)
    neon_simple_env.storage_controller.allowed_errors.extend(error_regexes)

    pageserver_http = neon_simple_env.pageserver.http_client()
    pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))