tests: log hygiene checks for storage controller (#6710)

## Problem

As with the pageserver, we should fail tests that emit unexpected log
errors/warnings.

## Summary of changes

- Refactor existing log checks to be reusable
- Run log checks for attachment_service
- Add allow lists as needed.
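
The mechanics mirror the existing pageserver checks: each `NeonStorageController` carries an `allowed_errors` list of regexes, and `NeonEnvBuilder` teardown scans `storage_controller.log` for `ERROR`/`WARN` lines not covered by it. A rough sketch of the intended test-side usage, assuming the fixture names used in the diff below (the test itself and its regex are hypothetical):

```python
# Illustrative only, not part of this commit: how a test opts specific storage
# controller warnings into the allow list before the teardown log check runs.
def test_example(neon_env_builder):
    env = neon_env_builder.init_start()

    # WARN/ERROR lines matching these regexes are tolerated; any other WARN/ERROR
    # line in storage_controller.log fails the test at teardown.
    env.storage_controller.allowed_errors.extend(
        [
            ".*Call to node.*management API.*failed.*",
        ]
    )

    # ... test body that takes a pageserver offline ...

    # NeonEnvBuilder teardown then calls env.storage_controller.assert_no_errors().
```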
Author: John Spray
Date: 2024-03-19 10:30:33 +00:00 (committed by GitHub)
Parent: 49be446d95
Commit: b80704cd34

10 changed files with 126 additions and 25 deletions

View File

@@ -7,7 +7,9 @@ use std::{
    time::{Duration, Instant},
};

use crate::{id_lock_map::IdLockMap, persistence::AbortShardSplitStatus};
use crate::{
    id_lock_map::IdLockMap, persistence::AbortShardSplitStatus, reconciler::ReconcileError,
};
use anyhow::Context;
use control_plane::storage_controller::{
    AttachHookRequest, AttachHookResponse, InspectRequest, InspectResponse,
@@ -733,7 +735,19 @@ impl Service {
                    tenant.waiter.advance(result.sequence);
                }
                Err(e) => {
                    tracing::warn!("Reconcile error: {}", e);
                    match e {
                        ReconcileError::Cancel => {
                            tracing::info!("Reconciler was cancelled");
                        }
                        ReconcileError::Remote(mgmt_api::Error::Cancelled) => {
                            // This might be due to the reconciler getting cancelled, or it might
                            // be due to the `Node` being marked offline.
                            tracing::info!("Reconciler cancelled during pageserver API call");
                        }
                        _ => {
                            tracing::warn!("Reconcile error: {}", e);
                        }
                    }

                    // Ordering: populate last_error before advancing error_seq,
                    // so that waiters will see the correct error after waiting.
@@ -3631,6 +3645,13 @@ impl Service {
                observed_loc.conf = None;
            }

            if new_nodes.len() == 1 {
                // Special case for single-node cluster: there is no point trying to reschedule
                // any tenant shards: avoid doing so, in order to avoid spewing warnings about
                // failures to schedule them.
                continue;
            }

            if tenant_state.intent.demote_attached(node_id) {
                tenant_state.sequence = tenant_state.sequence.next();
                match tenant_state.schedule(scheduler) {

View File

@@ -51,7 +51,7 @@ from fixtures.log_helper import log
from fixtures.metrics import Metrics, MetricsGetter, parse_metrics
from fixtures.pageserver.allowed_errors import (
    DEFAULT_PAGESERVER_ALLOWED_ERRORS,
    scan_pageserver_log_for_errors,
    DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS,
)
from fixtures.pageserver.http import PageserverHttpClient
from fixtures.pageserver.types import IndexPartDump
@@ -77,6 +77,7 @@ from fixtures.utils import (
    ATTACHMENT_NAME_REGEX,
    allure_add_grafana_links,
    allure_attach_from_dir,
    assert_no_errors,
    get_self_dir,
    subprocess_capture,
    wait_until,
@@ -944,6 +945,8 @@ class NeonEnvBuilder:
        for pageserver in self.env.pageservers:
            pageserver.assert_no_errors()

        self.env.storage_controller.assert_no_errors()

        try:
            self.overlay_cleanup_teardown()
        except Exception as e:
@@ -1961,6 +1964,7 @@ class NeonStorageController(MetricsGetter):
        self.env = env
        self.running = False
        self.auth_enabled = auth_enabled
        self.allowed_errors: list[str] = DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS

    def start(self):
        assert not self.running
@@ -1985,6 +1989,11 @@ class NeonStorageController(MetricsGetter):
msg = ""
raise StorageControllerApiException(msg, res.status_code) from e
def assert_no_errors(self):
assert_no_errors(
self.env.repo_dir / "storage_controller.log", "storage_controller", self.allowed_errors
)
def pageserver_api(self) -> PageserverHttpClient:
"""
The storage controller implements a subset of the pageserver REST API, for mapping
@@ -2357,18 +2366,9 @@ class NeonPageserver(PgProtocol):
        return self.env.repo_dir / f"pageserver_{self.id}"

    def assert_no_errors(self):
        logfile = self.workdir / "pageserver.log"
        if not logfile.exists():
            log.warning(f"Skipping log check: {logfile} does not exist")
            return

        with logfile.open("r") as f:
            errors = scan_pageserver_log_for_errors(f, self.allowed_errors)

        for _lineno, error in errors:
            log.info(f"not allowed error: {error.strip()}")
        assert not errors
        assert_no_errors(
            self.workdir / "pageserver.log", f"pageserver_{self.id}", self.allowed_errors
        )

    def assert_no_metric_errors(self):
        """

View File

@@ -89,6 +89,16 @@ DEFAULT_PAGESERVER_ALLOWED_ERRORS = (
)

DEFAULT_STORAGE_CONTROLLER_ALLOWED_ERRORS = [
    # Many tests will take pageservers offline, resulting in log warnings on the controller
    # failing to connect to them.
    ".*Call to node.*management API.*failed.*receive body.*",
    ".*Call to node.*management API.*failed.*ReceiveBody.*",
    # Many tests will start up with a node offline
    ".*startup_reconcile: Could not scan node.*",
]


def _check_allowed_errors(input):
    allowed_errors: List[str] = list(DEFAULT_PAGESERVER_ALLOWED_ERRORS)

View File

@@ -11,6 +11,7 @@ from typing import (
    Any,
    Callable,
    Dict,
    Iterable,
    List,
    Optional,
    Tuple,
@@ -447,3 +448,39 @@ def humantime_to_ms(humantime: str) -> float:
    )
    return round(total_ms, 3)


def scan_log_for_errors(input: Iterable[str], allowed_errors: List[str]) -> List[Tuple[int, str]]:
    error_or_warn = re.compile(r"\s(ERROR|WARN)")
    errors = []
    for lineno, line in enumerate(input, start=1):
        if len(line) == 0:
            continue

        if error_or_warn.search(line):
            # Is this a torn log line? This happens when force-killing a process and restarting
            # Example: "2023-10-25T09:38:31.752314Z WARN deletion executo2023-10-25T09:38:31.875947Z INFO version: git-env:0f9452f76e8ccdfc88291bccb3f53e3016f40192"
            if re.match("\\d{4}-\\d{2}-\\d{2}T.+\\d{4}-\\d{2}-\\d{2}T.+INFO version.+", line):
                continue

            # It's an ERROR or WARN. Is it in the allow-list?
            for a in allowed_errors:
                if re.match(a, line):
                    break
            else:
                errors.append((lineno, line))
    return errors


def assert_no_errors(log_file, service, allowed_errors):
    if not log_file.exists():
        log.warning(f"Skipping {service} log check: {log_file} does not exist")
        return

    with log_file.open("r") as f:
        errors = scan_log_for_errors(f, allowed_errors)

    for _lineno, error in errors:
        log.info(f"not allowed {service} error: {error.strip()}")

    assert not errors, f"Log errors on {service}: {errors[0]}"
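
For illustration (not part of the commit), this is how `scan_log_for_errors` treats an allowed warning versus an unexpected error; the log lines and variable names below are invented:

```python
from fixtures.utils import scan_log_for_errors  # helper added above

# Made-up log lines: an INFO line, a WARN covered by the allow list, and an
# ERROR that is not covered.
lines = [
    "2024-03-19T10:30:33.000000Z  INFO starting storage controller",
    "2024-03-19T10:30:34.000000Z  WARN Call to node 1 management API failed: receive body",
    "2024-03-19T10:30:35.000000Z ERROR tenant shard split failed",
]
allowed = [".*Call to node.*management API.*failed.*receive body.*"]

# Only the un-allowed ERROR line is reported, with its 1-based line number.
assert scan_log_for_errors(lines, allowed) == [(3, lines[2])]
```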

View File

@@ -120,12 +120,12 @@ def test_branch_creation_before_gc(neon_simple_env: NeonEnv):
    env = neon_simple_env
    pageserver_http_client = env.pageserver.http_client()

    env.pageserver.allowed_errors.extend(
        [
            ".*invalid branch start lsn: less than latest GC cutoff.*",
            ".*invalid branch start lsn: less than planned GC cutoff.*",
        ]
    )
    error_regexes = [
        ".*invalid branch start lsn: less than latest GC cutoff.*",
        ".*invalid branch start lsn: less than planned GC cutoff.*",
    ]
    env.pageserver.allowed_errors.extend(error_regexes)
    env.storage_controller.allowed_errors.extend(error_regexes)

    # Disable background GC but set the `pitr_interval` to be small, so GC can delete something
    tenant, _ = env.neon_cli.create_tenant(

View File

@@ -14,9 +14,12 @@ def test_branch_behind(neon_env_builder: NeonEnvBuilder):
    neon_env_builder.pageserver_config_override = "tenant_config={pitr_interval = '0 sec'}"

    env = neon_env_builder.init_start()
    env.pageserver.allowed_errors.extend(
        [".*invalid branch start lsn.*", ".*invalid start lsn .* for ancestor timeline.*"]
    )
    error_regexes = [
        ".*invalid branch start lsn.*",
        ".*invalid start lsn .* for ancestor timeline.*",
    ]
    env.pageserver.allowed_errors.extend(error_regexes)
    env.storage_controller.allowed_errors.extend(error_regexes)

    # Branch at the point where only 100 rows were inserted
    branch_behind_timeline_id = env.neon_cli.create_branch("test_branch_behind")

View File

@@ -238,6 +238,10 @@ def test_forward_compatibility(
        pg_distrib_dir=compatibility_postgres_distrib_dir,
    )

    # TODO: remove this workaround after release-5090 is no longer the most recent release.
    # There was a bug in that code that generates a warning in the storage controller log.
    env.storage_controller.allowed_errors.append(".*no tenant_shard_id specified.*")

    # Use current neon_local even though we're using old binaries for
    # everything else: our test code is written for latest CLI args.
    env.neon_local_binpath = neon_local_binpath

View File

@@ -725,6 +725,20 @@ def test_sharding_split_failures(
    tenant_id = env.initial_tenant
    timeline_id = env.initial_timeline

    env.storage_controller.allowed_errors.extend(
        [
            # All split failures log a warning when they enqueue the abort operation
            ".*Enqueuing background abort.*",
            # We exercise failure cases where abort itself will also fail (node offline)
            ".*abort_tenant_shard_split.*",
            ".*Failed to abort.*",
            # Tolerate any error logs that mention a failpoint
            ".*failpoint.*",
            # Node offline cases will fail to send requests
            ".*Reconcile error: receive body: error sending request for url.*",
        ]
    )

    for ps in env.pageservers:
        # When we do node failures and abandon a shard, it will de-facto have old generation and
        # thereby be unable to publish remote consistent LSN updates

View File

@@ -177,6 +177,7 @@ def test_node_status_after_restart(
    assert len(nodes) == 2

    env.pageservers[1].stop()
    env.storage_controller.allowed_errors.extend([".*Could not scan node"])

    env.storage_controller.stop()
    env.storage_controller.start()
@@ -681,6 +682,9 @@ def test_sharding_service_auth(neon_env_builder: NeonEnvBuilder):
    tenant_id = TenantId.generate()
    body: Dict[str, Any] = {"new_tenant_id": str(tenant_id)}

    env.storage_controller.allowed_errors.append(".*Unauthorized.*")
    env.storage_controller.allowed_errors.append(".*Forbidden.*")

    # No token
    with pytest.raises(
        StorageControllerApiException,
@@ -843,6 +847,12 @@ def test_sharding_service_heartbeats(
    env = neon_env_builder.init_configs()
    env.start()

    # Default log allow list permits connection errors, but this test will use error responses on
    # the utilization endpoint.
    env.storage_controller.allowed_errors.append(
        ".*Call to node.*management API.*failed.*failpoint.*"
    )

    # Initially we have two online pageservers
    nodes = env.storage_controller.node_list()
    assert len(nodes) == 2

View File

@@ -36,7 +36,9 @@ def test_tenant_creation_fails(neon_simple_env: NeonEnv):
    )
    [d for d in tenants_dir.iterdir()]

    neon_simple_env.pageserver.allowed_errors.append(".*tenant-config-before-write.*")
    error_regexes = [".*tenant-config-before-write.*"]
    neon_simple_env.pageserver.allowed_errors.extend(error_regexes)
    neon_simple_env.storage_controller.allowed_errors.extend(error_regexes)

    pageserver_http = neon_simple_env.pageserver.http_client()
    pageserver_http.configure_failpoints(("tenant-config-before-write", "return"))