From 87a5d7db9e0f47eab8b48cebc6ab47045a8c9b1b Mon Sep 17 00:00:00 2001 From: Joonas Koivunen Date: Tue, 13 Aug 2024 20:49:50 +0300 Subject: [PATCH] test: do better job of shutting everything down (#8714) After #8655 we've had a few issues (mostly tracked on #8708) with the graceful shutdown. In order to shutdown more of the processes and catch more errors, for example, from all pageservers, do an immediate shutdown for those nodes which fail the initial (possibly graceful) shutdown. Cc: #6485 --- test_runner/fixtures/neon_fixtures.py | 40 +++++++++++++++++++++++++-- 1 file changed, 37 insertions(+), 3 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 6600b44759..961dbde95c 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1254,20 +1254,54 @@ class NeonEnv: Unless of course, some stopping failed, in that case, all remaining child processes are leaked. """ - self.endpoints.stop_all(fail_on_endpoint_errors) + + # the commonly failing components have special try-except behavior, + # trying to get us to actually shutdown all processes over easier error + # reporting. + + raise_later = None + try: + self.endpoints.stop_all(fail_on_endpoint_errors) + except Exception as e: + raise_later = e # Stop storage controller before pageservers: we don't want it to spuriously # detect a pageserver "failure" during test teardown self.storage_controller.stop(immediate=immediate) + stop_later = [] + metric_errors = [] + for sk in self.safekeepers: sk.stop(immediate=immediate) for pageserver in self.pageservers: if ps_assert_metric_no_errors: - pageserver.assert_no_metric_errors() - pageserver.stop(immediate=immediate) + try: + pageserver.assert_no_metric_errors() + except Exception as e: + metric_errors.append(e) + log.error(f"metric validation failed on {pageserver.id}: {e}") + try: + pageserver.stop(immediate=immediate) + except RuntimeError: + stop_later.append(pageserver) self.broker.stop(immediate=immediate) + # TODO: for nice logging we need python 3.11 ExceptionGroup + for ps in stop_later: + ps.stop(immediate=True) + + if raise_later is not None: + raise raise_later + + for error in metric_errors: + raise error + + if len(stop_later) > 0: + raise RuntimeError( + f"{len(stop_later)} out of {len(self.pageservers)} pageservers failed to stop gracefully" + ) + @property def pageserver(self) -> NeonPageserver: """