test: do better job of shutting everything down (#8714)

After #8655 we've had a few issues (mostly tracked on #8708) with the
graceful shutdown. In order to shutdown more of the processes and catch
more errors, for example, from all pageservers, do an immediate shutdown
for those nodes which fail the initial (possibly graceful) shutdown.

Cc: #6485
This commit is contained in:
Joonas Koivunen
2024-08-13 20:49:50 +03:00
committed by GitHub
parent 9d2276323d
commit 87a5d7db9e

View File

@@ -1254,20 +1254,54 @@ class NeonEnv:
Unless of course, some stopping failed, in that case, all remaining child processes are leaked.
"""
self.endpoints.stop_all(fail_on_endpoint_errors)
# the commonly failing components have special try-except behavior,
# trying to get us to actually shutdown all processes over easier error
# reporting.
raise_later = None
try:
self.endpoints.stop_all(fail_on_endpoint_errors)
except Exception as e:
raise_later = e
# Stop storage controller before pageservers: we don't want it to spuriously
# detect a pageserver "failure" during test teardown
self.storage_controller.stop(immediate=immediate)
stop_later = []
metric_errors = []
for sk in self.safekeepers:
sk.stop(immediate=immediate)
for pageserver in self.pageservers:
if ps_assert_metric_no_errors:
pageserver.assert_no_metric_errors()
pageserver.stop(immediate=immediate)
try:
pageserver.assert_no_metric_errors()
except Exception as e:
metric_errors.append(e)
log.error(f"metric validation failed on {pageserver.id}: {e}")
try:
pageserver.stop(immediate=immediate)
except RuntimeError:
stop_later.append(pageserver)
self.broker.stop(immediate=immediate)
# TODO: for nice logging we need python 3.11 ExceptionGroup
for ps in stop_later:
ps.stop(immediate=True)
if raise_later is not None:
raise raise_later
for error in metric_errors:
raise error
if len(stop_later) > 0:
raise RuntimeError(
f"{len(stop_later)} out of {len(self.pageservers)} pageservers failed to stop gracefully"
)
@property
def pageserver(self) -> NeonPageserver:
"""