mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-30 11:30:37 +00:00
test: do a better job of shutting everything down (#8714)
After #8655 we've had a few issues (mostly tracked on #8708) with the graceful shutdown. In order to shut down more of the processes and catch more errors (for example, from all pageservers), do an immediate shutdown for those nodes which fail the initial (possibly graceful) shutdown. Cc: #6485
This commit is contained in:
@@ -1254,20 +1254,54 @@ class NeonEnv:
|
||||
|
||||
Unless of course, some stopping failed, in that case, all remaining child processes are leaked.
|
||||
"""
|
||||
self.endpoints.stop_all(fail_on_endpoint_errors)
|
||||
|
||||
# the commonly failing components have special try-except behavior,
|
||||
# trying to get us to actually shut down all processes over easier error
|
||||
# reporting.
|
||||
|
||||
raise_later = None
|
||||
try:
|
||||
self.endpoints.stop_all(fail_on_endpoint_errors)
|
||||
except Exception as e:
|
||||
raise_later = e
|
||||
|
||||
# Stop storage controller before pageservers: we don't want it to spuriously
|
||||
# detect a pageserver "failure" during test teardown
|
||||
self.storage_controller.stop(immediate=immediate)
|
||||
|
||||
stop_later = []
|
||||
metric_errors = []
|
||||
|
||||
for sk in self.safekeepers:
|
||||
sk.stop(immediate=immediate)
|
||||
for pageserver in self.pageservers:
|
||||
if ps_assert_metric_no_errors:
|
||||
pageserver.assert_no_metric_errors()
|
||||
pageserver.stop(immediate=immediate)
|
||||
try:
|
||||
pageserver.assert_no_metric_errors()
|
||||
except Exception as e:
|
||||
metric_errors.append(e)
|
||||
log.error(f"metric validation failed on {pageserver.id}: {e}")
|
||||
try:
|
||||
pageserver.stop(immediate=immediate)
|
||||
except RuntimeError:
|
||||
stop_later.append(pageserver)
|
||||
self.broker.stop(immediate=immediate)
|
||||
|
||||
# TODO: for nice logging we need python 3.11 ExceptionGroup
|
||||
for ps in stop_later:
|
||||
ps.stop(immediate=True)
|
||||
|
||||
if raise_later is not None:
|
||||
raise raise_later
|
||||
|
||||
for error in metric_errors:
|
||||
raise error
|
||||
|
||||
if len(stop_later) > 0:
|
||||
raise RuntimeError(
|
||||
f"{len(stop_later)} out of {len(self.pageservers)} pageservers failed to stop gracefully"
|
||||
)
|
||||
|
||||
@property
|
||||
def pageserver(self) -> NeonPageserver:
|
||||
"""
|
||||
|
||||
Reference in New Issue
Block a user