mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-04 05:50:38 +00:00
tests: make Endpoint.stop() thread safe (occasional flakes in test_multi_attach) (#8110)
## Problem Tests using the `Workload` helper would occasionally fail in a strange way, where the endpoint appears to try and stop twice concurrently, and the second stop fails because the pidfile is already gone. `test_multi_attach` suffered from this. Workload has a `__del__` that stops the endpoint, and python is destroying this object in a different thread than NeonEnv.stop is called, resulting in racing stop() calls. Endpoint has a `running` attribute that avoids calling neon_local's stop twice, but that doesn't help in the concurrent case. ## Summary of changes - Make `Endpoint.stop` thread safe with a simple lock held across the updates to `running` and the actual act of stopping it. One could also work around this by letting Workload.endpoint outlive the Workload, or making Workload a context manager, but this change feels most robust, as it avoids all test code having to know that it must not try and stop an endpoint from a destructor.
This commit is contained in:
@@ -3446,6 +3446,12 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers))
|
||||
# path to conf is <repo_dir>/endpoints/<endpoint_id>/pgdata/postgresql.conf
|
||||
|
||||
# This lock prevents concurrent start & stop operations, keeping `self.running` consistent
|
||||
# with whether we're really running. Tests generally wouldn't try and do these concurrently,
|
||||
# but endpoints are also stopped during test teardown, which might happen concurrently with
|
||||
# destruction of objects in tests.
|
||||
self.lock = threading.Lock()
|
||||
|
||||
def http_client(
|
||||
self, auth_token: Optional[str] = None, retries: Optional[Retry] = None
|
||||
) -> EndpointHttpClient:
|
||||
@@ -3516,14 +3522,15 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
|
||||
log.info(f"Starting postgres endpoint {self.endpoint_id}")
|
||||
|
||||
self.env.neon_cli.endpoint_start(
|
||||
self.endpoint_id,
|
||||
safekeepers=self.active_safekeepers,
|
||||
remote_ext_config=remote_ext_config,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
)
|
||||
self.running = True
|
||||
with self.lock:
|
||||
self.env.neon_cli.endpoint_start(
|
||||
self.endpoint_id,
|
||||
safekeepers=self.active_safekeepers,
|
||||
remote_ext_config=remote_ext_config,
|
||||
pageserver_id=pageserver_id,
|
||||
allow_multiple=allow_multiple,
|
||||
)
|
||||
self.running = True
|
||||
|
||||
return self
|
||||
|
||||
@@ -3615,15 +3622,20 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
def stop(self, mode: str = "fast") -> "Endpoint":
|
||||
"""
|
||||
Stop the Postgres instance if it's running.
|
||||
|
||||
Because test teardown might try and stop an endpoint concurrently with test code
|
||||
stopping the endpoint, this method is thread safe
|
||||
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
if self.running:
|
||||
assert self.endpoint_id is not None
|
||||
self.env.neon_cli.endpoint_stop(
|
||||
self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
|
||||
)
|
||||
self.running = False
|
||||
with self.lock:
|
||||
if self.running:
|
||||
assert self.endpoint_id is not None
|
||||
self.env.neon_cli.endpoint_stop(
|
||||
self.endpoint_id, check_return_code=self.check_stop_result, mode=mode
|
||||
)
|
||||
self.running = False
|
||||
|
||||
return self
|
||||
|
||||
@@ -3633,12 +3645,13 @@ class Endpoint(PgProtocol, LogUtils):
|
||||
Returns self.
|
||||
"""
|
||||
|
||||
assert self.endpoint_id is not None
|
||||
self.env.neon_cli.endpoint_stop(
|
||||
self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
|
||||
)
|
||||
self.endpoint_id = None
|
||||
self.running = False
|
||||
with self.lock:
|
||||
assert self.endpoint_id is not None
|
||||
self.env.neon_cli.endpoint_stop(
|
||||
self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode
|
||||
)
|
||||
self.endpoint_id = None
|
||||
self.running = False
|
||||
|
||||
return self
|
||||
|
||||
|
||||
Reference in New Issue
Block a user