From 76aa6936e8e4303fff7809c6c2e9b9467086a0dd Mon Sep 17 00:00:00 2001 From: John Spray Date: Wed, 19 Jun 2024 13:14:50 +0100 Subject: [PATCH] tests: make Endpoint.stop() thread safe (occasional flakes in `test_multi_attach`) (#8110) ## Problem Tests using the `Workload` helper would occasionally fail in a strange way, where the endpoint appears to try and stop twice concurrently, and the second stop fails because the pidfile is already gone. `test_multi_attach` suffered from this. Workload has a `__del__` that stops the endpoint, and python is destroying this object in a different thread than NeonEnv.stop is called, resulting in racing stop() calls. Endpoint has a `running` attribute that avoids calling neon_local's stop twice, but that doesn't help in the concurrent case. ## Summary of changes - Make `Endpoint.stop` thread safe with a simple lock held across the updates to `running` and the actual act of stopping it. One could also work around this by letting Workload.endpoint outlive the Workload, or making Workload a context manager, but this change feels most robust, as it avoids all test code having to know that it must not try and stop an endpoint from a destructor. --- test_runner/fixtures/neon_fixtures.py | 53 +++++++++++++++++---------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 8994db8cf2..49857d5151 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -3446,6 +3446,12 @@ class Endpoint(PgProtocol, LogUtils): self.active_safekeepers: List[int] = list(map(lambda sk: sk.id, env.safekeepers)) # path to conf is /endpoints//pgdata/postgresql.conf + # This lock prevents concurrent start & stop operations, keeping `self.running` consistent + # with whether we're really running. Tests generally wouldn't try and do these concurrently, + # but endpoints are also stopped during test teardown, which might happen concurrently with + # destruction of objects in tests. + self.lock = threading.Lock() + def http_client( self, auth_token: Optional[str] = None, retries: Optional[Retry] = None ) -> EndpointHttpClient: @@ -3516,14 +3522,15 @@ class Endpoint(PgProtocol, LogUtils): log.info(f"Starting postgres endpoint {self.endpoint_id}") - self.env.neon_cli.endpoint_start( - self.endpoint_id, - safekeepers=self.active_safekeepers, - remote_ext_config=remote_ext_config, - pageserver_id=pageserver_id, - allow_multiple=allow_multiple, - ) - self.running = True + with self.lock: + self.env.neon_cli.endpoint_start( + self.endpoint_id, + safekeepers=self.active_safekeepers, + remote_ext_config=remote_ext_config, + pageserver_id=pageserver_id, + allow_multiple=allow_multiple, + ) + self.running = True return self @@ -3615,15 +3622,20 @@ class Endpoint(PgProtocol, LogUtils): def stop(self, mode: str = "fast") -> "Endpoint": """ Stop the Postgres instance if it's running. + + Because test teardown might try and stop an endpoint concurrently with test code + stopping the endpoint, this method is thread safe + Returns self. """ - if self.running: - assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( - self.endpoint_id, check_return_code=self.check_stop_result, mode=mode - ) - self.running = False + with self.lock: + if self.running: + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, check_return_code=self.check_stop_result, mode=mode + ) + self.running = False return self @@ -3633,12 +3645,13 @@ class Endpoint(PgProtocol, LogUtils): Returns self. """ - assert self.endpoint_id is not None - self.env.neon_cli.endpoint_stop( - self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode - ) - self.endpoint_id = None - self.running = False + with self.lock: + assert self.endpoint_id is not None + self.env.neon_cli.endpoint_stop( + self.endpoint_id, True, check_return_code=self.check_stop_result, mode=mode + ) + self.endpoint_id = None + self.running = False return self