From fb4b6ce8dc1ffce66ebb5caf1350639cea5426d6 Mon Sep 17 00:00:00 2001 From: Heikki Linnakangas Date: Wed, 30 Nov 2022 14:36:19 +0200 Subject: [PATCH] Add debug information to hunt down port collisions. We've been seeing a lot of sporadic test failures with "Cannot assign requested address" lately. Add some debug information to help us find the cause: - When server startup fails, print "netstat -tnlap" output to the test log. If the failure was caused by "Cannot assign requested address", this will hopefully tell us which process was occupying the port. - In pageserver and safekeeper startup, print its PID. This way, we can correlate the PID from netstat output with the test that launched it. - In safekeeper startup, print the HTTP port it's using to the log, in addition to the libpq port. The pageserver was already doing it. --- pageserver/src/bin/pageserver.rs | 2 ++ safekeeper/src/bin/safekeeper.rs | 11 +++++++- test_runner/fixtures/neon_fixtures.py | 39 +++++++++++++++++++++++++-- 3 files changed, 49 insertions(+), 3 deletions(-) diff --git a/pageserver/src/bin/pageserver.rs b/pageserver/src/bin/pageserver.rs index 32d3fca47c..ef4e38b5ec 100644 --- a/pageserver/src/bin/pageserver.rs +++ b/pageserver/src/bin/pageserver.rs @@ -239,6 +239,8 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> { // we need to release the lock file only when the current process is gone let _ = Box::leak(Box::new(lock_file)); + info!("Created PID file with PID {}", Pid::this().to_string()); + // TODO: Check that it looks like a valid repository before going further // bind sockets before daemonizing so we report errors early and do not return until we are listening diff --git a/safekeeper/src/bin/safekeeper.rs b/safekeeper/src/bin/safekeeper.rs index 49e9e30cdc..fe23cf0202 100644 --- a/safekeeper/src/bin/safekeeper.rs +++ b/safekeeper/src/bin/safekeeper.rs @@ -165,18 +165,27 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option, init: bo // we need to release the lock file only when the current process is gone let _ = Box::leak(Box::new(lock_file)); + info!("Created PID file with PID {}", Pid::this().to_string()); + // Set or read our ID. set_id(&mut conf, given_id)?; if init { return Ok(()); } + info!( + "Starting safekeeper http handler on {}", + conf.listen_http_addr + ); let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_http_addr, e); e })?; - info!("Starting safekeeper on {}", conf.listen_pg_addr); + info!( + "Starting safekeeper pg protocol handler on {}", + conf.listen_pg_addr + ); let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| { error!("failed to bind to address {}: {}", conf.listen_pg_addr, e); e diff --git a/test_runner/fixtures/neon_fixtures.py b/test_runner/fixtures/neon_fixtures.py index 7fc2a7c24b..5564e4520e 100644 --- a/test_runner/fixtures/neon_fixtures.py +++ b/test_runner/fixtures/neon_fixtures.py @@ -1580,7 +1580,17 @@ class NeonCli(AbstractNeonCli): s3_env_vars = self.env.remote_storage.access_env_vars() extra_env_vars = (extra_env_vars or {}) | s3_env_vars - return self.raw_cli(start_args, extra_env_vars=extra_env_vars) + try: + return self.raw_cli(start_args, extra_env_vars=extra_env_vars) + except Exception: + # A common reason for startup failure is that the port is already in use. We + # coordinate port assignment with PortDistributor, but it's a common mistake + # when writing a new test to use a hardcoded port, or assign the port without + # using the distributor, causing races where two tests runnign concurrently + # sometimes choose the same port. To help debug such cases, get a listing + # of all inuse ports and the processes holding them. + list_inuse_ports() + raise def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]": cmd = ["pageserver", "stop"] @@ -1595,7 +1605,11 @@ class NeonCli(AbstractNeonCli): if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage): s3_env_vars = self.env.remote_storage.access_env_vars() - return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars) + try: + return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars) + except Exception: + list_inuse_ports() # see comment in pageserver_start + raise def safekeeper_stop( self, id: Optional[int] = None, immediate=False @@ -2981,3 +2995,24 @@ def fork_at_current_lsn( """ current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0] return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn) + + +def list_inuse_ports(): + """ + Print "netstat -tnlap" output to the test log. This is useful for debugging + port collisions in tests. + """ + + # This won't work on all platforms, because not all platforms have 'netstat', + # and the CLI arguments vary across platforms, too. macOS's netstat doesn't have + # the -p option, for example. So this is just best-effort. + res = subprocess.run( + ["netstat", "-tnlap"], + check=False, + universal_newlines=True, + capture_output=True, + ) + if res.returncode: + log.info(f"netstat -tnlap failed with return code {res.returncode}") + log.info(f"netstat -tnlap stdout: \n{res.stdout}\n") + log.info(f"netstat -tnlap stderr: \n{res.stderr}\n")