Add debug information to hunt down port collisions.

We've been seeing a lot of sporadic test failures with "Cannot assign requested address" lately. Add some debug information to help us find the cause: - When server startup fails, print "netstat -tnlap" output to the test log. If the failure was caused by "Cannot assign requested address", this will hopefully tell us which process was occupying the port. - In pageserver and safekeeper startup, print its PID. This way, we can correlate the PID from netstat output with the test that launched it. - In safekeeper startup, print the HTTP port it's using to the log, in addition to the libpq port. The pageserver was already doing it.
2026-03-05 01:10:38 +00:00 · 2022-11-30 14:36:19 +02:00
3 changed files with 49 additions and 3 deletions
--- a/pageserver/src/bin/pageserver.rs
+++ b/pageserver/src/bin/pageserver.rs
@@ -239,6 +239,8 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
    // we need to release the lock file only when the current process is gone
    let _ = Box::leak(Box::new(lock_file));

+    info!("Created PID file with PID {}", Pid::this().to_string());
+
    // TODO: Check that it looks like a valid repository before going further

    // bind sockets before daemonizing so we report errors early and do not return until we are listening
--- a/safekeeper/src/bin/safekeeper.rs
+++ b/safekeeper/src/bin/safekeeper.rs
@@ -165,18 +165,27 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
    // we need to release the lock file only when the current process is gone
    let _ = Box::leak(Box::new(lock_file));

+    info!("Created PID file with PID {}", Pid::this().to_string());
+
    // Set or read our ID.
    set_id(&mut conf, given_id)?;
    if init {
        return Ok(());
    }

+    info!(
+        "Starting safekeeper http handler on {}",
+        conf.listen_http_addr
+    );
    let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
        error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
        e
    })?;

-    info!("Starting safekeeper on {}", conf.listen_pg_addr);
+    info!(
+        "Starting safekeeper pg protocol handler on {}",
+        conf.listen_pg_addr
+    );
    let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
        error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
        e
--- a/test_runner/fixtures/neon_fixtures.py
+++ b/test_runner/fixtures/neon_fixtures.py
@@ -1580,7 +1580,17 @@ class NeonCli(AbstractNeonCli):
            s3_env_vars = self.env.remote_storage.access_env_vars()
            extra_env_vars = (extra_env_vars or {}) | s3_env_vars

-        return self.raw_cli(start_args, extra_env_vars=extra_env_vars)
+        try:
+            return self.raw_cli(start_args, extra_env_vars=extra_env_vars)
+        except Exception:
+            # A common reason for startup failure is that the port is already in use. We
+            # coordinate port assignment with PortDistributor, but it's a common mistake
+            # when writing a new test to use a hardcoded port, or assign the port without
+            # using the distributor, causing races where two tests runnign concurrently
+            # sometimes choose the same port. To help debug such cases, get a listing
+            # of all inuse ports and the processes holding them.
+            list_inuse_ports()
+            raise

    def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]":
        cmd = ["pageserver", "stop"]
@@ -1595,7 +1605,11 @@ class NeonCli(AbstractNeonCli):
        if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
            s3_env_vars = self.env.remote_storage.access_env_vars()

-        return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
+        try:
+            return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
+        except Exception:
+            list_inuse_ports()  # see comment in  pageserver_start
+            raise

    def safekeeper_stop(
        self, id: Optional[int] = None, immediate=False
@@ -2981,3 +2995,24 @@ def fork_at_current_lsn(
    """
    current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0]
    return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)
+
+
+def list_inuse_ports():
+    """
+    Print "netstat -tnlap" output to the test log. This is useful for debugging
+    port collisions in tests.
+    """
+
+    # This won't work on all platforms, because not all platforms have 'netstat',
+    # and the CLI arguments vary across platforms, too. macOS's netstat doesn't have
+    # the -p option, for example. So this is just best-effort.
+    res = subprocess.run(
+        ["netstat", "-tnlap"],
+        check=False,
+        universal_newlines=True,
+        capture_output=True,
+    )
+    if res.returncode:
+        log.info(f"netstat -tnlap failed with return code {res.returncode}")
+    log.info(f"netstat -tnlap stdout: \n{res.stdout}\n")
+    log.info(f"netstat -tnlap stderr: \n{res.stderr}\n")