mirror of
https://github.com/neondatabase/neon.git
synced 2026-06-03 13:30:38 +00:00
Add debug information to hunt down port collisions.
We've been seeing a lot of sporadic test failures with "Cannot assign requested address" lately. Add some debug information to help us find the cause: - When server startup fails, print "netstat -tnlap" output to the test log. If the failure was caused by "Cannot assign requested address", this will hopefully tell us which process was occupying the port. - In pageserver and safekeeper startup, print its PID. This way, we can correlate the PID from netstat output with the test that launched it. - In safekeeper startup, print the HTTP port it's using to the log, in addition to the libpq port. The pageserver was already doing it.
This commit is contained in:
@@ -239,6 +239,8 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
|
||||
// we need to release the lock file only when the current process is gone
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
|
||||
info!("Created PID file with PID {}", Pid::this().to_string());
|
||||
|
||||
// TODO: Check that it looks like a valid repository before going further
|
||||
|
||||
// bind sockets before daemonizing so we report errors early and do not return until we are listening
|
||||
|
||||
@@ -165,18 +165,27 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
|
||||
// we need to release the lock file only when the current process is gone
|
||||
let _ = Box::leak(Box::new(lock_file));
|
||||
|
||||
info!("Created PID file with PID {}", Pid::this().to_string());
|
||||
|
||||
// Set or read our ID.
|
||||
set_id(&mut conf, given_id)?;
|
||||
if init {
|
||||
return Ok(());
|
||||
}
|
||||
|
||||
info!(
|
||||
"Starting safekeeper http handler on {}",
|
||||
conf.listen_http_addr
|
||||
);
|
||||
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
|
||||
e
|
||||
})?;
|
||||
|
||||
info!("Starting safekeeper on {}", conf.listen_pg_addr);
|
||||
info!(
|
||||
"Starting safekeeper pg protocol handler on {}",
|
||||
conf.listen_pg_addr
|
||||
);
|
||||
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
|
||||
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
|
||||
e
|
||||
|
||||
@@ -1580,7 +1580,17 @@ class NeonCli(AbstractNeonCli):
|
||||
s3_env_vars = self.env.remote_storage.access_env_vars()
|
||||
extra_env_vars = (extra_env_vars or {}) | s3_env_vars
|
||||
|
||||
return self.raw_cli(start_args, extra_env_vars=extra_env_vars)
|
||||
try:
|
||||
return self.raw_cli(start_args, extra_env_vars=extra_env_vars)
|
||||
except Exception:
|
||||
# A common reason for startup failure is that the port is already in use. We
|
||||
# coordinate port assignment with PortDistributor, but it's a common mistake
|
||||
# when writing a new test to use a hardcoded port, or assign the port without
|
||||
# using the distributor, causing races where two tests runnign concurrently
|
||||
# sometimes choose the same port. To help debug such cases, get a listing
|
||||
# of all inuse ports and the processes holding them.
|
||||
list_inuse_ports()
|
||||
raise
|
||||
|
||||
def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]":
|
||||
cmd = ["pageserver", "stop"]
|
||||
@@ -1595,7 +1605,11 @@ class NeonCli(AbstractNeonCli):
|
||||
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
|
||||
s3_env_vars = self.env.remote_storage.access_env_vars()
|
||||
|
||||
return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
|
||||
try:
|
||||
return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
|
||||
except Exception:
|
||||
list_inuse_ports() # see comment in pageserver_start
|
||||
raise
|
||||
|
||||
def safekeeper_stop(
|
||||
self, id: Optional[int] = None, immediate=False
|
||||
@@ -2981,3 +2995,24 @@ def fork_at_current_lsn(
|
||||
"""
|
||||
current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0]
|
||||
return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)
|
||||
|
||||
|
||||
def list_inuse_ports():
|
||||
"""
|
||||
Print "netstat -tnlap" output to the test log. This is useful for debugging
|
||||
port collisions in tests.
|
||||
"""
|
||||
|
||||
# This won't work on all platforms, because not all platforms have 'netstat',
|
||||
# and the CLI arguments vary across platforms, too. macOS's netstat doesn't have
|
||||
# the -p option, for example. So this is just best-effort.
|
||||
res = subprocess.run(
|
||||
["netstat", "-tnlap"],
|
||||
check=False,
|
||||
universal_newlines=True,
|
||||
capture_output=True,
|
||||
)
|
||||
if res.returncode:
|
||||
log.info(f"netstat -tnlap failed with return code {res.returncode}")
|
||||
log.info(f"netstat -tnlap stdout: \n{res.stdout}\n")
|
||||
log.info(f"netstat -tnlap stderr: \n{res.stderr}\n")
|
||||
|
||||
Reference in New Issue
Block a user