Compare commits

...

1 Commits

Author SHA1 Message Date
Heikki Linnakangas
fb4b6ce8dc Add debug information to hunt down port collisions.
We've been seeing a lot of sporadic test failures with "Cannot assign
requested address" lately. Add some debug information to help us find
the cause:

- When server startup fails, print "netstat -tnlap" output to the test
  log. If the failure was caused by "Cannot assign requested address",
  this will hopefully tell us which process was occupying the port.
- In pageserver and safekeeper startup, print its PID. This way, we can
  correlate the PID from netstat output with the test that launched it.
- In safekeeper startup, print the HTTP port it's using to the log, in
  addition to the libpq port. The pageserver was already doing it.
2022-11-30 14:36:19 +02:00
3 changed files with 49 additions and 3 deletions

View File

@@ -239,6 +239,8 @@ fn start_pageserver(conf: &'static PageServerConf) -> anyhow::Result<()> {
// we need to release the lock file only when the current process is gone
let _ = Box::leak(Box::new(lock_file));
info!("Created PID file with PID {}", Pid::this().to_string());
// TODO: Check that it looks like a valid repository before going further
// bind sockets before daemonizing so we report errors early and do not return until we are listening

View File

@@ -165,18 +165,27 @@ fn start_safekeeper(mut conf: SafeKeeperConf, given_id: Option<NodeId>, init: bo
// we need to release the lock file only when the current process is gone
let _ = Box::leak(Box::new(lock_file));
info!("Created PID file with PID {}", Pid::this().to_string());
// Set or read our ID.
set_id(&mut conf, given_id)?;
if init {
return Ok(());
}
info!(
"Starting safekeeper http handler on {}",
conf.listen_http_addr
);
let http_listener = tcp_listener::bind(conf.listen_http_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_http_addr, e);
e
})?;
info!("Starting safekeeper on {}", conf.listen_pg_addr);
info!(
"Starting safekeeper pg protocol handler on {}",
conf.listen_pg_addr
);
let pg_listener = tcp_listener::bind(conf.listen_pg_addr.clone()).map_err(|e| {
error!("failed to bind to address {}: {}", conf.listen_pg_addr, e);
e

View File

@@ -1580,7 +1580,17 @@ class NeonCli(AbstractNeonCli):
s3_env_vars = self.env.remote_storage.access_env_vars()
extra_env_vars = (extra_env_vars or {}) | s3_env_vars
return self.raw_cli(start_args, extra_env_vars=extra_env_vars)
try:
return self.raw_cli(start_args, extra_env_vars=extra_env_vars)
except Exception:
# A common reason for startup failure is that the port is already in use. We
# coordinate port assignment with PortDistributor, but it's a common mistake
# when writing a new test to use a hardcoded port, or assign the port without
# using the distributor, causing races where two tests runnign concurrently
# sometimes choose the same port. To help debug such cases, get a listing
# of all inuse ports and the processes holding them.
list_inuse_ports()
raise
def pageserver_stop(self, immediate=False) -> "subprocess.CompletedProcess[str]":
cmd = ["pageserver", "stop"]
@@ -1595,7 +1605,11 @@ class NeonCli(AbstractNeonCli):
if self.env.remote_storage is not None and isinstance(self.env.remote_storage, S3Storage):
s3_env_vars = self.env.remote_storage.access_env_vars()
return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
try:
return self.raw_cli(["safekeeper", "start", str(id)], extra_env_vars=s3_env_vars)
except Exception:
list_inuse_ports() # see comment in pageserver_start
raise
def safekeeper_stop(
self, id: Optional[int] = None, immediate=False
@@ -2981,3 +2995,24 @@ def fork_at_current_lsn(
"""
current_lsn = pg.safe_psql("SELECT pg_current_wal_lsn()")[0][0]
return env.neon_cli.create_branch(new_branch_name, ancestor_branch_name, tenant_id, current_lsn)
def list_inuse_ports():
"""
Print "netstat -tnlap" output to the test log. This is useful for debugging
port collisions in tests.
"""
# This won't work on all platforms, because not all platforms have 'netstat',
# and the CLI arguments vary across platforms, too. macOS's netstat doesn't have
# the -p option, for example. So this is just best-effort.
res = subprocess.run(
["netstat", "-tnlap"],
check=False,
universal_newlines=True,
capture_output=True,
)
if res.returncode:
log.info(f"netstat -tnlap failed with return code {res.returncode}")
log.info(f"netstat -tnlap stdout: \n{res.stdout}\n")
log.info(f"netstat -tnlap stderr: \n{res.stderr}\n")