Don't panic if spawning a thread to handle a connection fails.

Log the error and continue. Hopefully it's a transient failure.

This might have been happening in staging earlier, when the safekeeper
had a problem where it opened connections very frequently to issue
"callmemaybe" commands. If you launch too many threads too fast, you might
run out of file descriptors or something. It's not totally clear what
happened, but with this commit, at least the page server will continue to run
and accept new connections if a transient error happens.
This commit is contained in:
Heikki Linnakangas
2022-01-14 18:02:30 +02:00
parent adb0b3dada
commit d29836d0d5

View File

@@ -194,19 +194,25 @@ pub fn thread_main(
while !tenant_mgr::shutdown_requested() {
let (socket, peer_addr) = listener.accept()?;
debug!("accepted connection from {}", peer_addr);
socket.set_nodelay(true).unwrap();
let local_auth = auth.clone();
let handle = thread::Builder::new()
match thread::Builder::new()
.name("serving Page Service thread".into())
.spawn(move || {
if let Err(err) = page_service_conn_main(conf, local_auth, socket, auth_type) {
error!("page server thread exited with error: {:?}", err);
}
})
.unwrap();
join_handles.push(handle);
}) {
Ok(handle) => {
// FIXME: There is no mechanism to remove the handle from this list
// when a thread exits
join_handles.push(handle);
}
Err(err) => {
// Thread creation failed. Log the error and continue.
error!(%err, "could not spawn page service thread");
}
}
}
debug!("page_service loop terminated. wait for connections to cancel");
@@ -232,6 +238,10 @@ fn page_service_conn_main(
gauge.dec();
}
socket
.set_nodelay(true)
.context("could not set TCP_NODELAY")?;
let mut conn_handler = PageServerHandler::new(conf, auth);
let pgbackend = PostgresBackend::new(socket, auth_type, None, true)?;
pgbackend.run(&mut conn_handler)