Add a temporary timeout to handling an IO request in the communicator

It's nicer to timeout in the communicator and return an error to the
backend, than PANIC the backend.
This commit is contained in:
Heikki Linnakangas
2025-07-04 15:57:52 +03:00
parent 5f2d476a58
commit bd46dd60a0
2 changed files with 21 additions and 5 deletions

View File

@@ -214,8 +214,23 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
// Spawn a separate task for every request. That's a little excessive for requests that
// can be quickly satisfied from the cache, but we expect that to be rare, because the
// requesting backend would have already checked the cache.
tokio::spawn(async {
let result = self.handle_request(slot.get_request()).await;
tokio::spawn(async move {
let request_id = slot.get_request().request_id();
trace!("spawned task to process request {request_id} at slot {request_idx}");
// FIXME: as a temporary hack, abort request if we don't get a response promptly.
// Lots of regression tests are getting stuck and failing at the moment,
// this makes them fail a little faster, which it faster to iterate.
// This needs to be removed once more regression tests are passing.
// See also similar hack in the backend code, in wait_request_completion()
let result = tokio::time::timeout(
tokio::time::Duration::from_secs(30),
self.handle_request(slot.get_request())
).await.unwrap_or_else(|_elapsed| {
info!("request {request_id} timed out");
NeonIOResult::Error(libc::ETIMEDOUT)
});
trace!("request {request_id} at slot {request_idx} completed");
let owner_procno = slot.get_owner_procno();
// Ok, we have completed the IO. Mark the request as completed. After that,

View File

@@ -571,9 +571,10 @@ start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p)
if (request_idx == -1)
{
/* -1 means the request was satisfied immediately. */
elog(LOG, "communicator request %lu was satisfied immediately", request->rel_exists.request_id);
return -1;
}
elog(DEBUG5, "sent request with idx %d: tag %d", request_idx, request->tag);
elog(LOG, "started communicator request %lu at slot %d", request->rel_exists.request_id, request_idx);
return request_idx;
}
@@ -611,9 +612,9 @@ wait_request_completion(int request_idx, struct NeonIOResult *result_p)
* This needs to be removed once more regression tests are passing.
*/
now = GetCurrentTimestamp();
if (now - start_time > 30 * 1000 * 1000)
if (now - start_time > 60 * 1000 * 1000)
{
elog(PANIC, "timed out waiting for response from communicator process");
elog(PANIC, "timed out waiting for response from communicator process at slot %d", request_idx);
}
(void) WaitLatch(MyIOCompletionLatch,