diff --git a/pgxn/neon/communicator/src/worker_process/main_loop.rs b/pgxn/neon/communicator/src/worker_process/main_loop.rs index 2b19024038..8abde953ce 100644 --- a/pgxn/neon/communicator/src/worker_process/main_loop.rs +++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs @@ -214,8 +214,23 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> { // Spawn a separate task for every request. That's a little excessive for requests that // can be quickly satisfied from the cache, but we expect that to be rare, because the // requesting backend would have already checked the cache. - tokio::spawn(async { - let result = self.handle_request(slot.get_request()).await; + tokio::spawn(async move { + let request_id = slot.get_request().request_id(); + trace!("spawned task to process request {request_id} at slot {request_idx}"); + + // FIXME: as a temporary hack, abort request if we don't get a response promptly. + // Lots of regression tests are getting stuck and failing at the moment, + // this makes them fail a little faster, which it faster to iterate. + // This needs to be removed once more regression tests are passing. + // See also similar hack in the backend code, in wait_request_completion() + let result = tokio::time::timeout( + tokio::time::Duration::from_secs(30), + self.handle_request(slot.get_request()) + ).await.unwrap_or_else(|_elapsed| { + info!("request {request_id} timed out"); + NeonIOResult::Error(libc::ETIMEDOUT) + }); + trace!("request {request_id} at slot {request_idx} completed"); let owner_procno = slot.get_owner_procno(); // Ok, we have completed the IO. Mark the request as completed. After that, diff --git a/pgxn/neon/communicator_new.c b/pgxn/neon/communicator_new.c index bd8166f721..af6f6b18a4 100644 --- a/pgxn/neon/communicator_new.c +++ b/pgxn/neon/communicator_new.c @@ -571,9 +571,10 @@ start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p) if (request_idx == -1) { /* -1 means the request was satisfied immediately. */ + elog(LOG, "communicator request %lu was satisfied immediately", request->rel_exists.request_id); return -1; } - elog(DEBUG5, "sent request with idx %d: tag %d", request_idx, request->tag); + elog(LOG, "started communicator request %lu at slot %d", request->rel_exists.request_id, request_idx); return request_idx; } @@ -611,9 +612,9 @@ wait_request_completion(int request_idx, struct NeonIOResult *result_p) * This needs to be removed once more regression tests are passing. */ now = GetCurrentTimestamp(); - if (now - start_time > 30 * 1000 * 1000) + if (now - start_time > 60 * 1000 * 1000) { - elog(PANIC, "timed out waiting for response from communicator process"); + elog(PANIC, "timed out waiting for response from communicator process at slot %d", request_idx); } (void) WaitLatch(MyIOCompletionLatch,