Add a temporary timeout to handling an IO request in the communicator

It's nicer to time out in the communicator and return an error to the backend than to PANIC the backend.
2026-05-27 01:50:38 +00:00 · 2025-07-04 15:57:52 +03:00
parent 5f2d476a58
commit bd46dd60a0
2 changed files with 21 additions and 5 deletions
--- a/pgxn/neon/communicator/src/worker_process/main_loop.rs
+++ b/pgxn/neon/communicator/src/worker_process/main_loop.rs
@@ -214,8 +214,23 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
            // Spawn a separate task for every request. That's a little excessive for requests that
            // can be quickly satisfied from the cache, but we expect that to be rare, because the
            // requesting backend would have already checked the cache.
-            tokio::spawn(async {
-                let result = self.handle_request(slot.get_request()).await;
+            tokio::spawn(async move {
+                let request_id = slot.get_request().request_id();
+                trace!("spawned task to process request {request_id} at slot {request_idx}");
+
+		// FIXME: as a temporary hack, abort request if we don't get a response promptly.
+		// Lots of regression tests are getting stuck and failing at the moment,
+		// this makes them fail a little faster, which makes it faster to iterate.
+		// This needs to be removed once more regression tests are passing.
+                // See also similar hack in the backend code, in wait_request_completion()
+                let result = tokio::time::timeout(
+                    tokio::time::Duration::from_secs(30),
+                    self.handle_request(slot.get_request())
+                ).await.unwrap_or_else(|_elapsed| {
+                    info!("request {request_id} timed out");
+                    NeonIOResult::Error(libc::ETIMEDOUT)
+                });
+                trace!("request {request_id} at slot {request_idx} completed");
                let owner_procno = slot.get_owner_procno();

                // Ok, we have completed the IO. Mark the request as completed. After that,
--- a/pgxn/neon/communicator_new.c
+++ b/pgxn/neon/communicator_new.c
@@ -571,9 +571,10 @@ start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p)
 	if (request_idx == -1)
 	{
 		/* -1 means the request was satisfied immediately. */
+		elog(LOG, "communicator request %lu was satisfied immediately", request->rel_exists.request_id);
 		return -1;
 	}
-	elog(DEBUG5, "sent request with idx %d: tag %d", request_idx, request->tag);
+	elog(LOG, "started communicator request %lu at slot %d", request->rel_exists.request_id, request_idx);
 	return request_idx;
 }

@@ -611,9 +612,9 @@ wait_request_completion(int request_idx, struct NeonIOResult *result_p)
 			 * This needs to be removed once more regression tests are passing.
 			 */
 			now = GetCurrentTimestamp();
-			if (now - start_time > 30 * 1000 * 1000)
+			if (now - start_time > 60 * 1000 * 1000)
 			{
-				elog(PANIC, "timed out waiting for response from communicator process");
+				elog(PANIC, "timed out waiting for response from communicator process at slot %d", request_idx);
 			}

 			(void) WaitLatch(MyIOCompletionLatch,