mirror of
https://github.com/neondatabase/neon.git
synced 2026-05-27 01:50:38 +00:00
Add a temporary timeout to handling an IO request in the communicator
It's nicer to timeout in the communicator and return an error to the backend, than PANIC the backend.
This commit is contained in:
@@ -214,8 +214,23 @@ impl<'t> CommunicatorWorkerProcessStruct<'t> {
|
||||
// Spawn a separate task for every request. That's a little excessive for requests that
|
||||
// can be quickly satisfied from the cache, but we expect that to be rare, because the
|
||||
// requesting backend would have already checked the cache.
|
||||
tokio::spawn(async {
|
||||
let result = self.handle_request(slot.get_request()).await;
|
||||
tokio::spawn(async move {
|
||||
let request_id = slot.get_request().request_id();
|
||||
trace!("spawned task to process request {request_id} at slot {request_idx}");
|
||||
|
||||
// FIXME: as a temporary hack, abort request if we don't get a response promptly.
|
||||
// Lots of regression tests are getting stuck and failing at the moment,
|
||||
// this makes them fail a little faster, which it faster to iterate.
|
||||
// This needs to be removed once more regression tests are passing.
|
||||
// See also similar hack in the backend code, in wait_request_completion()
|
||||
let result = tokio::time::timeout(
|
||||
tokio::time::Duration::from_secs(30),
|
||||
self.handle_request(slot.get_request())
|
||||
).await.unwrap_or_else(|_elapsed| {
|
||||
info!("request {request_id} timed out");
|
||||
NeonIOResult::Error(libc::ETIMEDOUT)
|
||||
});
|
||||
trace!("request {request_id} at slot {request_idx} completed");
|
||||
let owner_procno = slot.get_owner_procno();
|
||||
|
||||
// Ok, we have completed the IO. Mark the request as completed. After that,
|
||||
|
||||
@@ -571,9 +571,10 @@ start_request(NeonIORequest *request, struct NeonIOResult *immediate_result_p)
|
||||
if (request_idx == -1)
|
||||
{
|
||||
/* -1 means the request was satisfied immediately. */
|
||||
elog(LOG, "communicator request %lu was satisfied immediately", request->rel_exists.request_id);
|
||||
return -1;
|
||||
}
|
||||
elog(DEBUG5, "sent request with idx %d: tag %d", request_idx, request->tag);
|
||||
elog(LOG, "started communicator request %lu at slot %d", request->rel_exists.request_id, request_idx);
|
||||
return request_idx;
|
||||
}
|
||||
|
||||
@@ -611,9 +612,9 @@ wait_request_completion(int request_idx, struct NeonIOResult *result_p)
|
||||
* This needs to be removed once more regression tests are passing.
|
||||
*/
|
||||
now = GetCurrentTimestamp();
|
||||
if (now - start_time > 30 * 1000 * 1000)
|
||||
if (now - start_time > 60 * 1000 * 1000)
|
||||
{
|
||||
elog(PANIC, "timed out waiting for response from communicator process");
|
||||
elog(PANIC, "timed out waiting for response from communicator process at slot %d", request_idx);
|
||||
}
|
||||
|
||||
(void) WaitLatch(MyIOCompletionLatch,
|
||||
|
||||
Reference in New Issue
Block a user