From 40a68e907753b6813d00d8fd1266601c7e929132 Mon Sep 17 00:00:00 2001 From: Alexey Kondratov Date: Tue, 11 Apr 2023 15:05:35 +0200 Subject: [PATCH] [compute_ctl] Add timeout for `tracing_utils::shutdown_tracing()` (#3982) Shutting down OTEL tracing provider may hang for quite some time, see, for example: - https://github.com/open-telemetry/opentelemetry-rust/issues/868 - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636 Yet, we want computes to shut down fast enough, as we may need a new one for the same timeline ASAP. So wait no longer than 2s for the shutdown to complete, then just error out and exit the main thread. Related to neondatabase/cloud#3707 --- compute_tools/src/bin/compute_ctl.rs | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/compute_tools/src/bin/compute_ctl.rs b/compute_tools/src/bin/compute_ctl.rs index d61eae5f7a..bce860b56b 100644 --- a/compute_tools/src/bin/compute_ctl.rs +++ b/compute_tools/src/bin/compute_ctl.rs @@ -34,7 +34,7 @@ use std::fs::File; use std::panic; use std::path::Path; use std::process::exit; -use std::sync::{Arc, Condvar, Mutex}; +use std::sync::{mpsc, Arc, Condvar, Mutex}; use std::{thread, time::Duration}; use anyhow::{Context, Result}; @@ -239,10 +239,25 @@ fn main() -> Result<()> { thread::sleep(Duration::from_secs(30)); } - info!("shutting down tracing"); // Shutdown trace pipeline gracefully, so that it has a chance to send any - // pending traces before we exit. - tracing_utils::shutdown_tracing(); + // pending traces before we exit. Shutting down OTEL tracing provider may + // hang for quite some time, see, for example: + // - https://github.com/open-telemetry/opentelemetry-rust/issues/868 + // - and our problems with staging https://github.com/neondatabase/cloud/issues/3707#issuecomment-1493983636 + // + // Yet, we want computes to shut down fast enough, as we may need a new one + // for the same timeline ASAP. 
So wait no longer than 2s for the shutdown to + // complete, then just log an error and proceed with the process exit. + info!("shutting down tracing"); + let (sender, receiver) = mpsc::channel(); + let _ = thread::spawn(move || { + tracing_utils::shutdown_tracing(); + sender.send(()).ok() + }); + let shutdown_res = receiver.recv_timeout(Duration::from_millis(2000)); + if shutdown_res.is_err() { + error!("timed out while shutting down tracing, exiting anyway"); + } info!("shutting down"); exit(exit_code.unwrap_or(1))