From 606caa0c5d9263b804b1fd7df0ed81f9cf194909 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Fri, 4 Aug 2023 10:37:18 +0100
Subject: [PATCH] proxy: update logs and span data to be consistent and have
 more info (#4878)

## Problem

Prerequisites for #4852 and #4853

## Summary of changes

1. Includes the client's IP address (which we already log) in the span info, so it is attached to all associated logs. This makes it easier to build dashboards based on IP addresses.
2. Switches to a consistent error/warning log for errors during connection. Each entry includes `error`, `num_retries`, `retriable=true/false`, and a consistent log message that we can grep for.

---
 proxy/src/auth/backend/classic.rs | 16 +++++++++++-----
 proxy/src/compute.rs              | 17 ++---------------
 proxy/src/proxy.rs                | 30 +++++++++++++++++-------------
 3 files changed, 30 insertions(+), 33 deletions(-)

diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index 5ed0f747c2..d8ee1e8b64 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -10,7 +10,7 @@ use crate::{
     stream::PqStream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;
+use tracing::{error, info, warn};
 
 pub(super) async fn authenticate(
     api: &impl console::Api,
@@ -55,11 +55,17 @@ pub(super) async fn authenticate(
     let mut num_retries = 0;
     let mut node = loop {
         let wake_res = api.wake_compute(extra, creds).await;
-        match handle_try_wake(wake_res, num_retries)? {
-            ControlFlow::Continue(_) => num_retries += 1,
-            ControlFlow::Break(n) => break n,
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+                num_retries += 1;
+            }
+            Ok(ControlFlow::Break(n)) => break n,
         }
-        info!(num_retries, "retrying wake compute");
     };
     if let Some(keys) = scram_keys {
         use tokio_postgres::config::AuthKeys;
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index b1cf2a8559..e96b79ed92 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -230,7 +230,8 @@ pub struct PostgresConnection {
 }
 
 impl ConnCfg {
-    async fn do_connect(
+    /// Connect to a corresponding compute node.
+    pub async fn connect(
         &self,
         allow_self_signed_compute: bool,
         timeout: Duration,
@@ -270,20 +271,6 @@
 
         Ok(connection)
     }
-
-    /// Connect to a corresponding compute node.
-    pub async fn connect(
-        &self,
-        allow_self_signed_compute: bool,
-        timeout: Duration,
-    ) -> Result {
-        self.do_connect(allow_self_signed_compute, timeout)
-            .inspect_err(|err| {
-                // Immediately log the error we have at our disposal.
-                error!("couldn't connect to compute node: {err}");
-            })
-            .await
-    }
 }
 
 /// Retrieve `options` from a startup message, dropping all proxy-secific flags.
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index 5f59957b2d..e6fcf901d9 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -23,7 +23,7 @@ use tokio::{
     time,
 };
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, warn};
+use tracing::{error, info, info_span, warn, Instrument};
 use utils::measured_stream::MeasuredStream;
 
 /// Number of times we should retry the `/proxy_wake_compute` http request.
@@ -101,21 +101,20 @@ pub async fn task_main(
         tokio::select! {
             accept_result = listener.accept() => {
                 let (socket, peer_addr) = accept_result?;
-                info!("accepted postgres client connection from {peer_addr}");
 
                 let session_id = uuid::Uuid::new_v4();
                 let cancel_map = Arc::clone(&cancel_map);
                 connections.spawn(
                     async move {
-                        info!("spawned a task for {peer_addr}");
+                        info!("accepted postgres client connection");
 
                         socket
                             .set_nodelay(true)
                             .context("failed to set socket option")?;
 
-                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp)
-                            .await
+                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await
                     }
+                    .instrument(info_span!("handle_client", ?session_id, %peer_addr))
                     .unwrap_or_else(move |e| {
                         // Acknowledge that the task has finished with an error.
                         error!(?session_id, "per-client task finished with an error: {e:#}");
@@ -183,7 +182,6 @@ impl ClientMode {
     }
 }
 
-#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 pub async fn handle_client(
     config: &'static ProxyConfig,
     cancel_map: &CancelMap,
@@ -425,11 +423,17 @@ where
             auth::BackendType::Test(x) => x.wake_compute(),
         };
 
-        match handle_try_wake(wake_res, num_retries)? {
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
             // failed to wake up but we can continue to retry
-            ControlFlow::Continue(_) => {}
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
             // successfully woke up a compute node and can break the wakeup loop
-            ControlFlow::Break(mut node_info) => {
+            Ok(ControlFlow::Break(mut node_info)) => {
                 node_info.config.reuse_password(&config);
                 mechanism.update_connect_config(&mut node_info.config);
                 break node_info;
@@ -440,7 +444,6 @@
 
         num_retries += 1;
         time::sleep(wait_duration).await;
-        info!(num_retries, "retrying wake compute");
     };
 
     // now that we have a new node, try connect to it repeatedly.
@@ -451,10 +454,12 @@
 
         match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
             Ok(res) => return Ok(res),
             Err(e) => {
-                error!(error = ?e, "could not connect to compute node");
-                if !e.should_retry(num_retries) {
+                let retriable = e.should_retry(num_retries);
+                if !retriable {
+                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
                     return Err(e.into());
                 }
+                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
             }
         }
@@ -462,7 +467,6 @@
 
         num_retries += 1;
        time::sleep(wait_duration).await;
-        info!(num_retries, "retrying connect_once");
     }
 }
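
For reference, a minimal sketch of the span pattern that item 1 of the summary relies on: wrap the per-client future in an `info_span!` via `tracing::Instrument`, so every event logged inside it automatically carries `session_id` and `peer_addr`. The `handle_session` and `accept_one` names, the hard-coded address, and the `main` scaffolding are illustrative stand-ins, not the proxy's code; it assumes `tokio`, `tracing`, `tracing-subscriber`, and `uuid` (with the `v4` feature) as dependencies.

```rust
use std::net::SocketAddr;
use tracing::{info, info_span, Instrument};
use uuid::Uuid;

// Stand-in for the per-client work. It never mentions the peer address,
// yet every event it logs carries the enclosing span's fields.
async fn handle_session() {
    info!("accepted postgres client connection");
}

async fn accept_one(peer_addr: SocketAddr) {
    let session_id = Uuid::new_v4();
    // `?session_id` records the field via its Debug impl, `%peer_addr` via Display.
    handle_session()
        .instrument(info_span!("handle_client", ?session_id, %peer_addr))
        .await;
}

#[tokio::main]
async fn main() {
    tracing_subscriber::fmt::init();
    accept_one("127.0.0.1:50432".parse().unwrap()).await;
}
```

With the default `fmt` subscriber this renders roughly as `handle_client{session_id=... peer_addr=127.0.0.1:50432}: accepted postgres client connection`, which is what makes IP-based dashboards cheap: the address rides along on every log line, not just the accept log.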
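
Similarly, a sketch of the retry-logging shape from item 2, under loudly labeled assumptions: `WakeError`, the `u32` standing in for the woken node, and the three-retry cutoff are all made up for the sketch; only the `ControlFlow` plumbing and the log fields (`error`, `num_retries`, `retriable`) mirror the patch.

```rust
use std::ops::ControlFlow;
use tracing::{error, warn};

// Hypothetical error type standing in for the proxy's wake-compute error.
#[derive(Debug)]
struct WakeError(&'static str);

// Mirrors the shape of `handle_try_wake` in the patch: `Break` carries the
// woken node, `Continue` a retriable error, `Err` a fatal one. The cutoff
// of three retries is invented for this sketch.
fn handle_try_wake(
    res: Result<u32, WakeError>,
    num_retries: u32,
) -> Result<ControlFlow<u32, WakeError>, WakeError> {
    match res {
        Ok(node) => Ok(ControlFlow::Break(node)),
        Err(e) if num_retries < 3 => Ok(ControlFlow::Continue(e)),
        Err(e) => Err(e),
    }
}

fn wake_with_retries(
    mut wake: impl FnMut() -> Result<u32, WakeError>,
) -> Result<u32, WakeError> {
    let mut num_retries = 0;
    loop {
        match handle_try_wake(wake(), num_retries) {
            // Fatal: a single `error!` with `retriable = false`, then bail.
            Err(e) => {
                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
                return Err(e);
            }
            // Transient: a `warn!` with `retriable = true`, then try again.
            Ok(ControlFlow::Continue(e)) => {
                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
                num_retries += 1;
            }
            Ok(ControlFlow::Break(node)) => return Ok(node),
        }
    }
}

fn main() {
    tracing_subscriber::fmt::init();
    let mut attempts = vec![Err(WakeError("not ready yet")), Ok(42)].into_iter();
    println!("{:?}", wake_with_retries(|| attempts.next().unwrap()));
}
```

Grepping for `couldn't wake compute node` now returns the retried attempts and the final failure alike, with `retriable=true/false` distinguishing them.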