proxy: update logs and span data to be consistent and have more info (#4878)

## Problem

Prerequisites for #4852 and #4853

## Summary of changes

1. Include the client's IP address (which we already log) in the span info so that it is
present on all associated logs. This makes it easier to build dashboards based on IP
addresses.
2. Switch to a consistent error/warning log for failures during connection. It carries
`error`, `num_retries`, and `retriable=true/false` fields plus a fixed message that we
can grep for (see the sketch after this list).
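
Taken together, the pattern ends up looking roughly like the sketch below. This is a minimal, self-contained illustration (assuming the `tracing` and `uuid` crates), not the proxy's real code: `WakeComputeError`, its retry policy, and `handle_client_sketch` are made up for the example; only the span fields and the log message/fields mirror what this PR standardizes.

```rust
use std::net::SocketAddr;

use tracing::{error, info_span, warn, Instrument};

// Hypothetical stand-in for the proxy's wake/connect error types.
#[derive(Debug)]
struct WakeComputeError;

impl WakeComputeError {
    // Illustrative retry policy: allow a bounded number of retries.
    fn should_retry(&self, num_retries: u32) -> bool {
        num_retries < 4
    }
}

// The span carries session_id and peer_addr, so every event logged inside it
// (including the error/warn lines below) inherits those fields.
async fn handle_client_sketch(session_id: uuid::Uuid, peer_addr: SocketAddr) {
    async move {
        let num_retries = 0u32;
        let e = WakeComputeError;

        // One fixed, greppable message; the outcome lives in structured fields.
        let retriable = e.should_retry(num_retries);
        if retriable {
            warn!(error = ?e, num_retries, retriable, "couldn't wake compute node");
        } else {
            error!(error = ?e, num_retries, retriable, "couldn't wake compute node");
        }
    }
    .instrument(info_span!("handle_client", ?session_id, %peer_addr))
    .await
}
```

Because `session_id` and `peer_addr` live on the span, every `error!`/`warn!` emitted inside it carries them automatically, and the fixed messages plus the `retriable` field are what dashboards and alerts can key on.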
Author: Conrad Ludgate
Committed: 2023-08-04 10:37:18 +01:00 (by GitHub)
Commit: 606caa0c5d (parent 6a906c68c9)

3 changed files with 30 additions and 33 deletions


```diff
@@ -10,7 +10,7 @@ use crate::{
     stream::PqStream,
 };
 use tokio::io::{AsyncRead, AsyncWrite};
-use tracing::info;
+use tracing::{error, info, warn};

 pub(super) async fn authenticate(
     api: &impl console::Api,
@@ -55,11 +55,17 @@ pub(super) async fn authenticate(
     let mut num_retries = 0;
     let mut node = loop {
         let wake_res = api.wake_compute(extra, creds).await;
-        match handle_try_wake(wake_res, num_retries)? {
-            ControlFlow::Continue(_) => num_retries += 1,
-            ControlFlow::Break(n) => break n,
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+                num_retries += 1;
+            }
+            Ok(ControlFlow::Break(n)) => break n,
         }
-        info!(num_retries, "retrying wake compute");
     };
     if let Some(keys) = scram_keys {
         use tokio_postgres::config::AuthKeys;
```


```diff
@@ -230,7 +230,8 @@ pub struct PostgresConnection {
 }

 impl ConnCfg {
-    async fn do_connect(
+    /// Connect to a corresponding compute node.
+    pub async fn connect(
         &self,
         allow_self_signed_compute: bool,
         timeout: Duration,
@@ -270,20 +271,6 @@ impl ConnCfg {
         Ok(connection)
     }
-
-    /// Connect to a corresponding compute node.
-    pub async fn connect(
-        &self,
-        allow_self_signed_compute: bool,
-        timeout: Duration,
-    ) -> Result<PostgresConnection, ConnectionError> {
-        self.do_connect(allow_self_signed_compute, timeout)
-            .inspect_err(|err| {
-                // Immediately log the error we have at our disposal.
-                error!("couldn't connect to compute node: {err}");
-            })
-            .await
-    }
 }

 /// Retrieve `options` from a startup message, dropping all proxy-secific flags.
```


```diff
@@ -23,7 +23,7 @@ use tokio::{
     time,
 };
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, warn};
+use tracing::{error, info, info_span, warn, Instrument};
 use utils::measured_stream::MeasuredStream;

 /// Number of times we should retry the `/proxy_wake_compute` http request.
@@ -101,21 +101,20 @@ pub async fn task_main(
         tokio::select! {
             accept_result = listener.accept() => {
                 let (socket, peer_addr) = accept_result?;
-                info!("accepted postgres client connection from {peer_addr}");

                 let session_id = uuid::Uuid::new_v4();
                 let cancel_map = Arc::clone(&cancel_map);
                 connections.spawn(
                     async move {
-                        info!("spawned a task for {peer_addr}");
+                        info!("accepted postgres client connection");

                         socket
                             .set_nodelay(true)
                             .context("failed to set socket option")?;

-                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp)
-                            .await
+                        handle_client(config, &cancel_map, session_id, socket, ClientMode::Tcp).await
                     }
+                    .instrument(info_span!("handle_client", ?session_id, %peer_addr))
                     .unwrap_or_else(move |e| {
                         // Acknowledge that the task has finished with an error.
                         error!(?session_id, "per-client task finished with an error: {e:#}");
@@ -183,7 +182,6 @@ impl ClientMode {
     }
 }

-#[tracing::instrument(fields(session_id = ?session_id), skip_all)]
 pub async fn handle_client<S: AsyncRead + AsyncWrite + Unpin>(
     config: &'static ProxyConfig,
     cancel_map: &CancelMap,
@@ -425,11 +423,17 @@
             auth::BackendType::Test(x) => x.wake_compute(),
         };

-        match handle_try_wake(wake_res, num_retries)? {
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                return Err(e.into());
+            }
             // failed to wake up but we can continue to retry
-            ControlFlow::Continue(_) => {}
+            Ok(ControlFlow::Continue(e)) => {
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
             // successfully woke up a compute node and can break the wakeup loop
-            ControlFlow::Break(mut node_info) => {
+            Ok(ControlFlow::Break(mut node_info)) => {
                 node_info.config.reuse_password(&config);
                 mechanism.update_connect_config(&mut node_info.config);
                 break node_info;
@@ -440,7 +444,6 @@ where

         num_retries += 1;
         time::sleep(wait_duration).await;
-        info!(num_retries, "retrying wake compute");
     };

     // now that we have a new node, try connect to it repeatedly.
@@ -451,10 +454,12 @@ where
         match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
             Ok(res) => return Ok(res),
             Err(e) => {
-                error!(error = ?e, "could not connect to compute node");
-                if !e.should_retry(num_retries) {
+                let retriable = e.should_retry(num_retries);
+                if !retriable {
+                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
                     return Err(e.into());
                 }
+                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
             }
         }

@@ -462,7 +467,6 @@ where

         num_retries += 1;
         time::sleep(wait_duration).await;
-        info!(num_retries, "retrying connect_once");
     }
 }
```