proxy: Exclude compute and retries (#7529)

## Problem

Alerts fire if the connection the compute is slow.

## Summary of changes

Exclude compute and retry from latencies.
This commit is contained in:
Anna Khanova
2024-04-29 11:49:42 +02:00
committed by GitHub
parent 84914434e3
commit 24ce878039
5 changed files with 47 additions and 0 deletions

View File

@@ -260,7 +260,9 @@ impl ConnCfg {
aux: MetricsAuxInfo,
timeout: Duration,
) -> Result<PostgresConnection, ConnectionError> {
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (socket_addr, stream, host) = self.connect_raw(timeout).await?;
drop(pause);
let tls_connector = native_tls::TlsConnector::builder()
.danger_accept_invalid_certs(allow_self_signed_compute)
@@ -270,7 +272,9 @@ impl ConnCfg {
let tls = MakeTlsConnect::<tokio::net::TcpStream>::make_tls_connect(&mut mk_tls, host)?;
// connect_raw() will not use TLS if sslmode is "disable"
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (client, connection) = self.0.connect_raw(stream, tls).await?;
drop(pause);
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
let stream = connection.stream.into_inner();

View File

@@ -284,6 +284,8 @@ pub struct ComputeConnectionLatencyGroup {
pub enum LatencyExclusions {
Client,
ClientAndCplane,
ClientCplaneCompute,
ClientCplaneComputeRetry,
}
#[derive(FixedCardinalityLabel, Copy, Clone)]
@@ -352,6 +354,7 @@ pub enum Waiting {
Cplane,
Client,
Compute,
RetryTimeout,
}
#[derive(Default)]
@@ -359,6 +362,7 @@ struct Accumulated {
cplane: time::Duration,
client: time::Duration,
compute: time::Duration,
retry: time::Duration,
}
pub struct LatencyTimer {
@@ -421,6 +425,7 @@ impl Drop for LatencyTimerPause<'_> {
Waiting::Cplane => self.timer.accumulated.cplane += dur,
Waiting::Client => self.timer.accumulated.client += dur,
Waiting::Compute => self.timer.accumulated.compute += dur,
Waiting::RetryTimeout => self.timer.accumulated.retry += dur,
}
}
}
@@ -464,6 +469,34 @@ impl Drop for LatencyTimer {
},
duration.saturating_sub(accumulated_total).as_secs_f64(),
);
// Exclude client cplane, compue communication from the accumulated time.
let accumulated_total =
self.accumulated.client + self.accumulated.cplane + self.accumulated.compute;
metric.observe(
ComputeConnectionLatencyGroup {
protocol: self.protocol,
cold_start_info: self.cold_start_info,
outcome: self.outcome,
excluded: LatencyExclusions::ClientCplaneCompute,
},
duration.saturating_sub(accumulated_total).as_secs_f64(),
);
// Exclude client cplane, compue, retry communication from the accumulated time.
let accumulated_total = self.accumulated.client
+ self.accumulated.cplane
+ self.accumulated.compute
+ self.accumulated.retry;
metric.observe(
ComputeConnectionLatencyGroup {
protocol: self.protocol,
cold_start_info: self.cold_start_info,
outcome: self.outcome,
excluded: LatencyExclusions::ClientCplaneComputeRetry,
},
duration.saturating_sub(accumulated_total).as_secs_f64(),
);
}
}

View File

@@ -194,6 +194,10 @@ where
let wait_duration = retry_after(num_retries, connect_to_compute_retry_config);
num_retries += 1;
let pause = ctx
.latency_timer
.pause(crate::metrics::Waiting::RetryTimeout);
time::sleep(wait_duration).await;
drop(pause);
}
}

View File

@@ -54,7 +54,11 @@ pub async fn wake_compute<B: ComputeConnectBackend>(
let wait_duration = retry_after(*num_retries, config);
*num_retries += 1;
let pause = ctx
.latency_timer
.pause(crate::metrics::Waiting::RetryTimeout);
tokio::time::sleep(wait_duration).await;
drop(pause);
}
}

View File

@@ -179,7 +179,9 @@ impl ConnectMechanism for TokioMechanism {
.dbname(&self.conn_info.dbname)
.connect_timeout(timeout);
let pause = ctx.latency_timer.pause(crate::metrics::Waiting::Compute);
let (client, connection) = config.connect(tokio_postgres::NoTls).await?;
drop(pause);
tracing::Span::current().record("pid", &tracing::field::display(client.get_process_id()));
Ok(poll_client(