proxy: latency connect outcome (#5588)

## Problem

I recently updated the latency timers to include cache miss and pool
miss, as well as connection protocol. By moving the latency timer to
start before authentication, we count a lot more failures and it's
messed up the latency dashboard.

## Summary of changes

Add another label to LatencyTimer metrics for outcome. Explicitly report
on success
This commit is contained in:
Conrad Ludgate
2023-10-23 15:17:28 +01:00
committed by GitHub
parent b514da90cb
commit 94b4e76e13
2 changed files with 24 additions and 7 deletions

View File

@@ -194,9 +194,10 @@ impl GlobalConnPool {
info!("pool: cached connection '{conn_info}' is closed, opening a new one");
connect_to_compute(self.proxy_config, conn_info, session_id, latency_timer).await
} else {
latency_timer.pool_hit();
info!("pool: reusing connection '{conn_info}'");
client.session.send(session_id)?;
latency_timer.pool_hit();
latency_timer.success();
return Ok(Client {
inner: Some(client),
span: Span::current(),

View File

@@ -96,7 +96,9 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
register_histogram_vec!(
"proxy_compute_connection_latency_seconds",
"Time it took for proxy to establish a connection to the compute endpoint",
&["protocol", "cache_miss", "pool_miss"],
// http/ws/tcp, true/false, true/false, success/failure
// 3 * 2 * 2 * 2 = 24 counters
&["protocol", "cache_miss", "pool_miss", "outcome"],
// largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(),
)
@@ -105,19 +107,22 @@ static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
pub struct LatencyTimer {
start: Instant,
pool_miss: bool,
cache_miss: bool,
protocol: &'static str,
cache_miss: bool,
pool_miss: bool,
outcome: &'static str,
}
impl LatencyTimer {
pub fn new(protocol: &'static str) -> Self {
Self {
start: Instant::now(),
protocol,
cache_miss: false,
// by default we don't do pooling
pool_miss: true,
protocol,
// assume failed unless otherwise specified
outcome: "failed",
}
}
@@ -128,6 +133,10 @@ impl LatencyTimer {
pub fn pool_hit(&mut self) {
self.pool_miss = false;
}
pub fn success(mut self) {
self.outcome = "success";
}
}
impl Drop for LatencyTimer {
@@ -138,6 +147,7 @@ impl Drop for LatencyTimer {
self.protocol,
bool_to_str(self.cache_miss),
bool_to_str(self.pool_miss),
self.outcome,
])
.observe(duration)
}
@@ -547,7 +557,10 @@ where
// try once
let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Ok(res) => {
latency_timer.success();
return Ok(res);
}
Err(e) => {
error!(error = ?e, "could not connect to compute node");
(invalidate_cache(node_info), e)
@@ -601,7 +614,10 @@ where
info!("wake_compute success. attempting to connect");
loop {
match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
Ok(res) => return Ok(res),
Ok(res) => {
latency_timer.success();
return Ok(res);
}
Err(e) => {
let retriable = e.should_retry(num_retries);
if !retriable {