From 2ebd2ce2b6dbc143d2ec4715d3d4caa2cfb5d640 Mon Sep 17 00:00:00 2001 From: Conrad Ludgate Date: Tue, 25 Jul 2023 16:57:42 +0100 Subject: [PATCH] proxy: record connection type (#4802) ## Problem We want to measure how many users are using TCP/WS connections. We also want to measure how long it takes to establish a connection with the compute node. I plan to also add a separate counter for HTTP requests, but because of pooling this needs to be disambiguated against new HTTP compute connections ## Summary of changes * record connection type (ws/tcp) in the connection counters. * record connection latency including retry latency --- proxy/src/proxy.rs | 43 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs index d317d382a7..2cdd1582ac 100644 --- a/proxy/src/proxy.rs +++ b/proxy/src/proxy.rs @@ -17,7 +17,9 @@ use anyhow::{bail, Context}; use async_trait::async_trait; use futures::TryFutureExt; use hyper::StatusCode; -use metrics::{register_int_counter, register_int_counter_vec, IntCounter, IntCounterVec}; +use metrics::{ + exponential_buckets, register_histogram, register_int_counter_vec, Histogram, IntCounterVec, +}; use once_cell::sync::Lazy; use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams}; use std::{error::Error, io, ops::ControlFlow, sync::Arc}; @@ -38,18 +40,30 @@ const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(100 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)"; const ERR_PROTO_VIOLATION: &str = "protocol violation"; -static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter!( +static NUM_CONNECTIONS_ACCEPTED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( "proxy_accepted_connections_total", - "Number of TCP client connections accepted." 
+ "Number of TCP client connections accepted.", + &["protocol"], ) .unwrap() }); -static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { - register_int_counter!( +static NUM_CONNECTIONS_CLOSED_COUNTER: Lazy = Lazy::new(|| { + register_int_counter_vec!( "proxy_closed_connections_total", - "Number of TCP client connections closed." + "Number of TCP client connections closed.", + &["protocol"], + ) + .unwrap() +}); + +static COMPUTE_CONNECTION_LATENCY: Lazy = Lazy::new(|| { + register_histogram!( + "proxy_compute_connection_latency_seconds", + "Time it took for proxy to establish a connection to the compute endpoint", + // 16 buckets starting at 0.5ms; largest finite bucket = 2^15 * 0.5ms ~= 16.4s + exponential_buckets(0.0005, 2.0, 16).unwrap(), ) .unwrap() }); @@ -137,6 +151,13 @@ pub enum ClientMode { /// Abstracts the logic of handling TCP vs WS clients impl ClientMode { + fn protocol_label(&self) -> &'static str { + match self { + ClientMode::Tcp => "tcp", + ClientMode::Websockets { .. } => "ws", + } + } + fn allow_cleartext(&self) -> bool { match self { ClientMode::Tcp => false, @@ -176,9 +197,11 @@ pub async fn handle_client( mode: ClientMode, ) -> anyhow::Result<()> { // The `closed` counter will increase when this future is destroyed. - NUM_CONNECTIONS_ACCEPTED_COUNTER.inc(); + NUM_CONNECTIONS_ACCEPTED_COUNTER + .with_label_values(&[mode.protocol_label()]) + .inc(); scopeguard::defer! { - NUM_CONNECTIONS_CLOSED_COUNTER.inc(); + NUM_CONNECTIONS_CLOSED_COUNTER.with_label_values(&[mode.protocol_label()]).inc(); } let tls = config.tls_config.as_ref(); @@ -380,6 +403,8 @@ where M::ConnectError: ShouldRetry + std::fmt::Debug, M::Error: From, { + let _timer = COMPUTE_CONNECTION_LATENCY.start_timer(); + mechanism.update_connect_config(&mut node_info.config); let mut num_retries = 0;