proxy: report metrics based on cold start info (#7324)

## Problem

Would be nice to have a bit more info on cold start metrics.

## Summary of changes

* Change connect compute latency to include `cold_start_info`.
* Update `ColdStartInfo` to include HttpPoolHit and WarmCached.
* Several changes to make more use of interned strings
This commit is contained in:
Conrad Ludgate
2024-04-05 16:14:50 +01:00
committed by GitHub
parent 0fa517eb80
commit 55da8eff4f
17 changed files with 274 additions and 160 deletions

View File

@@ -102,8 +102,7 @@ pub(super) async fn authenticate(
ctx.set_user(db_info.user.into()); ctx.set_user(db_info.user.into());
ctx.set_project(db_info.aux.clone()); ctx.set_project(db_info.aux.clone());
let cold_start_info = db_info.aux.cold_start_info.clone().unwrap_or_default(); info!("woken up a compute node");
info!(?cold_start_info, "woken up a compute node");
// Backwards compatibility. pg_sni_proxy uses "--" in domain names // Backwards compatibility. pg_sni_proxy uses "--" in domain names
// while direct connections do not. Once we migrate to pg_sni_proxy // while direct connections do not. Once we migrate to pg_sni_proxy

View File

@@ -10,6 +10,7 @@ use itertools::Itertools;
use proxy::config::TlsServerEndPoint; use proxy::config::TlsServerEndPoint;
use proxy::context::RequestMonitoring; use proxy::context::RequestMonitoring;
use proxy::proxy::run_until_cancelled; use proxy::proxy::run_until_cancelled;
use proxy::{BranchId, EndpointId, ProjectId};
use rustls::pki_types::PrivateKeyDer; use rustls::pki_types::PrivateKeyDer;
use tokio::net::TcpListener; use tokio::net::TcpListener;
@@ -269,7 +270,12 @@ async fn handle_client(
let client = tokio::net::TcpStream::connect(destination).await?; let client = tokio::net::TcpStream::connect(destination).await?;
let metrics_aux: MetricsAuxInfo = Default::default(); let metrics_aux: MetricsAuxInfo = MetricsAuxInfo {
endpoint_id: (&EndpointId::from("")).into(),
project_id: (&ProjectId::from("")).into(),
branch_id: (&BranchId::from("")).into(),
cold_start_info: proxy::console::messages::ColdStartInfo::Unknown,
};
// doesn't yet matter as pg-sni-router doesn't report analytics logs // doesn't yet matter as pg-sni-router doesn't report analytics logs
ctx.set_success(); ctx.set_success();

View File

@@ -16,7 +16,7 @@ use crate::{
config::ProjectInfoCacheOptions, config::ProjectInfoCacheOptions,
console::AuthSecret, console::AuthSecret,
intern::{EndpointIdInt, ProjectIdInt, RoleNameInt}, intern::{EndpointIdInt, ProjectIdInt, RoleNameInt},
EndpointId, ProjectId, RoleName, EndpointId, RoleName,
}; };
use super::{Cache, Cached}; use super::{Cache, Cached};
@@ -214,14 +214,11 @@ impl ProjectInfoCacheImpl {
} }
pub fn insert_role_secret( pub fn insert_role_secret(
&self, &self,
project_id: &ProjectId, project_id: ProjectIdInt,
endpoint_id: &EndpointId, endpoint_id: EndpointIdInt,
role_name: &RoleName, role_name: RoleNameInt,
secret: Option<AuthSecret>, secret: Option<AuthSecret>,
) { ) {
let project_id = ProjectIdInt::from(project_id);
let endpoint_id = EndpointIdInt::from(endpoint_id);
let role_name = RoleNameInt::from(role_name);
if self.cache.len() >= self.config.size { if self.cache.len() >= self.config.size {
// If there are too many entries, wait until the next gc cycle. // If there are too many entries, wait until the next gc cycle.
return; return;
@@ -234,12 +231,10 @@ impl ProjectInfoCacheImpl {
} }
pub fn insert_allowed_ips( pub fn insert_allowed_ips(
&self, &self,
project_id: &ProjectId, project_id: ProjectIdInt,
endpoint_id: &EndpointId, endpoint_id: EndpointIdInt,
allowed_ips: Arc<Vec<IpPattern>>, allowed_ips: Arc<Vec<IpPattern>>,
) { ) {
let project_id = ProjectIdInt::from(project_id);
let endpoint_id = EndpointIdInt::from(endpoint_id);
if self.cache.len() >= self.config.size { if self.cache.len() >= self.config.size {
// If there are too many entries, wait until the next gc cycle. // If there are too many entries, wait until the next gc cycle.
return; return;
@@ -358,7 +353,7 @@ impl Cache for ProjectInfoCacheImpl {
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::*; use super::*;
use crate::scram::ServerSecret; use crate::{scram::ServerSecret, ProjectId};
#[tokio::test] #[tokio::test]
async fn test_project_info_cache_settings() { async fn test_project_info_cache_settings() {
@@ -369,8 +364,8 @@ mod tests {
ttl: Duration::from_secs(1), ttl: Duration::from_secs(1),
gc_interval: Duration::from_secs(600), gc_interval: Duration::from_secs(600),
}); });
let project_id = "project".into(); let project_id: ProjectId = "project".into();
let endpoint_id = "endpoint".into(); let endpoint_id: EndpointId = "endpoint".into();
let user1: RoleName = "user1".into(); let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into(); let user2: RoleName = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -379,9 +374,23 @@ mod tests {
"127.0.0.1".parse().unwrap(), "127.0.0.1".parse().unwrap(),
"127.0.0.2".parse().unwrap(), "127.0.0.2".parse().unwrap(),
]); ]);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); cache.insert_role_secret(
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); (&project_id).into(),
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); (&endpoint_id).into(),
(&user1).into(),
secret1.clone(),
);
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user2).into(),
secret2.clone(),
);
cache.insert_allowed_ips(
(&project_id).into(),
(&endpoint_id).into(),
allowed_ips.clone(),
);
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
assert!(cached.cached()); assert!(cached.cached());
@@ -393,7 +402,12 @@ mod tests {
// Shouldn't add more than 2 roles. // Shouldn't add more than 2 roles.
let user3: RoleName = "user3".into(); let user3: RoleName = "user3".into();
let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32]))); let secret3 = Some(AuthSecret::Scram(ServerSecret::mock([3; 32])));
cache.insert_role_secret(&project_id, &endpoint_id, &user3, secret3.clone()); cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user3).into(),
secret3.clone(),
);
assert!(cache.get_role_secret(&endpoint_id, &user3).is_none()); assert!(cache.get_role_secret(&endpoint_id, &user3).is_none());
let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
@@ -421,8 +435,8 @@ mod tests {
cache.clone().disable_ttl(); cache.clone().disable_ttl();
tokio::time::advance(Duration::from_secs(2)).await; tokio::time::advance(Duration::from_secs(2)).await;
let project_id = "project".into(); let project_id: ProjectId = "project".into();
let endpoint_id = "endpoint".into(); let endpoint_id: EndpointId = "endpoint".into();
let user1: RoleName = "user1".into(); let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into(); let user2: RoleName = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -431,9 +445,23 @@ mod tests {
"127.0.0.1".parse().unwrap(), "127.0.0.1".parse().unwrap(),
"127.0.0.2".parse().unwrap(), "127.0.0.2".parse().unwrap(),
]); ]);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); cache.insert_role_secret(
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); (&project_id).into(),
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); (&endpoint_id).into(),
(&user1).into(),
secret1.clone(),
);
cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user2).into(),
secret2.clone(),
);
cache.insert_allowed_ips(
(&project_id).into(),
(&endpoint_id).into(),
allowed_ips.clone(),
);
tokio::time::advance(Duration::from_secs(2)).await; tokio::time::advance(Duration::from_secs(2)).await;
// Nothing should be invalidated. // Nothing should be invalidated.
@@ -470,8 +498,8 @@ mod tests {
gc_interval: Duration::from_secs(600), gc_interval: Duration::from_secs(600),
})); }));
let project_id = "project".into(); let project_id: ProjectId = "project".into();
let endpoint_id = "endpoint".into(); let endpoint_id: EndpointId = "endpoint".into();
let user1: RoleName = "user1".into(); let user1: RoleName = "user1".into();
let user2: RoleName = "user2".into(); let user2: RoleName = "user2".into();
let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32]))); let secret1 = Some(AuthSecret::Scram(ServerSecret::mock([1; 32])));
@@ -480,10 +508,20 @@ mod tests {
"127.0.0.1".parse().unwrap(), "127.0.0.1".parse().unwrap(),
"127.0.0.2".parse().unwrap(), "127.0.0.2".parse().unwrap(),
]); ]);
cache.insert_role_secret(&project_id, &endpoint_id, &user1, secret1.clone()); cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user1).into(),
secret1.clone(),
);
cache.clone().disable_ttl(); cache.clone().disable_ttl();
tokio::time::advance(Duration::from_millis(100)).await; tokio::time::advance(Duration::from_millis(100)).await;
cache.insert_role_secret(&project_id, &endpoint_id, &user2, secret2.clone()); cache.insert_role_secret(
(&project_id).into(),
(&endpoint_id).into(),
(&user2).into(),
secret2.clone(),
);
// Added before ttl was disabled + ttl should be still cached. // Added before ttl was disabled + ttl should be still cached.
let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap(); let cached = cache.get_role_secret(&endpoint_id, &user1).unwrap();
@@ -497,7 +535,11 @@ mod tests {
assert!(cache.get_role_secret(&endpoint_id, &user2).is_none()); assert!(cache.get_role_secret(&endpoint_id, &user2).is_none());
// Added after ttl was disabled + ttl should not be cached. // Added after ttl was disabled + ttl should not be cached.
cache.insert_allowed_ips(&project_id, &endpoint_id, allowed_ips.clone()); cache.insert_allowed_ips(
(&project_id).into(),
(&endpoint_id).into(),
allowed_ips.clone(),
);
let cached = cache.get_allowed_ips(&endpoint_id).unwrap(); let cached = cache.get_allowed_ips(&endpoint_id).unwrap();
assert!(!cached.cached()); assert!(!cached.cached());

View File

@@ -276,6 +276,7 @@ impl ConnCfg {
let stream = connection.stream.into_inner(); let stream = connection.stream.into_inner();
info!( info!(
cold_start_info = ctx.cold_start_info.as_str(),
"connected to compute node at {host} ({socket_addr}) sslmode={:?}", "connected to compute node at {host} ({socket_addr}) sslmode={:?}",
self.0.get_ssl_mode() self.0.get_ssl_mode()
); );

View File

@@ -3,7 +3,7 @@ use std::fmt;
use crate::auth::IpPattern; use crate::auth::IpPattern;
use crate::{BranchId, EndpointId, ProjectId}; use crate::intern::{BranchIdInt, EndpointIdInt, ProjectIdInt};
/// Generic error response with human-readable description. /// Generic error response with human-readable description.
/// Note that we can't always present it to user as is. /// Note that we can't always present it to user as is.
@@ -18,7 +18,7 @@ pub struct ConsoleError {
pub struct GetRoleSecret { pub struct GetRoleSecret {
pub role_secret: Box<str>, pub role_secret: Box<str>,
pub allowed_ips: Option<Vec<IpPattern>>, pub allowed_ips: Option<Vec<IpPattern>>,
pub project_id: Option<ProjectId>, pub project_id: Option<ProjectIdInt>,
} }
// Manually implement debug to omit sensitive info. // Manually implement debug to omit sensitive info.
@@ -93,22 +93,47 @@ impl fmt::Debug for DatabaseInfo {
/// Various labels for prometheus metrics. /// Various labels for prometheus metrics.
/// Also known as `ProxyMetricsAuxInfo` in the console. /// Also known as `ProxyMetricsAuxInfo` in the console.
#[derive(Debug, Deserialize, Clone, Default)] #[derive(Debug, Deserialize, Clone)]
pub struct MetricsAuxInfo { pub struct MetricsAuxInfo {
pub endpoint_id: EndpointId, pub endpoint_id: EndpointIdInt,
pub project_id: ProjectId, pub project_id: ProjectIdInt,
pub branch_id: BranchId, pub branch_id: BranchIdInt,
pub cold_start_info: Option<ColdStartInfo>, #[serde(default)]
pub cold_start_info: ColdStartInfo,
} }
#[derive(Debug, Default, Serialize, Deserialize, Clone)] #[derive(Debug, Default, Serialize, Deserialize, Clone, Copy)]
#[serde(rename_all = "snake_case")] #[serde(rename_all = "snake_case")]
pub enum ColdStartInfo { pub enum ColdStartInfo {
#[default] #[default]
Unknown = 0, Unknown,
Warm = 1, /// Compute was already running
PoolHit = 2, Warm,
PoolMiss = 3, #[serde(rename = "pool_hit")]
/// Compute was not running but there was an available VM
VmPoolHit,
#[serde(rename = "pool_miss")]
/// Compute was not running and there were no VMs available
VmPoolMiss,
// not provided by control plane
/// Connection available from HTTP pool
HttpPoolHit,
/// Cached connection info
WarmCached,
}
impl ColdStartInfo {
pub fn as_str(&self) -> &'static str {
match self {
ColdStartInfo::Unknown => "unknown",
ColdStartInfo::Warm => "warm",
ColdStartInfo::VmPoolHit => "pool_hit",
ColdStartInfo::VmPoolMiss => "pool_miss",
ColdStartInfo::HttpPoolHit => "http_pool_hit",
ColdStartInfo::WarmCached => "warm_cached",
}
}
} }
#[cfg(test)] #[cfg(test)]

View File

@@ -12,7 +12,8 @@ use crate::{
compute, compute,
config::{CacheOptions, ProjectInfoCacheOptions}, config::{CacheOptions, ProjectInfoCacheOptions},
context::RequestMonitoring, context::RequestMonitoring,
scram, EndpointCacheKey, ProjectId, intern::ProjectIdInt,
scram, EndpointCacheKey,
}; };
use dashmap::DashMap; use dashmap::DashMap;
use std::{sync::Arc, time::Duration}; use std::{sync::Arc, time::Duration};
@@ -271,7 +272,7 @@ pub struct AuthInfo {
/// List of IP addresses allowed for the authorization. /// List of IP addresses allowed for the authorization.
pub allowed_ips: Vec<IpPattern>, pub allowed_ips: Vec<IpPattern>,
/// Project ID. This is used for cache invalidation. /// Project ID. This is used for cache invalidation.
pub project_id: Option<ProjectId>, pub project_id: Option<ProjectIdInt>,
} }
/// Info for establishing a connection to a compute node. /// Info for establishing a connection to a compute node.

View File

@@ -4,10 +4,16 @@ use super::{
errors::{ApiError, GetAuthInfoError, WakeComputeError}, errors::{ApiError, GetAuthInfoError, WakeComputeError},
AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo, AuthInfo, AuthSecret, CachedNodeInfo, NodeInfo,
}; };
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret};
use crate::context::RequestMonitoring; use crate::context::RequestMonitoring;
use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl}; use crate::{auth::backend::ComputeUserInfo, compute, error::io_error, scram, url::ApiUrl};
use crate::{auth::IpPattern, cache::Cached}; use crate::{auth::IpPattern, cache::Cached};
use crate::{
console::{
messages::MetricsAuxInfo,
provider::{CachedAllowedIps, CachedRoleSecret},
},
BranchId, EndpointId, ProjectId,
};
use futures::TryFutureExt; use futures::TryFutureExt;
use std::{str::FromStr, sync::Arc}; use std::{str::FromStr, sync::Arc};
use thiserror::Error; use thiserror::Error;
@@ -114,7 +120,12 @@ impl Api {
let node = NodeInfo { let node = NodeInfo {
config, config,
aux: Default::default(), aux: MetricsAuxInfo {
endpoint_id: (&EndpointId::from("endpoint")).into(),
project_id: (&ProjectId::from("project")).into(),
branch_id: (&BranchId::from("branch")).into(),
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
},
allow_self_signed_compute: false, allow_self_signed_compute: false,
}; };

View File

@@ -181,15 +181,16 @@ impl super::Api for Api {
} }
let auth_info = self.do_get_auth_info(ctx, user_info).await?; let auth_info = self.do_get_auth_info(ctx, user_info).await?;
if let Some(project_id) = auth_info.project_id { if let Some(project_id) = auth_info.project_id {
let ep_int = ep.into();
self.caches.project_info.insert_role_secret( self.caches.project_info.insert_role_secret(
&project_id, project_id,
ep, ep_int,
user, user.into(),
auth_info.secret.clone(), auth_info.secret.clone(),
); );
self.caches.project_info.insert_allowed_ips( self.caches.project_info.insert_allowed_ips(
&project_id, project_id,
ep, ep_int,
Arc::new(auth_info.allowed_ips), Arc::new(auth_info.allowed_ips),
); );
ctx.set_project_id(project_id); ctx.set_project_id(project_id);
@@ -217,15 +218,16 @@ impl super::Api for Api {
let allowed_ips = Arc::new(auth_info.allowed_ips); let allowed_ips = Arc::new(auth_info.allowed_ips);
let user = &user_info.user; let user = &user_info.user;
if let Some(project_id) = auth_info.project_id { if let Some(project_id) = auth_info.project_id {
let ep_int = ep.into();
self.caches.project_info.insert_role_secret( self.caches.project_info.insert_role_secret(
&project_id, project_id,
ep, ep_int,
user, user.into(),
auth_info.secret.clone(), auth_info.secret.clone(),
); );
self.caches self.caches
.project_info .project_info
.insert_allowed_ips(&project_id, ep, allowed_ips.clone()); .insert_allowed_ips(project_id, ep_int, allowed_ips.clone());
ctx.set_project_id(project_id); ctx.set_project_id(project_id);
} }
Ok(( Ok((
@@ -248,8 +250,7 @@ impl super::Api for Api {
// which means that we might cache it to reduce the load and latency. // which means that we might cache it to reduce the load and latency.
if let Some(cached) = self.caches.node_info.get(&key) { if let Some(cached) = self.caches.node_info.get(&key) {
info!(key = &*key, "found cached compute node info"); info!(key = &*key, "found cached compute node info");
info!("cold_start_info=warm"); ctx.set_project(cached.aux.clone());
ctx.set_cold_start_info(ColdStartInfo::Warm);
return Ok(cached); return Ok(cached);
} }
@@ -260,17 +261,21 @@ impl super::Api for Api {
if permit.should_check_cache() { if permit.should_check_cache() {
if let Some(cached) = self.caches.node_info.get(&key) { if let Some(cached) = self.caches.node_info.get(&key) {
info!(key = &*key, "found cached compute node info"); info!(key = &*key, "found cached compute node info");
info!("cold_start_info=warm"); ctx.set_project(cached.aux.clone());
ctx.set_cold_start_info(ColdStartInfo::Warm);
return Ok(cached); return Ok(cached);
} }
} }
let node = self.do_wake_compute(ctx, user_info).await?; let mut node = self.do_wake_compute(ctx, user_info).await?;
ctx.set_project(node.aux.clone()); ctx.set_project(node.aux.clone());
let cold_start_info = node.aux.cold_start_info.clone().unwrap_or_default(); let cold_start_info = node.aux.cold_start_info;
info!(?cold_start_info, "woken up a compute node"); info!("woken up a compute node");
let (_, cached) = self.caches.node_info.insert(key.clone(), node);
// store the cached node as 'warm'
node.aux.cold_start_info = ColdStartInfo::WarmCached;
let (_, mut cached) = self.caches.node_info.insert(key.clone(), node);
cached.aux.cold_start_info = cold_start_info;
info!(key = &*key, "created a cache entry for compute node info"); info!(key = &*key, "created a cache entry for compute node info");
Ok(cached) Ok(cached)

View File

@@ -11,8 +11,9 @@ use uuid::Uuid;
use crate::{ use crate::{
console::messages::{ColdStartInfo, MetricsAuxInfo}, console::messages::{ColdStartInfo, MetricsAuxInfo},
error::ErrorKind, error::ErrorKind,
intern::{BranchIdInt, ProjectIdInt},
metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND}, metrics::{LatencyTimer, ENDPOINT_ERRORS_BY_KIND, ERROR_BY_KIND},
BranchId, DbName, EndpointId, ProjectId, RoleName, DbName, EndpointId, RoleName,
}; };
use self::parquet::RequestData; use self::parquet::RequestData;
@@ -34,8 +35,8 @@ pub struct RequestMonitoring {
pub span: Span, pub span: Span,
// filled in as they are discovered // filled in as they are discovered
project: Option<ProjectId>, project: Option<ProjectIdInt>,
branch: Option<BranchId>, branch: Option<BranchIdInt>,
endpoint_id: Option<EndpointId>, endpoint_id: Option<EndpointId>,
dbname: Option<DbName>, dbname: Option<DbName>,
user: Option<RoleName>, user: Option<RoleName>,
@@ -43,7 +44,7 @@ pub struct RequestMonitoring {
error_kind: Option<ErrorKind>, error_kind: Option<ErrorKind>,
pub(crate) auth_method: Option<AuthMethod>, pub(crate) auth_method: Option<AuthMethod>,
success: bool, success: bool,
cold_start_info: Option<ColdStartInfo>, pub(crate) cold_start_info: ColdStartInfo,
// extra // extra
// This sender is here to keep the request monitoring channel open while requests are taking place. // This sender is here to keep the request monitoring channel open while requests are taking place.
@@ -92,7 +93,7 @@ impl RequestMonitoring {
error_kind: None, error_kind: None,
auth_method: None, auth_method: None,
success: false, success: false,
cold_start_info: None, cold_start_info: ColdStartInfo::Unknown,
sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()), sender: LOG_CHAN.get().and_then(|tx| tx.upgrade()),
latency_timer: LatencyTimer::new(protocol), latency_timer: LatencyTimer::new(protocol),
@@ -113,26 +114,31 @@ impl RequestMonitoring {
} }
pub fn set_cold_start_info(&mut self, info: ColdStartInfo) { pub fn set_cold_start_info(&mut self, info: ColdStartInfo) {
self.cold_start_info = Some(info); self.cold_start_info = info;
self.latency_timer.cold_start_info(info);
} }
pub fn set_project(&mut self, x: MetricsAuxInfo) { pub fn set_project(&mut self, x: MetricsAuxInfo) {
self.set_endpoint_id(x.endpoint_id); if self.endpoint_id.is_none() {
self.set_endpoint_id(x.endpoint_id.as_str().into())
}
self.branch = Some(x.branch_id); self.branch = Some(x.branch_id);
self.project = Some(x.project_id); self.project = Some(x.project_id);
self.cold_start_info = x.cold_start_info; self.set_cold_start_info(x.cold_start_info);
} }
pub fn set_project_id(&mut self, project_id: ProjectId) { pub fn set_project_id(&mut self, project_id: ProjectIdInt) {
self.project = Some(project_id); self.project = Some(project_id);
} }
pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) { pub fn set_endpoint_id(&mut self, endpoint_id: EndpointId) {
self.span.record("ep", display(&endpoint_id)); if self.endpoint_id.is_none() {
crate::metrics::CONNECTING_ENDPOINTS self.span.record("ep", display(&endpoint_id));
.with_label_values(&[self.protocol]) crate::metrics::CONNECTING_ENDPOINTS
.measure(&endpoint_id); .with_label_values(&[self.protocol])
self.endpoint_id = Some(endpoint_id); .measure(&endpoint_id);
self.endpoint_id = Some(endpoint_id);
}
} }
pub fn set_application(&mut self, app: Option<SmolStr>) { pub fn set_application(&mut self, app: Option<SmolStr>) {

View File

@@ -87,7 +87,7 @@ pub struct RequestData {
/// Or if we make it to proxy_pass /// Or if we make it to proxy_pass
success: bool, success: bool,
/// Indicates if the cplane started the new compute node for this request. /// Indicates if the cplane started the new compute node for this request.
cold_start_info: Option<&'static str>, cold_start_info: &'static str,
/// Tracks time from session start (HTTP request/libpq TCP handshake) /// Tracks time from session start (HTTP request/libpq TCP handshake)
/// Through to success/failure /// Through to success/failure
duration_us: u64, duration_us: u64,
@@ -115,12 +115,7 @@ impl From<&RequestMonitoring> for RequestData {
region: value.region, region: value.region,
error: value.error_kind.as_ref().map(|e| e.to_metric_label()), error: value.error_kind.as_ref().map(|e| e.to_metric_label()),
success: value.success, success: value.success,
cold_start_info: value.cold_start_info.as_ref().map(|x| match x { cold_start_info: value.cold_start_info.as_str(),
crate::console::messages::ColdStartInfo::Unknown => "unknown",
crate::console::messages::ColdStartInfo::Warm => "warm",
crate::console::messages::ColdStartInfo::PoolHit => "pool_hit",
crate::console::messages::ColdStartInfo::PoolMiss => "pool_miss",
}),
duration_us: SystemTime::from(value.first_packet) duration_us: SystemTime::from(value.first_packet)
.elapsed() .elapsed()
.unwrap_or_default() .unwrap_or_default()
@@ -454,7 +449,7 @@ mod tests {
region: "us-east-1", region: "us-east-1",
error: None, error: None,
success: rng.gen(), success: rng.gen(),
cold_start_info: Some("no"), cold_start_info: "no",
duration_us: rng.gen_range(0..30_000_000), duration_us: rng.gen_range(0..30_000_000),
} }
} }
@@ -524,15 +519,15 @@ mod tests {
assert_eq!( assert_eq!(
file_stats, file_stats,
[ [
(1314406, 3, 6000), (1314385, 3, 6000),
(1314399, 3, 6000), (1314378, 3, 6000),
(1314459, 3, 6000), (1314438, 3, 6000),
(1314416, 3, 6000), (1314395, 3, 6000),
(1314546, 3, 6000), (1314525, 3, 6000),
(1314388, 3, 6000), (1314367, 3, 6000),
(1314180, 3, 6000), (1314159, 3, 6000),
(1314416, 3, 6000), (1314395, 3, 6000),
(438359, 1, 2000) (438352, 1, 2000)
] ]
); );
@@ -562,11 +557,11 @@ mod tests {
assert_eq!( assert_eq!(
file_stats, file_stats,
[ [
(1220668, 5, 10000), (1220633, 5, 10000),
(1226818, 5, 10000), (1226783, 5, 10000),
(1228612, 5, 10000), (1228577, 5, 10000),
(1227974, 5, 10000), (1227939, 5, 10000),
(1219252, 5, 10000) (1219217, 5, 10000)
] ]
); );
@@ -598,11 +593,11 @@ mod tests {
assert_eq!( assert_eq!(
file_stats, file_stats,
[ [
(1206315, 5, 10000), (1206280, 5, 10000),
(1206046, 5, 10000), (1206011, 5, 10000),
(1206339, 5, 10000), (1206304, 5, 10000),
(1206327, 5, 10000), (1206292, 5, 10000),
(1206582, 5, 10000) (1206547, 5, 10000)
] ]
); );
@@ -627,15 +622,15 @@ mod tests {
assert_eq!( assert_eq!(
file_stats, file_stats,
[ [
(1314406, 3, 6000), (1314385, 3, 6000),
(1314399, 3, 6000), (1314378, 3, 6000),
(1314459, 3, 6000), (1314438, 3, 6000),
(1314416, 3, 6000), (1314395, 3, 6000),
(1314546, 3, 6000), (1314525, 3, 6000),
(1314388, 3, 6000), (1314367, 3, 6000),
(1314180, 3, 6000), (1314159, 3, 6000),
(1314416, 3, 6000), (1314395, 3, 6000),
(438359, 1, 2000) (438352, 1, 2000)
] ]
); );
@@ -672,7 +667,7 @@ mod tests {
// files are smaller than the size threshold, but they took too long to fill so were flushed early // files are smaller than the size threshold, but they took too long to fill so were flushed early
assert_eq!( assert_eq!(
file_stats, file_stats,
[(658837, 2, 3001), (658551, 2, 3000), (658347, 2, 2999)] [(658823, 2, 3001), (658537, 2, 3000), (658333, 2, 2999)]
); );
tmpdir.close().unwrap(); tmpdir.close().unwrap();

View File

@@ -12,6 +12,8 @@ use metrics::{
use once_cell::sync::Lazy; use once_cell::sync::Lazy;
use tokio::time::{self, Instant}; use tokio::time::{self, Instant};
use crate::console::messages::ColdStartInfo;
pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| { pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
register_int_counter_pair_vec!( register_int_counter_pair_vec!(
"proxy_opened_db_connections_total", "proxy_opened_db_connections_total",
@@ -50,8 +52,8 @@ pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
"proxy_compute_connection_latency_seconds", "proxy_compute_connection_latency_seconds",
"Time it took for proxy to establish a connection to the compute endpoint", "Time it took for proxy to establish a connection to the compute endpoint",
// http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane // http/ws/tcp, true/false, true/false, success/failure, client/client_and_cplane
// 3 * 2 * 2 * 2 * 2 = 48 counters // 3 * 6 * 2 * 2 = 72 counters
&["protocol", "cache_miss", "pool_miss", "outcome", "excluded"], &["protocol", "cold_start_info", "outcome", "excluded"],
// largest bucket = 2^16 * 0.5ms = 32s // largest bucket = 2^16 * 0.5ms = 32s
exponential_buckets(0.0005, 2.0, 16).unwrap(), exponential_buckets(0.0005, 2.0, 16).unwrap(),
) )
@@ -183,6 +185,20 @@ struct Accumulated {
compute: time::Duration, compute: time::Duration,
} }
enum Outcome {
Success,
Failed,
}
impl Outcome {
fn as_str(&self) -> &'static str {
match self {
Outcome::Success => "success",
Outcome::Failed => "failed",
}
}
}
pub struct LatencyTimer { pub struct LatencyTimer {
// time since the stopwatch was started // time since the stopwatch was started
start: time::Instant, start: time::Instant,
@@ -192,9 +208,8 @@ pub struct LatencyTimer {
accumulated: Accumulated, accumulated: Accumulated,
// label data // label data
protocol: &'static str, protocol: &'static str,
cache_miss: bool, cold_start_info: ColdStartInfo,
pool_miss: bool, outcome: Outcome,
outcome: &'static str,
} }
pub struct LatencyTimerPause<'a> { pub struct LatencyTimerPause<'a> {
@@ -210,11 +225,9 @@ impl LatencyTimer {
stop: None, stop: None,
accumulated: Accumulated::default(), accumulated: Accumulated::default(),
protocol, protocol,
cache_miss: false, cold_start_info: ColdStartInfo::Unknown,
// by default we don't do pooling
pool_miss: true,
// assume failed unless otherwise specified // assume failed unless otherwise specified
outcome: "failed", outcome: Outcome::Failed,
} }
} }
@@ -226,12 +239,8 @@ impl LatencyTimer {
} }
} }
pub fn cache_miss(&mut self) { pub fn cold_start_info(&mut self, cold_start_info: ColdStartInfo) {
self.cache_miss = true; self.cold_start_info = cold_start_info;
}
pub fn pool_hit(&mut self) {
self.pool_miss = false;
} }
pub fn success(&mut self) { pub fn success(&mut self) {
@@ -239,7 +248,7 @@ impl LatencyTimer {
self.stop = Some(time::Instant::now()); self.stop = Some(time::Instant::now());
// success // success
self.outcome = "success"; self.outcome = Outcome::Success;
} }
} }
@@ -264,9 +273,8 @@ impl Drop for LatencyTimer {
COMPUTE_CONNECTION_LATENCY COMPUTE_CONNECTION_LATENCY
.with_label_values(&[ .with_label_values(&[
self.protocol, self.protocol,
bool_to_str(self.cache_miss), self.cold_start_info.as_str(),
bool_to_str(self.pool_miss), self.outcome.as_str(),
self.outcome,
"client", "client",
]) ])
.observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64()); .observe((duration.saturating_sub(self.accumulated.client)).as_secs_f64());
@@ -275,9 +283,8 @@ impl Drop for LatencyTimer {
COMPUTE_CONNECTION_LATENCY COMPUTE_CONNECTION_LATENCY
.with_label_values(&[ .with_label_values(&[
self.protocol, self.protocol,
bool_to_str(self.cache_miss), self.cold_start_info.as_str(),
bool_to_str(self.pool_miss), self.outcome.as_str(),
self.outcome,
"client_and_cplane", "client_and_cplane",
]) ])
.observe((duration.saturating_sub(accumulated_total)).as_secs_f64()); .observe((duration.saturating_sub(accumulated_total)).as_secs_f64());

View File

@@ -87,7 +87,6 @@ impl ConnectMechanism for TcpMechanism<'_> {
} }
/// Try to connect to the compute node, retrying if necessary. /// Try to connect to the compute node, retrying if necessary.
/// This function might update `node_info`, so we take it by `&mut`.
#[tracing::instrument(skip_all)] #[tracing::instrument(skip_all)]
pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>( pub async fn connect_to_compute<M: ConnectMechanism, B: ComputeConnectBackend>(
ctx: &mut RequestMonitoring, ctx: &mut RequestMonitoring,
@@ -132,7 +131,6 @@ where
} else { } else {
// if we failed to connect, it's likely that the compute node was suspended, wake a new compute node // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
info!("compute node's state has likely changed; requesting a wake-up"); info!("compute node's state has likely changed; requesting a wake-up");
ctx.latency_timer.cache_miss();
let old_node_info = invalidate_cache(node_info); let old_node_info = invalidate_cache(node_info);
let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?; let mut node_info = wake_compute(&mut num_retries, ctx, user_info).await?;
node_info.reuse_settings(old_node_info); node_info.reuse_settings(old_node_info);

View File

@@ -19,8 +19,8 @@ pub async fn proxy_pass(
aux: MetricsAuxInfo, aux: MetricsAuxInfo,
) -> anyhow::Result<()> { ) -> anyhow::Result<()> {
let usage = USAGE_METRICS.register(Ids { let usage = USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(), endpoint_id: aux.endpoint_id,
branch_id: aux.branch_id.clone(), branch_id: aux.branch_id,
}); });
let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]); let m_sent = NUM_BYTES_PROXIED_COUNTER.with_label_values(&["tx"]);

View File

@@ -12,11 +12,12 @@ use crate::auth::backend::{
}; };
use crate::config::CertResolver; use crate::config::CertResolver;
use crate::console::caches::NodeInfoCache; use crate::console::caches::NodeInfoCache;
use crate::console::messages::MetricsAuxInfo;
use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend}; use crate::console::provider::{CachedAllowedIps, CachedRoleSecret, ConsoleBackend};
use crate::console::{self, CachedNodeInfo, NodeInfo}; use crate::console::{self, CachedNodeInfo, NodeInfo};
use crate::error::ErrorKind; use crate::error::ErrorKind;
use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT}; use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
use crate::{http, sasl, scram}; use crate::{http, sasl, scram, BranchId, EndpointId, ProjectId};
use anyhow::{bail, Context}; use anyhow::{bail, Context};
use async_trait::async_trait; use async_trait::async_trait;
use rstest::rstest; use rstest::rstest;
@@ -512,7 +513,12 @@ impl TestBackend for TestConnectMechanism {
fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo { fn helper_create_cached_node_info(cache: &'static NodeInfoCache) -> CachedNodeInfo {
let node = NodeInfo { let node = NodeInfo {
config: compute::ConnCfg::new(), config: compute::ConnCfg::new(),
aux: Default::default(), aux: MetricsAuxInfo {
endpoint_id: (&EndpointId::from("endpoint")).into(),
project_id: (&ProjectId::from("project")).into(),
branch_id: (&BranchId::from("branch")).into(),
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
},
allow_self_signed_compute: false, allow_self_signed_compute: false,
}; };
let (_, node) = cache.insert("key".into(), node); let (_, node) = cache.insert("key".into(), node);

View File

@@ -9,7 +9,6 @@ use crate::{
config::ProxyConfig, config::ProxyConfig,
console::{ console::{
errors::{GetAuthInfoError, WakeComputeError}, errors::{GetAuthInfoError, WakeComputeError},
messages::ColdStartInfo,
CachedNodeInfo, CachedNodeInfo,
}, },
context::RequestMonitoring, context::RequestMonitoring,
@@ -57,7 +56,10 @@ impl PoolingBackend {
let auth_outcome = let auth_outcome =
crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?; crate::auth::validate_password_and_exchange(&conn_info.password, secret).await?;
let res = match auth_outcome { let res = match auth_outcome {
crate::sasl::Outcome::Success(key) => Ok(key), crate::sasl::Outcome::Success(key) => {
info!("user successfully authenticated");
Ok(key)
}
crate::sasl::Outcome::Failure(reason) => { crate::sasl::Outcome::Failure(reason) => {
info!("auth backend failed with an error: {reason}"); info!("auth backend failed with an error: {reason}");
Err(AuthError::auth_failed(&*conn_info.user_info.user)) Err(AuthError::auth_failed(&*conn_info.user_info.user))
@@ -89,8 +91,6 @@ impl PoolingBackend {
}; };
if let Some(client) = maybe_client { if let Some(client) = maybe_client {
info!("cold_start_info=warm");
ctx.set_cold_start_info(ColdStartInfo::Warm);
return Ok(client); return Ok(client);
} }
let conn_id = uuid::Uuid::new_v4(); let conn_id = uuid::Uuid::new_v4();

View File

@@ -17,7 +17,7 @@ use tokio::time::Instant;
use tokio_postgres::tls::NoTlsStream; use tokio_postgres::tls::NoTlsStream;
use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket}; use tokio_postgres::{AsyncMessage, ReadyForQueryStatus, Socket};
use crate::console::messages::MetricsAuxInfo; use crate::console::messages::{ColdStartInfo, MetricsAuxInfo};
use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL}; use crate::metrics::{ENDPOINT_POOLS, GC_LATENCY, NUM_OPEN_CLIENTS_IN_HTTP_POOL};
use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS}; use crate::usage_metrics::{Ids, MetricCounter, USAGE_METRICS};
use crate::{ use crate::{
@@ -383,9 +383,12 @@ impl<C: ClientInnerExt> GlobalConnPool<C> {
"pid", "pid",
&tracing::field::display(client.inner.get_process_id()), &tracing::field::display(client.inner.get_process_id()),
); );
info!("pool: reusing connection '{conn_info}'"); info!(
cold_start_info = ColdStartInfo::HttpPoolHit.as_str(),
"pool: reusing connection '{conn_info}'"
);
client.session.send(ctx.session_id)?; client.session.send(ctx.session_id)?;
ctx.latency_timer.pool_hit(); ctx.set_cold_start_info(ColdStartInfo::HttpPoolHit);
ctx.latency_timer.success(); ctx.latency_timer.success();
return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool))); return Ok(Some(Client::new(client, conn_info.clone(), endpoint_pool)));
} }
@@ -454,8 +457,9 @@ pub fn poll_client<C: ClientInnerExt>(
let (tx, mut rx) = tokio::sync::watch::channel(session_id); let (tx, mut rx) = tokio::sync::watch::channel(session_id);
let span = info_span!(parent: None, "connection", %conn_id); let span = info_span!(parent: None, "connection", %conn_id);
let cold_start_info = ctx.cold_start_info;
span.in_scope(|| { span.in_scope(|| {
info!(%conn_info, %session_id, "new connection"); info!(cold_start_info = cold_start_info.as_str(), %conn_info, %session_id, "new connection");
}); });
let pool = match conn_info.endpoint_cache_key() { let pool = match conn_info.endpoint_cache_key() {
Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)), Some(endpoint) => Arc::downgrade(&global_pool.get_or_create_endpoint_pool(&endpoint)),
@@ -565,8 +569,8 @@ impl<C: ClientInnerExt> Client<C> {
pub fn metrics(&self) -> Arc<MetricCounter> { pub fn metrics(&self) -> Arc<MetricCounter> {
let aux = &self.inner.as_ref().unwrap().aux; let aux = &self.inner.as_ref().unwrap().aux;
USAGE_METRICS.register(Ids { USAGE_METRICS.register(Ids {
endpoint_id: aux.endpoint_id.clone(), endpoint_id: aux.endpoint_id,
branch_id: aux.branch_id.clone(), branch_id: aux.branch_id,
}) })
} }
} }
@@ -666,6 +670,8 @@ impl<C: ClientInnerExt> Drop for Client<C> {
mod tests { mod tests {
use std::{mem, sync::atomic::AtomicBool}; use std::{mem, sync::atomic::AtomicBool};
use crate::{BranchId, EndpointId, ProjectId};
use super::*; use super::*;
struct MockClient(Arc<AtomicBool>); struct MockClient(Arc<AtomicBool>);
@@ -691,7 +697,12 @@ mod tests {
ClientInner { ClientInner {
inner: client, inner: client,
session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()), session: tokio::sync::watch::Sender::new(uuid::Uuid::new_v4()),
aux: Default::default(), aux: MetricsAuxInfo {
endpoint_id: (&EndpointId::from("endpoint")).into(),
project_id: (&ProjectId::from("project")).into(),
branch_id: (&BranchId::from("branch")).into(),
cold_start_info: crate::console::messages::ColdStartInfo::Warm,
},
conn_id: uuid::Uuid::new_v4(), conn_id: uuid::Uuid::new_v4(),
} }
} }

View File

@@ -3,7 +3,8 @@
use crate::{ use crate::{
config::{MetricBackupCollectionConfig, MetricCollectionConfig}, config::{MetricBackupCollectionConfig, MetricCollectionConfig},
context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD}, context::parquet::{FAILED_UPLOAD_MAX_RETRIES, FAILED_UPLOAD_WARN_THRESHOLD},
http, BranchId, EndpointId, http,
intern::{BranchIdInt, EndpointIdInt},
}; };
use anyhow::Context; use anyhow::Context;
use async_compression::tokio::write::GzipEncoder; use async_compression::tokio::write::GzipEncoder;
@@ -43,8 +44,8 @@ const DEFAULT_HTTP_REPORTING_TIMEOUT: Duration = Duration::from_secs(60);
/// because we enrich the event with project_id in the control-plane endpoint. /// because we enrich the event with project_id in the control-plane endpoint.
#[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)] #[derive(Eq, Hash, PartialEq, Serialize, Deserialize, Debug, Clone)]
pub struct Ids { pub struct Ids {
pub endpoint_id: EndpointId, pub endpoint_id: EndpointIdInt,
pub branch_id: BranchId, pub branch_id: BranchIdInt,
} }
pub trait MetricCounterRecorder { pub trait MetricCounterRecorder {
@@ -494,7 +495,7 @@ mod tests {
use url::Url; use url::Url;
use super::*; use super::*;
use crate::{http, rate_limiter::RateLimiterConfig}; use crate::{http, rate_limiter::RateLimiterConfig, BranchId, EndpointId};
#[tokio::test] #[tokio::test]
async fn metrics() { async fn metrics() {
@@ -536,8 +537,8 @@ mod tests {
// register a new counter // register a new counter
let counter = metrics.register(Ids { let counter = metrics.register(Ids {
endpoint_id: "e1".into(), endpoint_id: (&EndpointId::from("e1")).into(),
branch_id: "b1".into(), branch_id: (&BranchId::from("b1")).into(),
}); });
// the counter should be observed despite 0 egress // the counter should be observed despite 0 egress