From e1935f42a1d1de4948fa6f689f2fe4cd83d15549 Mon Sep 17 00:00:00 2001
From: Arseny Sher
Date: Mon, 27 Nov 2023 15:40:01 +0300
Subject: [PATCH 1/4] Don't generate core dump when walproposer intentionally
 panics.

Walproposer sometimes intentionally PANICs when its term is defeated, as
the basebackup is likely spoiled by that time. We don't want a core dump
in this case.
---
 pgxn/neon/neon_utils.c  | 25 +++++++++++++++++++++++++
 pgxn/neon/neon_utils.h  |  1 +
 pgxn/neon/walproposer.c | 15 ++++++++++++++-
 3 files changed, 40 insertions(+), 1 deletion(-)

diff --git a/pgxn/neon/neon_utils.c b/pgxn/neon/neon_utils.c
index 06faea7490..807d2decf6 100644
--- a/pgxn/neon/neon_utils.c
+++ b/pgxn/neon/neon_utils.c
@@ -1,3 +1,6 @@
+
+#include <sys/resource.h>
+
 #include "postgres.h"
 
 #include "access/timeline.h"
@@ -114,3 +117,25 @@ pq_sendint64_le(StringInfo buf, uint64 i)
 	memcpy(buf->data + buf->len, &i, sizeof(uint64));
 	buf->len += sizeof(uint64);
 }
+
+/*
+ * Disables core dump for the current process.
+ */
+void
+disable_core_dump(void)
+{
+	struct rlimit rlim;
+
+#ifdef WALPROPOSER_LIB			/* skip in simulation mode */
+	return;
+#endif
+
+	rlim.rlim_cur = 0;
+	rlim.rlim_max = 0;
+	if (setrlimit(RLIMIT_CORE, &rlim))
+	{
+		int			save_errno = errno;
+
+		fprintf(stderr, "WARNING: disable cores setrlimit failed: %s\n", strerror(save_errno));
+	}
+}
diff --git a/pgxn/neon/neon_utils.h b/pgxn/neon/neon_utils.h
index e3fafc8d0f..20745d8b26 100644
--- a/pgxn/neon/neon_utils.h
+++ b/pgxn/neon/neon_utils.h
@@ -8,5 +8,6 @@ uint32 pq_getmsgint32_le(StringInfo msg);
 uint64 pq_getmsgint64_le(StringInfo msg);
 void pq_sendint32_le(StringInfo buf, uint32 i);
 void pq_sendint64_le(StringInfo buf, uint64 i);
+extern void disable_core_dump(void);
 
 #endif							/* __NEON_UTILS_H__ */
diff --git a/pgxn/neon/walproposer.c b/pgxn/neon/walproposer.c
index 7d9dbfdb7f..fc3332612c 100644
--- a/pgxn/neon/walproposer.c
+++ b/pgxn/neon/walproposer.c
@@ -35,6 +35,8 @@
  *
  *-------------------------------------------------------------------------
  */
+#include <sys/resource.h>
+
 #include "postgres.h"
 #include "libpq/pqformat.h"
 #include "neon.h"
@@ -1069,6 +1071,12 @@ DetermineEpochStartLsn(WalProposer *wp)
 	if (!((dth->n_entries >= 1) && (dth->entries[dth->n_entries - 1].term ==
 									walprop_shared->mineLastElectedTerm)))
 	{
+		/*
+		 * Panic to restart PG as we need to retake the basebackup.
+		 * However, don't dump core as this is a somewhat expected
+		 * scenario.
+		 */
+		disable_core_dump();
 		walprop_log(PANIC,
 					"collected propEpochStartLsn %X/%X, but basebackup LSN %X/%X",
 					LSN_FORMAT_ARGS(wp->propEpochStartLsn),
@@ -1445,7 +1453,12 @@ RecvAppendResponses(Safekeeper *sk)
 	if (sk->appendResponse.term > wp->propTerm)
 	{
-		/* Another compute with higher term is running. */
+		/*
+		 * Another compute with higher term is running. Panic to restart
+		 * PG as we likely need to retake the basebackup. However, don't
+		 * dump core as this is a somewhat expected scenario.
+		 */
+		disable_core_dump();
 		walprop_log(PANIC, "WAL acceptor %s:%s with term " INT64_FORMAT " rejected our request, our term " INT64_FORMAT "",
 					sk->host, sk->port, sk->appendResponse.term, wp->propTerm);
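A note on the mechanism used by the patch above: `setrlimit(RLIMIT_CORE, ...)` with both limits set to zero tells the kernel not to write a core file when the process aborts. For readers working on the Rust side of this codebase, a rough analogue of `disable_core_dump()` is sketched below; it assumes the `libc` crate and is illustrative only, not part of the patch:

```rust
// Illustrative sketch only: a Rust analogue of disable_core_dump(),
// assuming the `libc` crate. Zeroing both the soft and hard RLIMIT_CORE
// limits prevents the kernel from writing a core file on abort.
fn disable_core_dump() {
    let rlim = libc::rlimit {
        rlim_cur: 0, // soft limit
        rlim_max: 0, // hard limit
    };
    // SAFETY: setrlimit only reads the struct behind the pointer we pass.
    if unsafe { libc::setrlimit(libc::RLIMIT_CORE, &rlim) } != 0 {
        eprintln!(
            "WARNING: disabling core dumps failed: {}",
            std::io::Error::last_os_error()
        );
    }
}
```
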
From dbdb1d21f2eae8e9616dd12656868fcdb59c603e Mon Sep 17 00:00:00 2001
From: John Spray
Date: Mon, 18 Dec 2023 10:29:19 +0000
Subject: [PATCH 2/4] pageserver: on-demand activation cleanups (#6157)

## Problem

#6112 added some logs and metrics: clean these up a bit:

- Avoid counting startup completions for tenants launched after startup
- exclude no-op cases from timing histograms
- remove a rogue log message

---
 pageserver/src/tenant.rs | 30 ++++++++++++++++++++++--------
 1 file changed, 22 insertions(+), 8 deletions(-)

diff --git a/pageserver/src/tenant.rs b/pageserver/src/tenant.rs
index 1478a1a445..eceef6bf78 100644
--- a/pageserver/src/tenant.rs
+++ b/pageserver/src/tenant.rs
@@ -629,9 +629,12 @@ impl Tenant {
             "attach tenant",
             false,
             async move {
+                // Is this tenant being spawned as part of process startup?
+                let starting_up = init_order.is_some();
                 scopeguard::defer! {
-                    tracing::info!("Increment complete count");
-                    TENANT.startup_complete.inc();
+                    if starting_up {
+                        TENANT.startup_complete.inc();
+                    }
                 }
 
                 // Ideally we should use Tenant::set_broken_no_wait, but it is not supposed to be used when tenant is in loading state.
@@ -711,7 +714,11 @@ impl Tenant {
 
                 let preload_timer = TENANT.preload.start_timer();
                 let preload = match mode {
-                    SpawnMode::Create => {None},
+                    SpawnMode::Create => {
+                        // Don't count the skipped preload into the histogram of preload durations
+                        preload_timer.stop_and_discard();
+                        None
+                    },
                     SpawnMode::Normal => {
                         match &remote_storage {
                             Some(remote_storage) => Some(
@@ -721,7 +728,11 @@ impl Tenant {
                                     tracing::info_span!(parent: None, "attach_preload", tenant_id=%tenant_shard_id.tenant_id, shard_id=%tenant_shard_id.shard_slug()),
                                 )
                                 .await {
-                                    Ok(p) => p,
+                                    Ok(p) => {
+                                        preload_timer.observe_duration();
+                                        p
+                                    }
+                                    ,
                                     Err(e) => {
                                         make_broken(&tenant_clone, anyhow::anyhow!(e));
                                         return Ok(());
@@ -732,7 +743,6 @@ impl Tenant {
                         }
                     }
                 };
-                preload_timer.observe_duration();
 
                 // Remote preload is complete.
                 drop(remote_load_completion);
@@ -784,15 +794,19 @@ impl Tenant {
                     }
                 }
 
-                let attach_timer = TENANT.attach.start_timer();
+                // We will time the duration of the attach phase unless this is a creation (attach will do no work)
+                let attach_timer = match mode {
+                    SpawnMode::Create => None,
+                    SpawnMode::Normal => Some(TENANT.attach.start_timer()),
+                };
                 match tenant_clone.attach(preload, &ctx).await {
                     Ok(()) => {
                         info!("attach finished, activating");
-                        attach_timer.observe_duration();
+                        if let Some(t) = attach_timer { t.observe_duration(); }
                         tenant_clone.activate(broker_client, None, &ctx);
                     }
                     Err(e) => {
-                        attach_timer.observe_duration();
+                        if let Some(t) = attach_timer { t.observe_duration(); }
                        make_broken(&tenant_clone, anyhow::anyhow!(e));
                     }
                 }
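The idiom this patch settles on — only materializing a Prometheus timer when the code path will actually do work, and discarding or skipping it otherwise — generalizes well. A minimal sketch of the same pattern, assuming the `prometheus` and `once_cell` crates (the metric name here is invented for illustration):

```rust
use once_cell::sync::Lazy;
use prometheus::{register_histogram, Histogram, HistogramTimer};

// Hypothetical metric, for illustration only.
static ATTACH_SECONDS: Lazy<Histogram> = Lazy::new(|| {
    register_histogram!("example_attach_seconds", "Time spent attaching").unwrap()
});

/// Start a timer only when the phase will do real work, so no-op paths
/// (e.g. tenant creation) don't pollute the duration histogram.
fn maybe_time(will_do_work: bool) -> Option<HistogramTimer> {
    will_do_work.then(|| ATTACH_SECONDS.start_timer())
}

fn attach(will_do_work: bool) {
    let timer = maybe_time(will_do_work);
    // ... do the (possibly trivial) work ...
    if let Some(t) = timer {
        t.observe_duration();
    }
}
```
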
From 17bde7eda516302a57f46bd95140e8d89e68af73 Mon Sep 17 00:00:00 2001
From: Conrad Ludgate
Date: Mon, 18 Dec 2023 10:59:49 +0000
Subject: [PATCH 3/4] proxy refactor large files (#6153)

## Problem

The `src/proxy.rs` file is far too large.

## Summary of changes

Creates 3 new files:

```
src/metrics.rs
src/proxy/retry.rs
src/proxy/connect_compute.rs
```

---
 proxy/src/auth/backend.rs             |   4 +-
 proxy/src/auth/backend/classic.rs     |   2 +-
 proxy/src/auth/backend/hacks.rs       |   2 +-
 proxy/src/auth/credentials.rs         |   5 +-
 proxy/src/compute.rs                  |   7 +-
 proxy/src/console/provider.rs         |   2 +-
 proxy/src/console/provider/neon.rs    |   2 +-
 proxy/src/http.rs                     |   2 +-
 proxy/src/lib.rs                      |   1 +
 proxy/src/metrics.rs                  | 232 +++++++++++
 proxy/src/proxy.rs                    | 542 +-------------------------
 proxy/src/proxy/connect_compute.rs    | 238 +++++++++++
 proxy/src/proxy/retry.rs              |  68 ++++
 proxy/src/proxy/tests.rs              |   5 +-
 proxy/src/proxy/tests/mitm.rs         |   4 +-
 proxy/src/rate_limiter/limiter.rs     |   6 +-
 proxy/src/serverless.rs               |   2 +-
 proxy/src/serverless/conn_pool.rs     |   7 +-
 proxy/src/serverless/sql_over_http.rs |   2 +-
 19 files changed, 579 insertions(+), 554 deletions(-)
 create mode 100644 proxy/src/metrics.rs
 create mode 100644 proxy/src/proxy/connect_compute.rs
 create mode 100644 proxy/src/proxy/retry.rs

diff --git a/proxy/src/auth/backend.rs b/proxy/src/auth/backend.rs
index ba054b53eb..3b09e05bd2 100644
--- a/proxy/src/auth/backend.rs
+++ b/proxy/src/auth/backend.rs
@@ -11,7 +11,8 @@ use crate::auth::validate_password_and_exchange;
 use crate::console::errors::GetAuthInfoError;
 use crate::console::provider::AuthInfo;
 use crate::console::AuthSecret;
-use crate::proxy::{handle_try_wake, retry_after, LatencyTimer};
+use crate::proxy::connect_compute::handle_try_wake;
+use crate::proxy::retry::retry_after;
 use crate::scram;
 use crate::stream::Stream;
 use crate::{
@@ -22,6 +23,7 @@ use crate::{
         provider::{CachedNodeInfo, ConsoleReqExtra},
         Api,
     },
+    metrics::LatencyTimer,
     stream, url,
 };
 use futures::TryFutureExt;
diff --git a/proxy/src/auth/backend/classic.rs b/proxy/src/auth/backend/classic.rs
index ce52daf16c..5c394ec649 100644
--- a/proxy/src/auth/backend/classic.rs
+++ b/proxy/src/auth/backend/classic.rs
@@ -4,7 +4,7 @@ use crate::{
     compute,
     config::AuthenticationConfig,
     console::AuthSecret,
-    proxy::LatencyTimer,
+    metrics::LatencyTimer,
     sasl,
     stream::{PqStream, Stream},
 };
diff --git a/proxy/src/auth/backend/hacks.rs b/proxy/src/auth/backend/hacks.rs
index abbd25008b..5dde514bca 100644
--- a/proxy/src/auth/backend/hacks.rs
+++ b/proxy/src/auth/backend/hacks.rs
@@ -4,7 +4,7 @@ use super::{
 use crate::{
     auth::{self, AuthFlow},
     console::AuthSecret,
-    proxy::LatencyTimer,
+    metrics::LatencyTimer,
     sasl,
     stream::{self, Stream},
 };
diff --git a/proxy/src/auth/credentials.rs b/proxy/src/auth/credentials.rs
index 72149e8e29..c04769a199 100644
--- a/proxy/src/auth/credentials.rs
+++ b/proxy/src/auth/credentials.rs
@@ -1,9 +1,8 @@
 //! User credentials used in authentication.
 
 use crate::{
-    auth::password_hack::parse_endpoint_param,
-    error::UserFacingError,
-    proxy::{neon_options_str, NUM_CONNECTION_ACCEPTED_BY_SNI},
+    auth::password_hack::parse_endpoint_param, error::UserFacingError,
+    metrics::NUM_CONNECTION_ACCEPTED_BY_SNI, proxy::neon_options_str,
 };
 use itertools::Itertools;
 use pq_proto::StartupMessageParams;
diff --git a/proxy/src/compute.rs b/proxy/src/compute.rs
index f5f7270bf4..a54ba56e43 100644
--- a/proxy/src/compute.rs
+++ b/proxy/src/compute.rs
@@ -1,9 +1,6 @@
 use crate::{
-    auth::parse_endpoint_param,
-    cancellation::CancelClosure,
-    console::errors::WakeComputeError,
-    error::UserFacingError,
-    proxy::{neon_option, NUM_DB_CONNECTIONS_GAUGE},
+    auth::parse_endpoint_param, cancellation::CancelClosure, console::errors::WakeComputeError,
+    error::UserFacingError, metrics::NUM_DB_CONNECTIONS_GAUGE, proxy::neon_option,
 };
 use futures::{FutureExt, TryFutureExt};
 use itertools::Itertools;
diff --git a/proxy/src/console/provider.rs b/proxy/src/console/provider.rs
index deab966d9e..8d399f26ea 100644
--- a/proxy/src/console/provider.rs
+++ b/proxy/src/console/provider.rs
@@ -21,7 +21,7 @@ pub mod errors {
     use crate::{
         error::{io_error, UserFacingError},
         http,
-        proxy::ShouldRetry,
+        proxy::retry::ShouldRetry,
     };
     use thiserror::Error;
 
diff --git a/proxy/src/console/provider/neon.rs b/proxy/src/console/provider/neon.rs
index 192252a0df..f748c9a41f 100644
--- a/proxy/src/console/provider/neon.rs
+++ b/proxy/src/console/provider/neon.rs
@@ -5,7 +5,7 @@ use super::{
     errors::{ApiError, GetAuthInfoError, WakeComputeError},
     ApiCaches, ApiLocks, AuthInfo, AuthSecret, CachedNodeInfo, ConsoleReqExtra, NodeInfo,
 };
-use crate::proxy::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
+use crate::metrics::{ALLOWED_IPS_BY_CACHE_OUTCOME, ALLOWED_IPS_NUMBER};
 use crate::{auth::backend::ComputeUserInfo, compute, http, scram};
 use async_trait::async_trait;
 use futures::TryFutureExt;
diff --git a/proxy/src/http.rs b/proxy/src/http.rs
index 09423eca77..59e1492ed4 100644
--- a/proxy/src/http.rs
+++ b/proxy/src/http.rs
@@ -13,7 +13,7 @@ pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio::time::Instant;
 use tracing::trace;
 
-use crate::{proxy::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
+use crate::{metrics::CONSOLE_REQUEST_LATENCY, rate_limiter, url::ApiUrl};
 use reqwest_middleware::RequestBuilder;
 
 /// This is the preferred way to create new http clients,
diff --git a/proxy/src/lib.rs b/proxy/src/lib.rs
index a22600cbb3..2da1eaf482 100644
--- a/proxy/src/lib.rs
+++ b/proxy/src/lib.rs
@@ -16,6 +16,7 @@ pub mod console;
 pub mod error;
 pub mod http;
 pub mod logging;
+pub mod metrics;
 pub mod parse;
 pub mod protocol2;
 pub mod proxy;
diff --git a/proxy/src/metrics.rs b/proxy/src/metrics.rs
new file mode 100644
index 0000000000..8e2a6105b1
--- /dev/null
+++ b/proxy/src/metrics.rs
@@ -0,0 +1,232 @@
+use ::metrics::{
+    exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
+    IntCounterPairVec, IntCounterVec,
+};
+use prometheus::{
+    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
+    IntGaugeVec,
+};
+
+use once_cell::sync::Lazy;
+use tokio::time;
+
+pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "proxy_opened_db_connections_total",
+        "Number of opened connections to a database.",
+        "proxy_closed_db_connections_total",
+        "Number of closed connections to a database.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "proxy_opened_client_connections_total",
+        "Number of opened connections from a client.",
+        "proxy_closed_client_connections_total",
+        "Number of closed connections from a client.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
+    register_int_counter_pair_vec!(
+        "proxy_accepted_connections_total",
+        "Number of client connections accepted.",
+        "proxy_closed_connections_total",
+        "Number of client connections closed.",
+        &["protocol"],
+    )
+    .unwrap()
+});
+
+pub static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "proxy_compute_connection_latency_seconds",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // http/ws/tcp, true/false, true/false, success/failure
+        // 3 * 2 * 2 * 2 = 24 counters
+        &["protocol", "cache_miss", "pool_miss", "outcome"],
+        // largest bucket = 2^16 * 0.5ms = 32s
+        exponential_buckets(0.0005, 2.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
+    register_histogram_vec!(
+        "proxy_console_request_latency",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // proxy_wake_compute/proxy_get_role_info
+        &["request"],
+        // largest bucket = 2^16 * 0.2ms = 13s
+        exponential_buckets(0.0002, 2.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_allowed_ips_cache_misses",
+        "Number of cache hits/misses for allowed ips",
+        // hit/miss
+        &["outcome"],
+    )
+    .unwrap()
+});
+
+pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_control_plane_token_acquire_seconds",
+        "Time it took for proxy to establish a connection to the compute endpoint",
+        // largest bucket = 3^16 * 0.05ms = 2.15s
+        exponential_buckets(0.00005, 3.0, 16).unwrap(),
+    )
+    .unwrap()
+});
+
+pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
+    register_int_gauge_vec!(
+        "semaphore_control_plane_limit",
+        "Current limit of the semaphore control plane",
+        &["limit"], // 2 counters
+    )
+    .unwrap()
+});
+
+pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_accepted_connections_by_sni",
+        "Number of connections (per sni).",
+        &["kind"],
+    )
+    .unwrap()
+});
+
+pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
+    register_histogram!(
+        "proxy_allowed_ips_number",
+        "Number of allowed ips",
+        vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
+    )
+    .unwrap()
+});
+
+pub struct LatencyTimer {
+    // time since the stopwatch was started
+    start: Option<time::Instant>,
+    // accumulated time on the stopwatch
+    accumulated: std::time::Duration,
+    // label data
+    protocol: &'static str,
+    cache_miss: bool,
+    pool_miss: bool,
+    outcome: &'static str,
+}
+
+pub struct LatencyTimerPause<'a> {
+    timer: &'a mut LatencyTimer,
+}
+
+impl LatencyTimer {
+    pub fn new(protocol: &'static str) -> Self {
+        Self {
+            start: Some(time::Instant::now()),
+            accumulated: std::time::Duration::ZERO,
+            protocol,
+            cache_miss: false,
+            // by default we don't do pooling
+            pool_miss: true,
+            // assume failed unless otherwise specified
+            outcome: "failed",
+        }
+    }
+
+    pub fn pause(&mut self) -> LatencyTimerPause<'_> {
+        // stop the stopwatch and record the time that we have accumulated
+        let start = self.start.take().expect("latency timer should be started");
+        self.accumulated += start.elapsed();
+        LatencyTimerPause { timer: self }
+    }
+
+    pub fn cache_miss(&mut self) {
+        self.cache_miss = true;
+    }
+
+    pub fn pool_hit(&mut self) {
+        self.pool_miss = false;
+    }
+
+    pub fn success(mut self) {
+        self.outcome = "success";
+    }
+}
+
+impl Drop for LatencyTimerPause<'_> {
+    fn drop(&mut self) {
+        // start the stopwatch again
+        self.timer.start = Some(time::Instant::now());
+    }
+}
+
+impl Drop for LatencyTimer {
+    fn drop(&mut self) {
+        let duration =
+            self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
+        COMPUTE_CONNECTION_LATENCY
+            .with_label_values(&[
+                self.protocol,
+                bool_to_str(self.cache_miss),
+                bool_to_str(self.pool_miss),
+                self.outcome,
+            ])
+            .observe(duration.as_secs_f64())
+    }
+}
+
+pub static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_connection_failures_total",
+        "Number of connection failures (per kind).",
+        &["kind"],
+    )
+    .unwrap()
+});
+
+pub static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_connection_failures_breakdown",
+        "Number of wake-up failures (per kind).",
+        &["retry", "kind"],
+    )
+    .unwrap()
+});
+
+pub static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_io_bytes_per_client",
+        "Number of bytes sent/received between client and backend.",
+        crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
+    )
+    .unwrap()
+});
+
+pub static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
+    register_int_counter_vec!(
+        "proxy_io_bytes",
+        "Number of bytes sent/received between all clients and backends.",
+        &["direction"],
+    )
+    .unwrap()
+});
+
+pub const fn bool_to_str(x: bool) -> &'static str {
+    if x {
+        "true"
+    } else {
+        "false"
+    }
+}
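To make the intended use of `LatencyTimer` above concrete: the stopwatch accumulates time across pause/resume cycles, and the outcome label is only flipped to "success" when the caller says so; the `Drop` impl records the sample either way. A hypothetical call site is sketched below — the two helper functions are invented stubs, and this assumes the proxy crate's own module layout:

```rust
use crate::metrics::LatencyTimer;

// Invented stubs, for illustration only.
async fn wait_for_client_auth() {}
async fn connect_to_compute_somehow() {}

// Hypothetical call site: time the whole connection attempt, but stop the
// clock while we wait on the client, since that isn't compute latency.
async fn handle_client() {
    let mut timer = LatencyTimer::new("tcp");

    {
        let _paused = timer.pause();
        wait_for_client_auth().await;
    } // dropping the guard restarts the stopwatch

    connect_to_compute_somehow().await;
    timer.success(); // consumes the timer; Drop records the histogram sample
}
```
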
diff --git a/proxy/src/proxy.rs b/proxy/src/proxy.rs
index da65065179..17e910860c 100644
--- a/proxy/src/proxy.rs
+++ b/proxy/src/proxy.rs
@@ -1,265 +1,41 @@
 #[cfg(test)]
 mod tests;
 
+pub mod connect_compute;
+pub mod retry;
+
 use crate::{
     auth,
     cancellation::{self, CancelMap},
-    compute::{self, PostgresConnection},
+    compute,
     config::{AuthenticationConfig, ProxyConfig, TlsConfig},
-    console::{self, errors::WakeComputeError, messages::MetricsAuxInfo, Api},
-    http::StatusCode,
+    console::{self, messages::MetricsAuxInfo},
+    metrics::{
+        LatencyTimer, NUM_BYTES_PROXIED_COUNTER, NUM_BYTES_PROXIED_PER_CLIENT_COUNTER,
+        NUM_CLIENT_CONNECTION_GAUGE, NUM_CONNECTION_REQUESTS_GAUGE,
+    },
     protocol2::WithClientIp,
     rate_limiter::EndpointRateLimiter,
     stream::{PqStream, Stream},
     usage_metrics::{Ids, USAGE_METRICS},
 };
 use anyhow::{bail, Context};
-use async_trait::async_trait;
 use futures::TryFutureExt;
 use itertools::Itertools;
-use metrics::{
-    exponential_buckets, register_int_counter_pair_vec, register_int_counter_vec,
-    IntCounterPairVec, IntCounterVec,
-};
-use once_cell::sync::{Lazy, OnceCell};
+use once_cell::sync::OnceCell;
 use pq_proto::{BeMessage as Be, FeStartupPacket, StartupMessageParams};
-use prometheus::{
-    register_histogram, register_histogram_vec, register_int_gauge_vec, Histogram, HistogramVec,
-    IntGaugeVec,
-};
 use regex::Regex;
-use std::{error::Error, io, net::IpAddr, ops::ControlFlow, sync::Arc, time::Instant};
-use tokio::{
-    io::{AsyncRead, AsyncWrite, AsyncWriteExt},
-    time,
-};
+use std::{net::IpAddr, sync::Arc};
+use tokio::io::{AsyncRead, AsyncWrite, AsyncWriteExt};
 use tokio_util::sync::CancellationToken;
-use tracing::{error, info, info_span, warn, Instrument};
+use tracing::{error, info, info_span, Instrument};
 use utils::measured_stream::MeasuredStream;
 
-/// Number of times we should retry the `/proxy_wake_compute` http request.
-/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
-pub const NUM_RETRIES_CONNECT: u32 = 16;
-const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
-const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
-const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
+use self::connect_compute::{connect_to_compute, TcpMechanism};
 
 const ERR_INSECURE_CONNECTION: &str = "connection is insecure (try using `sslmode=require`)";
 const ERR_PROTO_VIOLATION: &str = "protocol violation";
 
-pub static NUM_DB_CONNECTIONS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "proxy_opened_db_connections_total",
-        "Number of opened connections to a database.",
-        "proxy_closed_db_connections_total",
-        "Number of closed connections to a database.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CLIENT_CONNECTION_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "proxy_opened_client_connections_total",
-        "Number of opened connections from a client.",
-        "proxy_closed_client_connections_total",
-        "Number of closed connections from a client.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-pub static NUM_CONNECTION_REQUESTS_GAUGE: Lazy<IntCounterPairVec> = Lazy::new(|| {
-    register_int_counter_pair_vec!(
-        "proxy_accepted_connections_total",
-        "Number of client connections accepted.",
-        "proxy_closed_connections_total",
-        "Number of client connections closed.",
-        &["protocol"],
-    )
-    .unwrap()
-});
-
-static COMPUTE_CONNECTION_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "proxy_compute_connection_latency_seconds",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // http/ws/tcp, true/false, true/false, success/failure
-        // 3 * 2 * 2 * 2 = 24 counters
-        &["protocol", "cache_miss", "pool_miss", "outcome"],
-        // largest bucket = 2^16 * 0.5ms = 32s
-        exponential_buckets(0.0005, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static CONSOLE_REQUEST_LATENCY: Lazy<HistogramVec> = Lazy::new(|| {
-    register_histogram_vec!(
-        "proxy_console_request_latency",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // proxy_wake_compute/proxy_get_role_info
-        &["request"],
-        // largest bucket = 2^16 * 0.2ms = 13s
-        exponential_buckets(0.0002, 2.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static ALLOWED_IPS_BY_CACHE_OUTCOME: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_allowed_ips_cache_misses",
-        "Number of cache hits/misses for allowed ips",
-        // hit/miss
-        &["outcome"],
-    )
-    .unwrap()
-});
-
-pub static RATE_LIMITER_ACQUIRE_LATENCY: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_control_plane_token_acquire_seconds",
-        "Time it took for proxy to establish a connection to the compute endpoint",
-        // largest bucket = 3^16 * 0.05ms = 2.15s
-        exponential_buckets(0.00005, 3.0, 16).unwrap(),
-    )
-    .unwrap()
-});
-
-pub static RATE_LIMITER_LIMIT: Lazy<IntGaugeVec> = Lazy::new(|| {
-    register_int_gauge_vec!(
-        "semaphore_control_plane_limit",
-        "Current limit of the semaphore control plane",
-        &["limit"], // 2 counters
-    )
-    .unwrap()
-});
-
-pub static NUM_CONNECTION_ACCEPTED_BY_SNI: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_accepted_connections_by_sni",
-        "Number of connections (per sni).",
-        &["kind"],
-    )
-    .unwrap()
-});
-
-pub static ALLOWED_IPS_NUMBER: Lazy<Histogram> = Lazy::new(|| {
-    register_histogram!(
-        "proxy_allowed_ips_number",
-        "Number of allowed ips",
-        vec![0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 10.0, 20.0, 50.0, 100.0],
-    )
-    .unwrap()
-});
-
-pub struct LatencyTimer {
-    // time since the stopwatch was started
-    start: Option<Instant>,
-    // accumulated time on the stopwatch
-    accumulated: std::time::Duration,
-    // label data
-    protocol: &'static str,
-    cache_miss: bool,
-    pool_miss: bool,
-    outcome: &'static str,
-}
-
-pub struct LatencyTimerPause<'a> {
-    timer: &'a mut LatencyTimer,
-}
-
-impl LatencyTimer {
-    pub fn new(protocol: &'static str) -> Self {
-        Self {
-            start: Some(Instant::now()),
-            accumulated: std::time::Duration::ZERO,
-            protocol,
-            cache_miss: false,
-            // by default we don't do pooling
-            pool_miss: true,
-            // assume failed unless otherwise specified
-            outcome: "failed",
-        }
-    }
-
-    pub fn pause(&mut self) -> LatencyTimerPause<'_> {
-        // stop the stopwatch and record the time that we have accumulated
-        let start = self.start.take().expect("latency timer should be started");
-        self.accumulated += start.elapsed();
-        LatencyTimerPause { timer: self }
-    }
-
-    pub fn cache_miss(&mut self) {
-        self.cache_miss = true;
-    }
-
-    pub fn pool_hit(&mut self) {
-        self.pool_miss = false;
-    }
-
-    pub fn success(mut self) {
-        self.outcome = "success";
-    }
-}
-
-impl Drop for LatencyTimerPause<'_> {
-    fn drop(&mut self) {
-        // start the stopwatch again
-        self.timer.start = Some(Instant::now());
-    }
-}
-
-impl Drop for LatencyTimer {
-    fn drop(&mut self) {
-        let duration =
-            self.start.map(|start| start.elapsed()).unwrap_or_default() + self.accumulated;
-        COMPUTE_CONNECTION_LATENCY
-            .with_label_values(&[
-                self.protocol,
-                bool_to_str(self.cache_miss),
-                bool_to_str(self.pool_miss),
-                self.outcome,
-            ])
-            .observe(duration.as_secs_f64())
-    }
-}
-
-static NUM_CONNECTION_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_connection_failures_total",
-        "Number of connection failures (per kind).",
-        &["kind"],
-    )
-    .unwrap()
-});
-
-static NUM_WAKEUP_FAILURES: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_connection_failures_breakdown",
-        "Number of wake-up failures (per kind).",
-        &["retry", "kind"],
-    )
-    .unwrap()
-});
-
-static NUM_BYTES_PROXIED_PER_CLIENT_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_io_bytes_per_client",
-        "Number of bytes sent/received between client and backend.",
-        crate::console::messages::MetricsAuxInfo::TRAFFIC_LABELS,
-    )
-    .unwrap()
-});
-
-static NUM_BYTES_PROXIED_COUNTER: Lazy<IntCounterVec> = Lazy::new(|| {
-    register_int_counter_vec!(
-        "proxy_io_bytes",
-        "Number of bytes sent/received between all clients and backends.",
-        &["direction"],
-    )
-    .unwrap()
-});
-
 pub async fn run_until_cancelled<F: std::future::Future>(
     f: F,
     cancellation_token: &CancellationToken,
@@ -539,296 +315,6 @@ async fn handshake(
     }
 }
 
-/// If we couldn't connect, a cached connection info might be to blame
-/// (e.g. the compute node's address might've changed at the wrong time).
-/// Invalidate the cache entry (if any) to prevent subsequent errors.
-#[tracing::instrument(name = "invalidate_cache", skip_all)]
-pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
-    let is_cached = node_info.cached();
-    if is_cached {
-        warn!("invalidating stalled compute node info cache entry");
-    }
-    let label = match is_cached {
-        true => "compute_cached",
-        false => "compute_uncached",
-    };
-    NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
-
-    node_info.invalidate().config
-}
-
-/// Try to connect to the compute node once.
-#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
-async fn connect_to_compute_once(
-    node_info: &console::CachedNodeInfo,
-    timeout: time::Duration,
-    proto: &'static str,
-) -> Result<PostgresConnection, compute::ConnectionError> {
-    let allow_self_signed_compute = node_info.allow_self_signed_compute;
-
-    node_info
-        .config
-        .connect(allow_self_signed_compute, timeout, proto)
-        .await
-}
-
-#[async_trait]
-pub trait ConnectMechanism {
-    type Connection;
-    type ConnectError;
-    type Error: From<Self::ConnectError>;
-    async fn connect_once(
-        &self,
-        node_info: &console::CachedNodeInfo,
-        timeout: time::Duration,
-    ) -> Result<Self::Connection, Self::ConnectError>;
-
-    fn update_connect_config(&self, conf: &mut compute::ConnCfg);
-}
-
-pub struct TcpMechanism<'a> {
-    /// KV-dictionary with PostgreSQL connection params.
-    pub params: &'a StartupMessageParams,
-    pub proto: &'static str,
-}
-
-#[async_trait]
-impl ConnectMechanism for TcpMechanism<'_> {
-    type Connection = PostgresConnection;
-    type ConnectError = compute::ConnectionError;
-    type Error = compute::ConnectionError;
-
-    async fn connect_once(
-        &self,
-        node_info: &console::CachedNodeInfo,
-        timeout: time::Duration,
-    ) -> Result<PostgresConnection, compute::ConnectionError> {
-        connect_to_compute_once(node_info, timeout, self.proto).await
-    }
-
-    fn update_connect_config(&self, config: &mut compute::ConnCfg) {
-        config.set_startup_params(self.params);
-    }
-}
-
-const fn bool_to_str(x: bool) -> &'static str {
-    if x {
-        "true"
-    } else {
-        "false"
-    }
-}
-
-fn report_error(e: &WakeComputeError, retry: bool) {
-    use crate::console::errors::ApiError;
-    let retry = bool_to_str(retry);
-    let kind = match e {
-        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
-        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ref text,
-        }) if text.contains("written data quota exceeded")
-            || text.contains("the limit for current plan reached") =>
-        {
-            "quota_exceeded"
-        }
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::LOCKED,
-            ..
-        }) => "api_console_locked",
-        WakeComputeError::ApiError(ApiError::Console {
-            status: StatusCode::BAD_REQUEST,
-            ..
-        }) => "api_console_bad_request",
-        WakeComputeError::ApiError(ApiError::Console { status, .. })
-            if status.is_server_error() =>
-        {
-            "api_console_other_server_error"
-        }
-        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
-        WakeComputeError::TimeoutError => "timeout_error",
-    };
-    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
-}
-
-/// Try to connect to the compute node, retrying if necessary.
-/// This function might update `node_info`, so we take it by `&mut`.
-#[tracing::instrument(skip_all)]
-pub async fn connect_to_compute<M: ConnectMechanism>(
-    mechanism: &M,
-    mut node_info: console::CachedNodeInfo,
-    extra: &console::ConsoleReqExtra,
-    creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
-    mut latency_timer: LatencyTimer,
-) -> Result<M::Connection, M::Error>
-where
-    M::ConnectError: ShouldRetry + std::fmt::Debug,
-    M::Error: From<WakeComputeError>,
-{
-    mechanism.update_connect_config(&mut node_info.config);
-
-    // try once
-    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-        Ok(res) => {
-            latency_timer.success();
-            return Ok(res);
-        }
-        Err(e) => {
-            error!(error = ?e, "could not connect to compute node");
-            (invalidate_cache(node_info), e)
-        }
-    };
-
-    latency_timer.cache_miss();
-
-    let mut num_retries = 1;
-
-    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
-    info!("compute node's state has likely changed; requesting a wake-up");
-    let node_info = loop {
-        let wake_res = match creds {
-            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
-            #[cfg(feature = "testing")]
-            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
-            // nothing to do?
-            auth::BackendType::Link(_) => return Err(err.into()),
-            // test backend
-            #[cfg(test)]
-            auth::BackendType::Test(x) => x.wake_compute(),
-        };
-
-        match handle_try_wake(wake_res, num_retries) {
-            Err(e) => {
-                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
-                report_error(&e, false);
-                return Err(e.into());
-            }
-            // failed to wake up but we can continue to retry
-            Ok(ControlFlow::Continue(e)) => {
-                report_error(&e, true);
-                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
-            }
-            // successfully woke up a compute node and can break the wakeup loop
-            Ok(ControlFlow::Break(mut node_info)) => {
-                node_info.config.reuse_password(&config);
-                mechanism.update_connect_config(&mut node_info.config);
-                break node_info;
-            }
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
-    };
-
-    // now that we have a new node, try connect to it repeatedly.
-    // this can error for a few reasons, for instance:
-    // * DNS connection settings haven't quite propagated yet
-    info!("wake_compute success. attempting to connect");
-    loop {
-        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
-            Ok(res) => {
-                latency_timer.success();
-                return Ok(res);
-            }
-            Err(e) => {
-                let retriable = e.should_retry(num_retries);
-                if !retriable {
-                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
-                    return Err(e.into());
-                }
-                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
-            }
-        }
-
-        let wait_duration = retry_after(num_retries);
-        num_retries += 1;
-
-        time::sleep(wait_duration).await;
-    }
-}
-
-/// Attempts to wake up the compute node.
-/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
-/// * Returns Ok(Break(node)) if the wakeup succeeded
-/// * Returns Err(e) if there was an error
-pub fn handle_try_wake(
-    result: Result<console::CachedNodeInfo, WakeComputeError>,
-    num_retries: u32,
-) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
-    match result {
-        Err(err) => match &err {
-            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
-                Ok(ControlFlow::Continue(err))
-            }
-            _ => Err(err),
-        },
-        // Ready to try again.
-        Ok(new) => Ok(ControlFlow::Break(new)),
-    }
-}
-
-pub trait ShouldRetry {
-    fn could_retry(&self) -> bool;
-    fn should_retry(&self, num_retries: u32) -> bool {
-        match self {
-            _ if num_retries >= NUM_RETRIES_CONNECT => false,
-            err => err.could_retry(),
-        }
-    }
-}
-
-impl ShouldRetry for io::Error {
-    fn could_retry(&self) -> bool {
-        use std::io::ErrorKind;
-        matches!(
-            self.kind(),
-            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
-        )
-    }
-}
-
-impl ShouldRetry for tokio_postgres::error::DbError {
-    fn could_retry(&self) -> bool {
-        use tokio_postgres::error::SqlState;
-        matches!(
-            self.code(),
-            &SqlState::CONNECTION_FAILURE
-                | &SqlState::CONNECTION_EXCEPTION
-                | &SqlState::CONNECTION_DOES_NOT_EXIST
-                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
-        )
-    }
-}
-
-impl ShouldRetry for tokio_postgres::Error {
-    fn could_retry(&self) -> bool {
-        if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
-            io::Error::could_retry(io_err)
-        } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
-            tokio_postgres::error::DbError::could_retry(db_err)
-        } else {
-            false
-        }
-    }
-}
-
-impl ShouldRetry for compute::ConnectionError {
-    fn could_retry(&self) -> bool {
-        match self {
-            compute::ConnectionError::Postgres(err) => err.could_retry(),
-            compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
-            _ => false,
-        }
-    }
-}
-
-pub fn retry_after(num_retries: u32) -> time::Duration {
-    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
-}
-
 /// Finish client connection initialization: confirm auth success, send params, etc.
 #[tracing::instrument(skip_all)]
 async fn prepare_client_connection(
diff --git a/proxy/src/proxy/connect_compute.rs b/proxy/src/proxy/connect_compute.rs
new file mode 100644
index 0000000000..88b0019c49
--- /dev/null
+++ b/proxy/src/proxy/connect_compute.rs
@@ -0,0 +1,238 @@
+use crate::{
+    auth,
+    compute::{self, PostgresConnection},
+    console::{self, errors::WakeComputeError, Api},
+    metrics::{bool_to_str, LatencyTimer, NUM_CONNECTION_FAILURES, NUM_WAKEUP_FAILURES},
+    proxy::retry::{retry_after, ShouldRetry},
+};
+use async_trait::async_trait;
+use hyper::StatusCode;
+use pq_proto::StartupMessageParams;
+use std::ops::ControlFlow;
+use tokio::time;
+use tracing::{error, info, warn};
+
+const CONNECT_TIMEOUT: time::Duration = time::Duration::from_secs(2);
+
+/// If we couldn't connect, a cached connection info might be to blame
+/// (e.g. the compute node's address might've changed at the wrong time).
+/// Invalidate the cache entry (if any) to prevent subsequent errors.
+#[tracing::instrument(name = "invalidate_cache", skip_all)]
+pub fn invalidate_cache(node_info: console::CachedNodeInfo) -> compute::ConnCfg {
+    let is_cached = node_info.cached();
+    if is_cached {
+        warn!("invalidating stalled compute node info cache entry");
+    }
+    let label = match is_cached {
+        true => "compute_cached",
+        false => "compute_uncached",
+    };
+    NUM_CONNECTION_FAILURES.with_label_values(&[label]).inc();
+
+    node_info.invalidate().config
+}
+
+/// Try to connect to the compute node once.
+#[tracing::instrument(name = "connect_once", fields(pid = tracing::field::Empty), skip_all)]
+async fn connect_to_compute_once(
+    node_info: &console::CachedNodeInfo,
+    timeout: time::Duration,
+    proto: &'static str,
+) -> Result<PostgresConnection, compute::ConnectionError> {
+    let allow_self_signed_compute = node_info.allow_self_signed_compute;
+
+    node_info
+        .config
+        .connect(allow_self_signed_compute, timeout, proto)
+        .await
+}
+
+#[async_trait]
+pub trait ConnectMechanism {
+    type Connection;
+    type ConnectError;
+    type Error: From<Self::ConnectError>;
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<Self::Connection, Self::ConnectError>;
+
+    fn update_connect_config(&self, conf: &mut compute::ConnCfg);
+}
+
+pub struct TcpMechanism<'a> {
+    /// KV-dictionary with PostgreSQL connection params.
+    pub params: &'a StartupMessageParams,
+    pub proto: &'static str,
+}
+
+#[async_trait]
+impl ConnectMechanism for TcpMechanism<'_> {
+    type Connection = PostgresConnection;
+    type ConnectError = compute::ConnectionError;
+    type Error = compute::ConnectionError;
+
+    async fn connect_once(
+        &self,
+        node_info: &console::CachedNodeInfo,
+        timeout: time::Duration,
+    ) -> Result<PostgresConnection, compute::ConnectionError> {
+        connect_to_compute_once(node_info, timeout, self.proto).await
+    }
+
+    fn update_connect_config(&self, config: &mut compute::ConnCfg) {
+        config.set_startup_params(self.params);
+    }
+}
+
+fn report_error(e: &WakeComputeError, retry: bool) {
+    use crate::console::errors::ApiError;
+    let retry = bool_to_str(retry);
+    let kind = match e {
+        WakeComputeError::BadComputeAddress(_) => "bad_compute_address",
+        WakeComputeError::ApiError(ApiError::Transport(_)) => "api_transport_error",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ref text,
+        }) if text.contains("written data quota exceeded")
+            || text.contains("the limit for current plan reached") =>
+        {
+            "quota_exceeded"
+        }
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::LOCKED,
+            ..
+        }) => "api_console_locked",
+        WakeComputeError::ApiError(ApiError::Console {
+            status: StatusCode::BAD_REQUEST,
+            ..
+        }) => "api_console_bad_request",
+        WakeComputeError::ApiError(ApiError::Console { status, .. })
+            if status.is_server_error() =>
+        {
+            "api_console_other_server_error"
+        }
+        WakeComputeError::ApiError(ApiError::Console { .. }) => "api_console_other_error",
+        WakeComputeError::TimeoutError => "timeout_error",
+    };
+    NUM_WAKEUP_FAILURES.with_label_values(&[retry, kind]).inc();
+}
+
+/// Try to connect to the compute node, retrying if necessary.
+/// This function might update `node_info`, so we take it by `&mut`.
+#[tracing::instrument(skip_all)]
+pub async fn connect_to_compute<M: ConnectMechanism>(
+    mechanism: &M,
+    mut node_info: console::CachedNodeInfo,
+    extra: &console::ConsoleReqExtra,
+    creds: &auth::BackendType<'_, auth::backend::ComputeUserInfo>,
+    mut latency_timer: LatencyTimer,
+) -> Result<M::Connection, M::Error>
+where
+    M::ConnectError: ShouldRetry + std::fmt::Debug,
+    M::Error: From<WakeComputeError>,
+{
+    mechanism.update_connect_config(&mut node_info.config);
+
+    // try once
+    let (config, err) = match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+        Ok(res) => {
+            latency_timer.success();
+            return Ok(res);
+        }
+        Err(e) => {
+            error!(error = ?e, "could not connect to compute node");
+            (invalidate_cache(node_info), e)
+        }
+    };
+
+    latency_timer.cache_miss();
+
+    let mut num_retries = 1;
+
+    // if we failed to connect, it's likely that the compute node was suspended, wake a new compute node
+    info!("compute node's state has likely changed; requesting a wake-up");
+    let node_info = loop {
+        let wake_res = match creds {
+            auth::BackendType::Console(api, creds) => api.wake_compute(extra, creds).await,
+            #[cfg(feature = "testing")]
+            auth::BackendType::Postgres(api, creds) => api.wake_compute(extra, creds).await,
+            // nothing to do?
+            auth::BackendType::Link(_) => return Err(err.into()),
+            // test backend
+            #[cfg(test)]
+            auth::BackendType::Test(x) => x.wake_compute(),
+        };
+
+        match handle_try_wake(wake_res, num_retries) {
+            Err(e) => {
+                error!(error = ?e, num_retries, retriable = false, "couldn't wake compute node");
+                report_error(&e, false);
+                return Err(e.into());
+            }
+            // failed to wake up but we can continue to retry
+            Ok(ControlFlow::Continue(e)) => {
+                report_error(&e, true);
+                warn!(error = ?e, num_retries, retriable = true, "couldn't wake compute node");
+            }
+            // successfully woke up a compute node and can break the wakeup loop
+            Ok(ControlFlow::Break(mut node_info)) => {
+                node_info.config.reuse_password(&config);
+                mechanism.update_connect_config(&mut node_info.config);
+                break node_info;
+            }
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+
+        time::sleep(wait_duration).await;
+    };
+
+    // now that we have a new node, try connect to it repeatedly.
+    // this can error for a few reasons, for instance:
+    // * DNS connection settings haven't quite propagated yet
+    info!("wake_compute success. attempting to connect");
+    loop {
+        match mechanism.connect_once(&node_info, CONNECT_TIMEOUT).await {
+            Ok(res) => {
+                latency_timer.success();
+                return Ok(res);
+            }
+            Err(e) => {
+                let retriable = e.should_retry(num_retries);
+                if !retriable {
+                    error!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
+                    return Err(e.into());
+                }
+                warn!(error = ?e, num_retries, retriable, "couldn't connect to compute node");
+            }
+        }
+
+        let wait_duration = retry_after(num_retries);
+        num_retries += 1;
+
+        time::sleep(wait_duration).await;
+    }
+}
+
+/// Attempts to wake up the compute node.
+/// * Returns Ok(Continue(e)) if there was an error waking but retries are acceptable
+/// * Returns Ok(Break(node)) if the wakeup succeeded
+/// * Returns Err(e) if there was an error
+pub fn handle_try_wake(
+    result: Result<console::CachedNodeInfo, WakeComputeError>,
+    num_retries: u32,
+) -> Result<ControlFlow<console::CachedNodeInfo, WakeComputeError>, WakeComputeError> {
+    match result {
+        Err(err) => match &err {
+            WakeComputeError::ApiError(api) if api.should_retry(num_retries) => {
+                Ok(ControlFlow::Continue(err))
+            }
+            _ => Err(err),
+        },
+        // Ready to try again.
+        Ok(new) => Ok(ControlFlow::Break(new)),
+    }
+}
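The control flow of `handle_try_wake` above is easiest to see in isolation: a retriable wake-up failure maps to `Continue`, success maps to `Break`, and anything else aborts the loop. A self-contained toy version with stand-in types (not the real proxy types) is sketched below:

```rust
use std::ops::ControlFlow;

// Stand-in types, for illustration only.
#[derive(Debug)]
struct WakeError {
    retriable: bool,
}
struct Node;

// Mirrors the shape of handle_try_wake: retriable failures become Continue
// (keep looping), success becomes Break (carry the node out of the loop),
// and everything else aborts with Err.
fn handle_try_wake(
    result: Result<Node, WakeError>,
    num_retries: u32,
    max_retries: u32,
) -> Result<ControlFlow<Node, WakeError>, WakeError> {
    match result {
        Err(err) if err.retriable && num_retries < max_retries => {
            Ok(ControlFlow::Continue(err))
        }
        Err(err) => Err(err),
        Ok(node) => Ok(ControlFlow::Break(node)),
    }
}

fn main() {
    let mut num_retries = 0;
    let _node = loop {
        // Pretend the wake-up succeeds on the third attempt.
        let attempt = if num_retries >= 2 {
            Ok(Node)
        } else {
            Err(WakeError { retriable: true })
        };
        match handle_try_wake(attempt, num_retries, 16) {
            Err(e) => panic!("gave up: {e:?}"),
            Ok(ControlFlow::Continue(_)) => num_retries += 1,
            Ok(ControlFlow::Break(node)) => break node,
        }
    };
}
```
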
diff --git a/proxy/src/proxy/retry.rs b/proxy/src/proxy/retry.rs
new file mode 100644
index 0000000000..a85ed380b0
--- /dev/null
+++ b/proxy/src/proxy/retry.rs
@@ -0,0 +1,68 @@
+use crate::compute;
+use std::{error::Error, io};
+use tokio::time;
+
+/// Number of times we should retry the `/proxy_wake_compute` http request.
+/// Retry duration is BASE_RETRY_WAIT_DURATION * RETRY_WAIT_EXPONENT_BASE ^ n, where n starts at 0
+pub const NUM_RETRIES_CONNECT: u32 = 16;
+const BASE_RETRY_WAIT_DURATION: time::Duration = time::Duration::from_millis(25);
+const RETRY_WAIT_EXPONENT_BASE: f64 = std::f64::consts::SQRT_2;
+
+pub trait ShouldRetry {
+    fn could_retry(&self) -> bool;
+    fn should_retry(&self, num_retries: u32) -> bool {
+        match self {
+            _ if num_retries >= NUM_RETRIES_CONNECT => false,
+            err => err.could_retry(),
+        }
+    }
+}
+
+impl ShouldRetry for io::Error {
+    fn could_retry(&self) -> bool {
+        use std::io::ErrorKind;
+        matches!(
+            self.kind(),
+            ErrorKind::ConnectionRefused | ErrorKind::AddrNotAvailable | ErrorKind::TimedOut
+        )
+    }
+}
+
+impl ShouldRetry for tokio_postgres::error::DbError {
+    fn could_retry(&self) -> bool {
+        use tokio_postgres::error::SqlState;
+        matches!(
+            self.code(),
+            &SqlState::CONNECTION_FAILURE
+                | &SqlState::CONNECTION_EXCEPTION
+                | &SqlState::CONNECTION_DOES_NOT_EXIST
+                | &SqlState::SQLCLIENT_UNABLE_TO_ESTABLISH_SQLCONNECTION,
+        )
+    }
+}
+
+impl ShouldRetry for tokio_postgres::Error {
+    fn could_retry(&self) -> bool {
+        if let Some(io_err) = self.source().and_then(|x| x.downcast_ref()) {
+            io::Error::could_retry(io_err)
+        } else if let Some(db_err) = self.source().and_then(|x| x.downcast_ref()) {
+            tokio_postgres::error::DbError::could_retry(db_err)
+        } else {
+            false
+        }
+    }
+}
+
+impl ShouldRetry for compute::ConnectionError {
+    fn could_retry(&self) -> bool {
+        match self {
+            compute::ConnectionError::Postgres(err) => err.could_retry(),
+            compute::ConnectionError::CouldNotConnect(err) => err.could_retry(),
+            _ => false,
+        }
+    }
+}
+
+pub fn retry_after(num_retries: u32) -> time::Duration {
+    BASE_RETRY_WAIT_DURATION.mul_f64(RETRY_WAIT_EXPONENT_BASE.powi((num_retries as i32) - 1))
+}
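The backoff schedule defined in retry.rs is easy to sanity-check numerically: 25ms times sqrt(2)^(n-1) means the wait doubles every second retry, topping out around 4.5s on the 16th attempt and summing to roughly 15 seconds worst case. A standalone sketch of the same formula (constants copied from the file above):

```rust
use std::time::Duration;

// Same formula as retry_after above: 25ms * sqrt(2)^(n-1),
// i.e. the wait doubles every second retry.
fn retry_after(num_retries: u32) -> Duration {
    Duration::from_millis(25).mul_f64(std::f64::consts::SQRT_2.powi(num_retries as i32 - 1))
}

fn main() {
    for n in [1, 2, 3, 8, 16] {
        println!("retry {n:2}: wait {:?}", retry_after(n));
    }
    // Worst-case cumulative wait across all 16 allowed retries.
    let total: Duration = (1..=16).map(retry_after).sum();
    println!("total: {total:?}");
}
```
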
diff --git a/proxy/src/proxy/tests.rs b/proxy/src/proxy/tests.rs
index 4691abbfb9..3c483c59ee 100644
--- a/proxy/src/proxy/tests.rs
+++ b/proxy/src/proxy/tests.rs
@@ -2,10 +2,13 @@
 
 mod mitm;
 
+use super::connect_compute::ConnectMechanism;
+use super::retry::ShouldRetry;
 use super::*;
 use crate::auth::backend::{ComputeUserInfo, TestBackend};
 use crate::config::CertResolver;
 use crate::console::{CachedNodeInfo, NodeInfo};
+use crate::proxy::retry::{retry_after, NUM_RETRIES_CONNECT};
 use crate::{auth, http, sasl, scram};
 use async_trait::async_trait;
 use rstest::rstest;
@@ -423,7 +426,7 @@ impl ConnectMechanism for TestConnectMechanism {
     async fn connect_once(
         &self,
         _node_info: &console::CachedNodeInfo,
-        _timeout: time::Duration,
+        _timeout: std::time::Duration,
     ) -> Result<Self::Connection, Self::ConnectError> {
         let mut counter = self.counter.lock().unwrap();
         let action = self.sequence[*counter];
diff --git a/proxy/src/proxy/tests/mitm.rs b/proxy/src/proxy/tests/mitm.rs
index 50b3034936..a0a84a1dc0 100644
--- a/proxy/src/proxy/tests/mitm.rs
+++ b/proxy/src/proxy/tests/mitm.rs
@@ -120,7 +120,7 @@ where
 struct PgFrame;
 impl Decoder for PgFrame {
     type Item = Bytes;
-    type Error = io::Error;
+    type Error = std::io::Error;
 
     fn decode(&mut self, src: &mut BytesMut) -> Result<Option<Self::Item>, Self::Error> {
         if src.len() < 5 {
@@ -136,7 +136,7 @@ impl Decoder for PgFrame {
     }
 }
 
 impl Encoder<Bytes> for PgFrame {
-    type Error = io::Error;
+    type Error = std::io::Error;
 
     fn encode(&mut self, item: Bytes, dst: &mut BytesMut) -> Result<(), Self::Error> {
         dst.extend_from_slice(&item);
diff --git a/proxy/src/rate_limiter/limiter.rs b/proxy/src/rate_limiter/limiter.rs
index 8dfdfcd3db..a190b2cf8f 100644
--- a/proxy/src/rate_limiter/limiter.rs
+++ b/proxy/src/rate_limiter/limiter.rs
@@ -393,10 +393,10 @@ impl Limiter {
             }
             new_limit
         };
-        crate::proxy::RATE_LIMITER_LIMIT
+        crate::metrics::RATE_LIMITER_LIMIT
             .with_label_values(&["expected"])
             .set(new_limit as i64);
-        crate::proxy::RATE_LIMITER_LIMIT
+        crate::metrics::RATE_LIMITER_LIMIT
             .with_label_values(&["actual"])
             .set(actual_limit as i64);
         self.limits.store(new_limit, Ordering::Release);
@@ -470,7 +470,7 @@ impl reqwest_middleware::Middleware for Limiter {
             )
         })?;
         info!(duration = ?start.elapsed(), "waiting for token to connect to the control plane");
-        crate::proxy::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
+        crate::metrics::RATE_LIMITER_ACQUIRE_LATENCY.observe(start.elapsed().as_secs_f64());
         match next.run(req, extensions).await {
             Ok(response) => {
                 self.release(token, Some(Outcome::from_reqwest_response(&response)))
diff --git a/proxy/src/serverless.rs b/proxy/src/serverless.rs
index 870e9c1103..e358a0712f 100644
--- a/proxy/src/serverless.rs
+++ b/proxy/src/serverless.rs
@@ -13,8 +13,8 @@ pub use reqwest_middleware::{ClientWithMiddleware, Error};
 pub use reqwest_retry::{policies::ExponentialBackoff, RetryTransientMiddleware};
 use tokio_util::task::TaskTracker;
 
+use crate::metrics::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::protocol2::{ProxyProtocolAccept, WithClientIp};
-use crate::proxy::NUM_CLIENT_CONNECTION_GAUGE;
 use crate::rate_limiter::EndpointRateLimiter;
 use crate::{cancellation::CancelMap, config::ProxyConfig};
 use futures::StreamExt;
diff --git a/proxy/src/serverless/conn_pool.rs b/proxy/src/serverless/conn_pool.rs
index 69198d79d3..ab8903418b 100644
--- a/proxy/src/serverless/conn_pool.rs
+++ b/proxy/src/serverless/conn_pool.rs
@@ -24,13 +24,12 @@ use tokio_postgres::{AsyncMessage, ReadyForQueryStatus};
 use crate::{
     auth::{self, backend::ComputeUserInfo, check_peer_addr_is_in_list},
     console,
-    proxy::{neon_options, LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
+    metrics::{LatencyTimer, NUM_DB_CONNECTIONS_GAUGE},
+    proxy::{connect_compute::ConnectMechanism, neon_options},
     usage_metrics::{Ids, MetricCounter, USAGE_METRICS},
 };
 use crate::{compute, config};
 
-use crate::proxy::ConnectMechanism;
-
 use tracing::{error, warn, Span};
 use tracing::{info, info_span, Instrument};
 
@@ -444,7 +443,7 @@ async fn connect_to_compute(
         .await?
         .context("missing cache entry from wake_compute")?;
 
-    crate::proxy::connect_to_compute(
+    crate::proxy::connect_compute::connect_to_compute(
         &TokioMechanism {
             conn_id,
             conn_info,
diff --git a/proxy/src/serverless/sql_over_http.rs b/proxy/src/serverless/sql_over_http.rs
index 795ba819c1..307b085ce0 100644
--- a/proxy/src/serverless/sql_over_http.rs
+++ b/proxy/src/serverless/sql_over_http.rs
@@ -29,7 +29,7 @@ use utils::http::error::ApiError;
 use utils::http::json::json_response;
 
 use crate::config::HttpConfig;
-use crate::proxy::NUM_CONNECTION_REQUESTS_GAUGE;
+use crate::metrics::NUM_CONNECTION_REQUESTS_GAUGE;
 
 use super::conn_pool::ConnInfo;
 use super::conn_pool::GlobalConnPool;
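One design observation on the refactor above: since the moved items kept `pub` visibility in their new homes, most of the churn in this patch is call sites switching from `crate::proxy::*` to `crate::metrics::*` or `crate::proxy::retry::*`. An alternative would have been to re-export the moved items from `proxy.rs` so old paths keep compiling. A hedged sketch of that alternative — explicitly not what the patch does:

```rust
// proxy/src/proxy.rs — hypothetical alternative, not what the patch does:
// re-export the moved items so existing `crate::proxy::...` paths still work.
pub mod connect_compute;
pub mod retry;

pub use connect_compute::{connect_to_compute, handle_try_wake, invalidate_cache};
pub use retry::{retry_after, ShouldRetry, NUM_RETRIES_CONNECT};
pub use crate::metrics::LatencyTimer;
```

The patch instead updates every call site, which keeps each item's canonical path unambiguous at the cost of a larger diff.
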
From e67b8f69c0e30389cbd037521d5f6024180c6752 Mon Sep 17 00:00:00 2001
From: Christian Schwarz
Date: Mon, 18 Dec 2023 13:39:48 +0100
Subject: [PATCH 4/4] [PRE-MERGE] pageserver: Reduce tracing overhead in
 timeline::get #6115

Pre-merge `git merge --squash` of https://github.com/neondatabase/neon/pull/6115

Lowering the tracing level in get_value_reconstruct_data and
get_or_maybe_download from info to debug reduces the overhead of span
creation in non-debug environments.
---
 pageserver/src/tenant/storage_layer/layer.rs | 160 ++++++++++---------
 test_runner/regress/test_broken_timeline.py |   4 +-
 2 files changed, 85 insertions(+), 79 deletions(-)

diff --git a/pageserver/src/tenant/storage_layer/layer.rs b/pageserver/src/tenant/storage_layer/layer.rs
index a4b102c314..9a8ddc1a6b 100644
--- a/pageserver/src/tenant/storage_layer/layer.rs
+++ b/pageserver/src/tenant/storage_layer/layer.rs
@@ -259,8 +259,9 @@ impl Layer {
 
         layer
             .get_value_reconstruct_data(key, lsn_range, reconstruct_data, &self.0, ctx)
-            .instrument(tracing::info_span!("get_value_reconstruct_data", layer=%self))
+            .instrument(tracing::debug_span!("get_value_reconstruct_data", layer=%self))
             .await
+            .with_context(|| format!("get_value_reconstruct_data for layer {self}"))
     }
 
     /// Download the layer if evicted.
@@ -654,7 +655,6 @@ impl LayerInner {
     }
 
     /// Cancellation safe.
-    #[tracing::instrument(skip_all, fields(layer=%self))]
     async fn get_or_maybe_download(
         self: &Arc<Self>,
         allow_download: bool,
@@ -663,95 +663,101 @@ impl LayerInner {
         let mut init_permit = None;
 
         loop {
-            let download = move |permit| async move {
-                // disable any scheduled but not yet running eviction deletions for this
-                let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
-
-                // count cancellations, which currently remain largely unexpected
-                let init_cancelled =
-                    scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
-
-                // no need to make the evict_and_wait wait for the actual download to complete
-                drop(self.status.send(Status::Downloaded));
-
-                let timeline = self
-                    .timeline
-                    .upgrade()
-                    .ok_or_else(|| DownloadError::TimelineShutdown)?;
-
-                // FIXME: grab a gate
-
-                let can_ever_evict = timeline.remote_client.as_ref().is_some();
-
-                // check if we really need to be downloaded; could have been already downloaded by a
-                // cancelled previous attempt.
-                let needs_download = self
-                    .needs_download()
-                    .await
-                    .map_err(DownloadError::PreStatFailed)?;
-
-                let permit = if let Some(reason) = needs_download {
-                    if let NeedsDownload::NotFile(ft) = reason {
-                        return Err(DownloadError::NotFile(ft));
-                    }
-
-                    // only reset this after we've decided we really need to download. otherwise it'd
-                    // be impossible to mark cancelled downloads for eviction, like one could imagine
-                    // we would like to do for prefetching which was not needed.
-                    self.wanted_evicted.store(false, Ordering::Release);
-
-                    if !can_ever_evict {
-                        return Err(DownloadError::NoRemoteStorage);
-                    }
-
-                    if let Some(ctx) = ctx {
-                        self.check_expected_download(ctx)?;
-                    }
-
-                    if !allow_download {
-                        // this does look weird, but for LayerInner the "downloading" means also changing
-                        // internal once related state ...
-                        return Err(DownloadError::DownloadRequired);
-                    }
-
-                    tracing::info!(%reason, "downloading on-demand");
-
-                    self.spawn_download_and_wait(timeline, permit).await?
-                } else {
-                    // the file is present locally, probably by a previous but cancelled call to
-                    // get_or_maybe_download. alternatively we might be running without remote storage.
-                    LAYER_IMPL_METRICS.inc_init_needed_no_download();
-
-                    permit
-                };
-
-                let since_last_eviction =
-                    self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
-                if let Some(since_last_eviction) = since_last_eviction {
-                    // FIXME: this will not always be recorded correctly until #6028 (the no
-                    // download needed branch above)
-                    LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
-                }
-
-                let res = Arc::new(DownloadedLayer {
-                    owner: Arc::downgrade(self),
-                    kind: tokio::sync::OnceCell::default(),
-                    version: next_version,
-                });
-
-                self.access_stats.record_residence_event(
-                    LayerResidenceStatus::Resident,
-                    LayerResidenceEventReason::ResidenceChange,
-                );
-
-                let waiters = self.inner.initializer_count();
-                if waiters > 0 {
-                    tracing::info!(waiters, "completing the on-demand download for other tasks");
-                }
-
-                scopeguard::ScopeGuard::into_inner(init_cancelled);
-
-                Ok((ResidentOrWantedEvicted::Resident(res), permit))
-            };
+            let download = move |permit| {
+                async move {
+                    // disable any scheduled but not yet running eviction deletions for this
+                    let next_version = 1 + self.version.fetch_add(1, Ordering::Relaxed);
+
+                    // count cancellations, which currently remain largely unexpected
+                    let init_cancelled =
+                        scopeguard::guard((), |_| LAYER_IMPL_METRICS.inc_init_cancelled());
+
+                    // no need to make the evict_and_wait wait for the actual download to complete
+                    drop(self.status.send(Status::Downloaded));
+
+                    let timeline = self
+                        .timeline
+                        .upgrade()
+                        .ok_or_else(|| DownloadError::TimelineShutdown)?;
+
+                    // FIXME: grab a gate
+
+                    let can_ever_evict = timeline.remote_client.as_ref().is_some();
+
+                    // check if we really need to be downloaded; could have been already downloaded by a
+                    // cancelled previous attempt.
+                    let needs_download = self
+                        .needs_download()
+                        .await
+                        .map_err(DownloadError::PreStatFailed)?;
+
+                    let permit = if let Some(reason) = needs_download {
+                        if let NeedsDownload::NotFile(ft) = reason {
+                            return Err(DownloadError::NotFile(ft));
+                        }
+
+                        // only reset this after we've decided we really need to download. otherwise it'd
+                        // be impossible to mark cancelled downloads for eviction, like one could imagine
+                        // we would like to do for prefetching which was not needed.
+                        self.wanted_evicted.store(false, Ordering::Release);
+
+                        if !can_ever_evict {
+                            return Err(DownloadError::NoRemoteStorage);
+                        }
+
+                        if let Some(ctx) = ctx {
+                            self.check_expected_download(ctx)?;
+                        }
+
+                        if !allow_download {
+                            // this does look weird, but for LayerInner the "downloading" means also changing
+                            // internal once related state ...
+                            return Err(DownloadError::DownloadRequired);
+                        }
+
+                        tracing::info!(%reason, "downloading on-demand");
+
+                        self.spawn_download_and_wait(timeline, permit).await?
+                    } else {
+                        // the file is present locally, probably by a previous but cancelled call to
+                        // get_or_maybe_download. alternatively we might be running without remote storage.
+                        LAYER_IMPL_METRICS.inc_init_needed_no_download();
+
+                        permit
+                    };
+
+                    let since_last_eviction =
+                        self.last_evicted_at.lock().unwrap().map(|ts| ts.elapsed());
+                    if let Some(since_last_eviction) = since_last_eviction {
+                        // FIXME: this will not always be recorded correctly until #6028 (the no
+                        // download needed branch above)
+                        LAYER_IMPL_METRICS.record_redownloaded_after(since_last_eviction);
+                    }
+
+                    let res = Arc::new(DownloadedLayer {
+                        owner: Arc::downgrade(self),
+                        kind: tokio::sync::OnceCell::default(),
+                        version: next_version,
+                    });
+
+                    self.access_stats.record_residence_event(
+                        LayerResidenceStatus::Resident,
+                        LayerResidenceEventReason::ResidenceChange,
+                    );
+
+                    let waiters = self.inner.initializer_count();
+                    if waiters > 0 {
+                        tracing::info!(
+                            waiters,
+                            "completing the on-demand download for other tasks"
+                        );
+                    }
+
+                    scopeguard::ScopeGuard::into_inner(init_cancelled);
+
+                    Ok((ResidentOrWantedEvicted::Resident(res), permit))
+                }
+                .instrument(tracing::info_span!("get_or_maybe_download", layer=%self))
+            };
 
             if let Some(init_permit) = init_permit.take() {
diff --git a/test_runner/regress/test_broken_timeline.py b/test_runner/regress/test_broken_timeline.py
index 53eeb8bbe9..4da0ba7b20 100644
--- a/test_runner/regress/test_broken_timeline.py
+++ b/test_runner/regress/test_broken_timeline.py
@@ -20,7 +20,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
 
     env.pageserver.allowed_errors.extend(
         [
-            ".*layer loading failed:.*",
+            ".*get_value_reconstruct_data for layer .*",
            ".*could not find data for key.*",
            ".*is not active. Current state: Broken.*",
            ".*will not become active. Current state: Broken.*",
@@ -83,7 +83,7 @@ def test_local_corruption(neon_env_builder: NeonEnvBuilder):
     # (We don't check layer file contents on startup, when loading the timeline)
     #
     # This will change when we implement checksums for layers
-    with pytest.raises(Exception, match="layer loading failed:") as err:
+    with pytest.raises(Exception, match="get_value_reconstruct_data for layer ") as err:
         pg2.start()
     log.info(
         f"As expected, compute startup failed for timeline {tenant2}/{timeline2} with corrupt layers: {err}"
    )
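
The win in this last patch comes from tracing's level filtering: spans below the subscriber's maximum level are disabled at the callsite and never constructed, so demoting a hot-path span from info to debug makes it essentially free in production. A minimal self-contained sketch of that idea, assuming the `tracing`, `tracing-subscriber`, and `tokio` crates (the function name mirrors the patch but the setup is illustrative):

```rust
use tracing::{debug_span, Instrument};

async fn get_value_reconstruct_data() {
    // hot-path work elided
}

#[tokio::main]
async fn main() {
    // A production-style subscriber that only enables INFO and above.
    // The debug_span! below is then disabled at the callsite, so the
    // per-call span construction cost on the hot path is avoided.
    tracing_subscriber::fmt()
        .with_max_level(tracing::Level::INFO)
        .init();

    get_value_reconstruct_data()
        .instrument(debug_span!("get_value_reconstruct_data"))
        .await;
}
```
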